diff --git a/docs/source/_static/css/code-snippets.css b/docs/source/_static/css/code-snippets.css
index 43acc6751c..ccb0702008 100644
--- a/docs/source/_static/css/code-snippets.css
+++ b/docs/source/_static/css/code-snippets.css
@@ -9,4 +9,8 @@
 
 .highlight .kn, .highlight .nv, .highlight .s2, .highlight .ow {
     color: #6670FF;
+}
+
+.highlight .gp {
+    color: #FB8D68;
 }
\ No newline at end of file
diff --git a/docs/source/_static/css/huggingface.css b/docs/source/_static/css/huggingface.css
index eeced355df..c6ad7b7d5c 100644
--- a/docs/source/_static/css/huggingface.css
+++ b/docs/source/_static/css/huggingface.css
@@ -44,6 +44,7 @@
     display: flex;
     flex-direction: row;
     justify-content: flex-end;
+    margin-right: 30px;
 }
 
 .framework-selector > button {
@@ -60,6 +61,12 @@
     padding: 5px;
 }
 
+/* Copy button */
+
+a.copybtn {
+    margin: 3px;
+}
+
 /* The literal code blocks */
 .rst-content tt.literal, .rst-content tt.literal, .rst-content code.literal {
     color: #6670FF;
diff --git a/docs/source/_static/js/custom.js b/docs/source/_static/js/custom.js
index 7aceddbe21..908497877c 100644
--- a/docs/source/_static/js/custom.js
+++ b/docs/source/_static/js/custom.js
@@ -157,6 +157,8 @@ function platformToggle() {
     const codeBlocks = Array.from(document.getElementsByClassName("highlight"));
     const pytorchIdentifier = "## PYTORCH CODE";
     const tensorflowIdentifier = "## TENSORFLOW CODE";
+
+    const promptSpanIdentifier = `<span class="gp">&gt;&gt;&gt; </span>`
     const pytorchSpanIdentifier = `<span class="c1">${pytorchIdentifier}</span>`;
     const tensorflowSpanIdentifier = `<span class="c1">${tensorflowIdentifier}</span>`;
 
@@ -169,10 +171,22 @@ function platformToggle() {
         let tensorflowSpans;
 
         if(pytorchSpanPosition < tensorflowSpanPosition){
-            pytorchSpans = spans.slice(pytorchSpanPosition + pytorchSpanIdentifier.length + 1, tensorflowSpanPosition);
+            const isPrompt = spans.slice(
+                spans.indexOf(tensorflowSpanIdentifier) - promptSpanIdentifier.length,
+                spans.indexOf(tensorflowSpanIdentifier)
+            ) == promptSpanIdentifier;
+            const finalTensorflowSpanPosition = isPrompt ? tensorflowSpanPosition - promptSpanIdentifier.length : tensorflowSpanPosition;
+
+            pytorchSpans = spans.slice(pytorchSpanPosition + pytorchSpanIdentifier.length + 1, finalTensorflowSpanPosition);
             tensorflowSpans = spans.slice(tensorflowSpanPosition + tensorflowSpanIdentifier.length + 1, spans.length);
         }else{
-            tensorflowSpans = spans.slice(tensorflowSpanPosition + tensorflowSpanIdentifier.length + 1, pytorchSpanPosition);
+            const isPrompt = spans.slice(
+                spans.indexOf(pytorchSpanIdentifier) - promptSpanIdentifier.length,
+                spans.indexOf(pytorchSpanIdentifier)
+            ) == promptSpanIdentifier;
+            const finalPytorchSpanPosition = isPrompt ? pytorchSpanPosition - promptSpanIdentifier.length : pytorchSpanPosition;
+
+            tensorflowSpans = spans.slice(tensorflowSpanPosition + tensorflowSpanIdentifier.length + 1, finalPytorchSpanPosition);
             pytorchSpans = spans.slice(pytorchSpanPosition + pytorchSpanIdentifier.length + 1, spans.length);
         }
 
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 0a73c2f49d..e39c2ad8cc 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -44,7 +44,8 @@ extensions = [
     'sphinx.ext.napoleon',
     'recommonmark',
     'sphinx.ext.viewcode',
-    'sphinx_markdown_tables'
+    'sphinx_markdown_tables',
+    'sphinx_copybutton'
 ]
 
 # Add any paths that contain templates here, relative to this directory.
@@ -74,6 +75,8 @@ exclude_patterns = [u'_build', 'Thumbs.db', '.DS_Store']
 # The name of the Pygments (syntax highlighting) style to use.
 pygments_style = None
 
+# Remove the prompt when copying examples
+copybutton_prompt_text = ">>> "
 
 # -- Options for HTML output -------------------------------------------------
 
diff --git a/docs/source/glossary.rst b/docs/source/glossary.rst
index 63e5eadefc..79d554b43d 100644
--- a/docs/source/glossary.rst
+++ b/docs/source/glossary.rst
@@ -45,17 +45,16 @@ tokenizer, which is a `WordPiece <https://arxiv.org/pdf/1609.08144.pdf>`__ token
 
 ::
 
-    from transformers import BertTokenizer
-    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
+    >>> from transformers import BertTokenizer
+    >>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
 
-    sequence = "A Titan RTX has 24GB of VRAM"
+    >>> sequence = "A Titan RTX has 24GB of VRAM"
 
 The tokenizer takes care of splitting the sequence into tokens available in the tokenizer vocabulary.
 
 ::
 
-    tokenized_sequence = tokenizer.tokenize(sequence)
-    print(tokenized_sequence)
+    >>> tokenized_sequence = tokenizer.tokenize(sequence)
 
 The tokens are either words or subwords. Here for instance, "VRAM" wasn't in the model vocabulary, so it's been split
 in "V", "RA" and "M". To indicate those tokens are not separate words but parts of the same word, a double-dash is
@@ -63,6 +62,7 @@ added for "RA" and "M":
 
 ::
 
+    >>> print(tokenized_sequence)
     ['A', 'Titan', 'R', '##T', '##X', 'has', '24', '##GB', 'of', 'V', '##RA', '##M']
 
 These tokens can then be converted into IDs which are understandable by the model. This can be done by directly feeding
@@ -71,14 +71,14 @@ the sentence to the tokenizer, which leverages the Rust implementation of
 
 ::
 
-    encoded_sequence = tokenizer(sequence)["input_ids"]
-    print(encoded_sequence)
+    >>> encoded_sequence = tokenizer(sequence)["input_ids"]
 
 The tokenizer returns a dictionary with all the arguments necessary for its corresponding model to work properly. The
 token indices are under the key "input_ids":
 
 ::
 
+    >>> print(encoded_sequence)
     [101, 138, 18696, 155, 1942, 3190, 1144, 1572, 13745, 1104, 159, 9664, 2107, 102]
 
 Note that the tokenizer automatically adds "special tokens" (if the associated model rely on them) which are special
@@ -86,13 +86,14 @@ IDs the model sometimes uses. If we decode the previous sequence of ids,
 
 ::
 
-    tokenizer.decode(encoded_sequence)
+    >>> decoded_sequence = tokenizer.decode(encoded_sequence)
 
 we will see 
 
 ::
 
-    '[CLS] A Titan RTX has 24GB of VRAM [SEP]'
+    >>> print(decoded_sequence)
+    [CLS] A Titan RTX has 24GB of VRAM [SEP]
 
 because this is the way a :class:`~transformers.BertModel` is going to expect its inputs.
 
@@ -108,21 +109,20 @@ For example, consider these two sequences:
 
 ::
 
-    from transformers import BertTokenizer
-    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
+    >>> from transformers import BertTokenizer
+    >>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
 
-    sequence_a = "This is a short sequence."
-    sequence_b = "This is a rather long sequence. It is at least longer than the sequence A."
+    >>> sequence_a = "This is a short sequence."
+    >>> sequence_b = "This is a rather long sequence. It is at least longer than the sequence A."
 
-    encoded_sequence_a = tokenizer(sequence_a)["input_ids"]
-    encoded_sequence_b = tokenizer(sequence_b)["input_ids"]
-    
-    len(encoded_sequence_a), len(encoded_sequence_b)
+    >>> encoded_sequence_a = tokenizer(sequence_a)["input_ids"]
+    >>> encoded_sequence_b = tokenizer(sequence_b)["input_ids"]
 
 The encoded versions have different lengths:
 
 ::
 
+    >>> len(encoded_sequence_a), len(encoded_sequence_b)
     (8, 19)
 
 Therefore, we can't be put then together in a same tensor as-is. The first sequence needs to be padded up to the length
@@ -133,15 +133,14 @@ it to pad like this:
 
 ::
 
-    padded_sequences = tokenizer([sequence_a, sequence_b], padding=True)
-    padded_sequences["input_ids"]
+    >>> padded_sequences = tokenizer([sequence_a, sequence_b], padding=True)
 
 We can see that 0s have been added on the right of the first sentence to make it the same length as the second one:
 
 ::
 
-    [[101, 1188, 1110, 170, 1603, 4954, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
-     [101, 1188, 1110, 170, 1897, 1263, 4954, 119, 1135, 1110, 1120, 1655, 2039, 1190, 1103, 4954, 138, 119, 102]]
+    >>> padded_sequences["input_ids"]
+    [[101, 1188, 1110, 170, 1603, 4954, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 1188, 1110, 170, 1897, 1263, 4954, 119, 1135, 1110, 1120, 1655, 2039, 1190, 1103, 4954, 138, 119, 102]]
 
 This can then be converted into a tensor in PyTorch or TensorFlow. The attention mask is a binary tensor indicating
 the position of the padded indices so that the model does not attend to them. For the
@@ -150,14 +149,8 @@ a padded value. This attention mask is in the dictionary returned by the tokeniz
 
 ::
 
-    padded_sequences["attention_mask"]
-
-will give back
-
-::
-
-    [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
-     [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
+    >>> padded_sequences["attention_mask"]
+    [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
 
 .. _token-type-ids:
 
@@ -170,26 +163,27 @@ tokens. For example, the BERT model builds its two sequence input as such:
 
 ::
 
-   # [CLS] SEQUENCE_A [SEP] SEQUENCE_B [SEP]
+   >>> # [CLS] SEQUENCE_A [SEP] SEQUENCE_B [SEP]
 
 We can use our tokenizer to automatically generate such a sentence by passing the two sequences as two arguments (and
 not a list like before) like this:
 
 ::
 
-    from transformers import BertTokenizer
-    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
-    sequence_a = "HuggingFace is based in NYC"
-    sequence_b = "Where is HuggingFace based?"
+    >>> from transformers import BertTokenizer
+    >>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
+    >>> sequence_a = "HuggingFace is based in NYC"
+    >>> sequence_b = "Where is HuggingFace based?"
 
-    encoded_dict = tokenizer(sequence_a, sequence_b)
-    tokenizer.decode(encoded_dict["input_ids"])
+    >>> encoded_dict = tokenizer(sequence_a, sequence_b)
+    >>> decoded = tokenizer.decode(encoded_dict["input_ids"])
 
 which will return:
 
 ::
 
-    "[CLS] HuggingFace is based in NYC [SEP] Where is HuggingFace based? [SEP]"
+    >>> print(decoded)
+    [CLS] HuggingFace is based in NYC [SEP] Where is HuggingFace based? [SEP]
 
 This is enough for some models to understand where one sequence ends and where another begins. However, other models
 such as BERT have an additional mechanism, which are the token type IDs (also called segment IDs). They are a binary
@@ -199,12 +193,7 @@ The tokenizer returns in the dictionary under the key "token_type_ids":
 
 ::
 
-    encoded_dict['token_type_ids']
-
-will return
-
-::
-
+    >>> encoded_dict['token_type_ids']
     [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]
 
 The first sequence, the "context" used for the question, has all its tokens represented by :obj:`0`, whereas the
diff --git a/docs/source/multilingual.rst b/docs/source/multilingual.rst
index 455df2dcb4..c35b01da3b 100644
--- a/docs/source/multilingual.rst
+++ b/docs/source/multilingual.rst
@@ -36,10 +36,11 @@ Here is an example using the ``xlm-clm-enfr-1024`` checkpoint (Causal language m
 
 .. code-block::
 
-    import torch
-    from transformers import XLMTokenizer, XLMWithLMHeadModel
+    >>> import torch
+    >>> from transformers import XLMTokenizer, XLMWithLMHeadModel
 
-    tokenizer = XLMTokenizer.from_pretrained("xlm-clm-1024-enfr")
+    >>> tokenizer = XLMTokenizer.from_pretrained("xlm-clm-enfr-1024")
+    >>> model = XLMWithLMHeadModel.from_pretrained("xlm-clm-enfr-1024")
 
 
 The different languages this model/tokenizer handles, as well as the ids of these languages are visible using the
@@ -47,16 +48,15 @@ The different languages this model/tokenizer handles, as well as the ids of thes
 
 .. code-block::
 
-    # Continuation of the previous script
-    print(tokenizer.lang2id)  # {'en': 0, 'fr': 1}
+    >>> print(tokenizer.lang2id)
+    {'en': 0, 'fr': 1}
 
 
 These ids should be used when passing a language parameter during a model pass. Let's define our inputs:
 
 .. code-block::
 
-    # Continuation of the previous script
-    input_ids = torch.tensor([tokenizer.encode("Wikipedia was used to")]) # batch size of 1
+    >>> input_ids = torch.tensor([tokenizer.encode("Wikipedia was used to")]) # batch size of 1
 
 
 We should now define the language embedding by using the previously defined language id. We want to create a tensor
@@ -64,20 +64,18 @@ filled with the appropriate language ids, of the same size as input_ids. For eng
 
 .. code-block::
 
-    # Continuation of the previous script
-    language_id = tokenizer.lang2id['en']  # 0
-    langs = torch.tensor([language_id] * input_ids.shape[1])  # torch.tensor([0, 0, 0, ..., 0])
+    >>> language_id = tokenizer.lang2id['en']  # 0
+    >>> langs = torch.tensor([language_id] * input_ids.shape[1])  # torch.tensor([0, 0, 0, ..., 0])
 
-    # We reshape it to be of size (batch_size, sequence_length)
-    langs = langs.view(1, -1) # is now of shape [1, sequence_length] (we have a batch size of 1)
+    >>> # We reshape it to be of size (batch_size, sequence_length)
+    >>> langs = langs.view(1, -1) # is now of shape [1, sequence_length] (we have a batch size of 1)
 
 
 You can then feed it all as input to your model:
 
 .. code-block::
 
-    # Continuation of the previous script
-    outputs = model(input_ids, langs=langs)
+    >>> outputs = model(input_ids, langs=langs)
 
 
 The example `run_generation.py <https://github.com/huggingface/transformers/blob/master/examples/text-generation/run_generation.py>`__
diff --git a/docs/source/quicktour.rst b/docs/source/quicktour.rst
index 85c67c66cb..523ede72bf 100644
--- a/docs/source/quicktour.rst
+++ b/docs/source/quicktour.rst
@@ -1,378 +1,393 @@
-Quick tour
-==========
-
-Let's have a quick look at the 🤗 Transformers library features. The library downloads pretrained models for
-Natural Language Understanding (NLU) tasks, such as analyzing the sentiment of a text, and Natural Language Generation (NLG),
-such as completing a prompt with new text or translating in another language.
-
-First we will see how to easily leverage the pipeline API to quickly use those pretrained models at inference. Then, we
-will dig a little bit more and see how the library gives you access to those models and helps you preprocess your data.
-
-.. note::
-
-    All code examples presented in the documentation have a switch on the top left for Pytorch versus TensorFlow. If
-    not, the code is expected to work for both backends without any change needed.
-
-Getting started on a task with a pipeline
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The easiest way to use a pretrained model on a given task is to use :func:`~transformers.pipeline`. 🤗 Transformers
-provides the following tasks out of the box:
-
-- Sentiment analysis: is a text positive or negative?
-- Text generation (in English): provide a prompt and the model will generate what follows.
-- Name entity recognition (NER): in an input sentence, label each word with the entity it represents (person, place,
-  etc.)
-- Question answering: provide the model with some context and a question, extract the answer from the context.
-- Filling masked text: given a text with masked words (e.g., replaced by ``[MASK]``), fill the blanks.
-- Summarization: generate a summary of a long text.
-- Translation: translate a text in another language.
-- Feature extraction: return a tensor representation of the text.
-
-Let's see how this work for sentiment analysis (the other tasks are all covered in the
-:doc:`task summary </task_summary>`):
-
-::
-
-    from transformers import pipeline
-    classifier = pipeline('sentiment-analysis')
-
-When typing this command for the first time, a pretrained model and its tokenizer are downloaded and cached. We will
-look at both later on, but as an introduction the tokenizer's job is to preprocess the text for the model, which is
-then responsible for making predictions. The pipeline groups all of that together, and post-process the predictions to
-make them readable. For instance
-
-::
-
-    classifier('We are very happy to show you the 🤗 Transformers library.')
-
-will return something like this:
-
-::
-
-    [{'label': 'POSITIVE', 'score': 0.9997795224189758}]
-
-That's encouraging! You can use it on a list of sentences, which will be preprocessed then fed to the model as a
-`batch`:
-
-::
-
-    classifier(["We are very happy to show you the 🤗 Transformers library.",
-                "We hope you don't hate it."])
-
-returning a list of dictionaries like this one:
-
-::
-
-    [{'label': 'POSITIVE', 'score': 0.9997795224189758},
-     {'label': 'NEGATIVE', 'score': 0.5308589935302734}]
-
-You can see the second sentence has been classified as negative (it needs to be positive or negative) but its score is
-fairly neutral.
-
-By default, the model downloaded for this pipeline is called "distilbert-base-uncased-finetuned-sst-2-english". We can
-look at its `model page <https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english>`__ to get more
-information about it. It uses the :doc:`DistilBERT architecture </model_doc/distilbert>` and has been fine-tuned on a
-dataset called SST-2 for the sentiment analysis task.
-
-Let's say we want to use another model; for instance, one that has been trained on French data. We can search through
-the `model hub <https://huggingface.co/models>`__ that gathers models pretrained on a lot of data by research labs, but
-also community models (usually fine-tuned versions of those big models on a specific dataset). Applying the tags
-"French" and "text-classification" gives back a suggestion "nlptown/bert-base-multilingual-uncased-sentiment". Let's
-see how we can use it. 
-
-You can directly pass the name of the model to use to :func:`~transformers.pipeline`:
-
-::
-
-    classifier = pipeline('sentiment-analysis', model="nlptown/bert-base-multilingual-uncased-sentiment")
-
-This classifier can now deal with texts in English, French, but also Dutch, German, Italian and Spanish! You can also
-replace that name by a local folder where you have saved a pretrained model (see below). You can also pass a model
-object and its associated tokenizer.
-
-We will need two classes for this. The first is :class:`~transformers.AutoTokenizer`, which we will use to download the
-tokenizer associated to the model we picked and instantiate it. The second is
-:class:`~transformers.AutoModelForSequenceClassification` (or
-:class:`~transformers.TFAutoModelForSequenceClassification` if you are using TensorFlow), which we will use to download
-the model itself. Note that if we were using the library on an other task, the class of the model would change. The
-:doc:`task summary </task_summary>` tutorial summarizes which class is used for which task.
-
-::
-
-    ## PYTORCH CODE
-    from transformers import AutoTokenizer, AutoModelForSequenceClassification
-    ## TENSORFLOW CODE
-    from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
-
-Now, to download the models and tokenizer we found previously, we just have to use the 
-:func:`~transformers.AutoModelForSequenceClassification.from_pretrained` method (feel free to replace ``model_name`` by
-any other model from the model hub):
-
-::
-
-    ## PYTORCH CODE
-    model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
-    model = AutoModelForSequenceClassification.from_pretrained(model_name)
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    pipe = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)
-    ## TENSORFLOW CODE
-    model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
-    model = TFAutoModelForSequenceClassification.from_pretrained(model_name)
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)
-
-If you don't find a model that has been pretrained on some data similar to yours, you will need to fine-tune a
-pretrained model on your data. We provide :doc:`example scripts </examples>` to do so. Once you're done, don't forget
-to share your fine-tuned model on the hub with the community, using :doc:`this tutorial </model_sharing>`.
-
-.. _pretrained-model:
-
-Under the hood: pretrained models
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Let's now see what happens beneath the hood when using those pipelines. As we saw, the model and tokenizer are created
-using the :obj:`from_pretrained` method:
-
-::
-
-    ## PYTORCH CODE
-    from transformers import AutoTokenizer, AutoModelForSequenceClassification
-    model_name = "distilbert-base-uncased-finetuned-sst-2-english"
-    model = AutoModelForSequenceClassification.from_pretrained(model_name)
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    ## TENSORFLOW CODE
-    from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
-    model_name = "distilbert-base-uncased-finetuned-sst-2-english"
-    model = TFAutoModelForSequenceClassification.from_pretrained(model_name)
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-
-Using the tokenizer
-^^^^^^^^^^^^^^^^^^^
-
-We mentioned the tokenizer is responsible for the preprocessing of your texts. First, it will split a given text in
-words (or part of words, punctuation symbols, etc.) usually called `tokens`. There are multiple rules that can govern
-that process, which is why we need to instantiate the tokenizer using the name of the model, to make sure we use the
-same rules as when the model was pretrained.
-
-The second step is to convert those `tokens` into numbers, to be able to build a tensor out of them and feed them to
-the model. To do this, the tokenizer has a `vocab`, which is the part we download when we instantiate it with the
-:obj:`from_pretrained` method, since we need to use the same `vocab` as when the model was pretrained.
-
-To apply these steps on a given text, we can just feed it to our tokenizer:
-
-::
-
-    input = tokenizer("We are very happy to show you the 🤗 Transformers library.")
-    print(input)
-
-This returns a dictionary string to list of ints. It contains the `ids of the tokens <glossary.html#input-ids>`__,
-as mentioned before, but also additional arguments that will be useful to the model. Here for instance, we also have an
-`attention mask <glossary.html#attention-mask>`__ that the model will use to have a better understanding of the sequence:
-
-
-::
-    {'input_ids': [101, 2057, 2024, 2200, 3407, 2000, 2265, 2017, 1996, 100, 19081, 3075, 1012, 102],
-     'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
-
-You can pass a list of sentences directly to your tokenizer. If your goal is to send them through your model as a
-batch, you probably want to pad them all to the same length, truncate them to the maximum length the model can accept
-and get tensors back. You can specify all of that to the tokenizer:
-
-::
-
-    ## PYTORCH CODE
-    batch = tokenizer(
-        ["We are very happy to show you the 🤗 Transformers library.",
-         "We hope you don't hate it."],
-        padding=True, truncation=True, return_tensors="pt")
-    print(batch)
-    ## TENSORFLOW CODE
-    batch = tokenizer(
-        ["We are very happy to show you the 🤗 Transformers library.",
-         "We hope you don't hate it."],
-        padding=True, truncation=True, return_tensors="tf")
-    print(batch)
-
-The padding is automatically applied on the side the model expect it (in this case, on the right), with the
-padding token the model was pretrained with. The attention mask is also adapted to take the padding into account:
-
-::
-
-    {'input_ids': tensor([[  101,  2057,  2024,  2200,  3407,  2000,  2265,  2017,  1996,   100, 19081,  3075,  1012,   102],
-                          [  101,  2057,  3246,  2017,  2123,  1005,  1056,  5223,  2009,  1012,   102,     0,     0,     0]]), 
-     'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
-                               [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]])}
-
-You can learn more about tokenizers :doc:`here <preprocessing>`.
-
-Using the model
-^^^^^^^^^^^^^^^
-
-Once your input has been preprocessed by the tokenizer, you can directly send it to the model. As we mentioned, it will
-contain all the relevant information the model needs. If you're using a TensorFlow model, you can directly pass the
-dictionary keys to tensor, for a PyTorch model, you need to unpack the dictionary by adding :obj:`**`.
-
-::
-
-    ## PYTORCH CODE
-    outputs = model(**batch)
-    ## TENSORFLOW CODE
-    outputs = model(batch)
-
-In 🤗 Transformers, all outputs are tuples (with only one element potentially). Here, we get a tuple with just the
-final activations of the model.
-
-::
-
-    (tensor([[-4.1329,  4.3811],
-             [ 0.0818, -0.0418]]),)
-
-.. note::
-
-    All 🤗 Transformers models (PyTorch or TensorFlow) return the activations of the model *before* the final
-    activation function (like SoftMax) since this final activation function is often fused with the loss.
-
-Let's apply the SoftMax activation to get predictions.
-
-::
-
-    ## PYTORCH CODE
-    import torch.nn.functional as F
-    predictions = F.softmax(outputs[0], dim=-1)
-    print(predictions)
-    ## TENSORFLOW CODE
-    predictions = tf.nn.softmax(outputs[0], axis=-1)
-    print(predictions)
-
-We can see we get the numbers from before:
-
-::
-
-    tensor([[2.0060e-04, 9.9980e-01],
-            [5.3086e-01, 4.6914e-01]])
-
-If you have labels, you can provide them to the model, it will return a tuple with the loss and the final activations.
-
-::
-
-    ## PYTORCH CODE
-    import torch
-    outputs = model(**batch, labels = torch.tensor([1, 0])
-    ## TENSORFLOW CODE
-    import tensorflow as tf
-    outputs = model(batch, labels = tf.constant([1, 0])
-
-Models are standard `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__ or
-`tf.keras.Model <https://www.tensorflow.org/api_docs/python/tf/keras/Model>`__ so you can use them in your usual
-training loop. 🤗 Transformers also provides a :class:`~transformers.Trainer` (or :class:`~transformers.TFTrainer` if
-you are using TensorFlow) class to help with your training (taking care of things such as distributed training, mixed
-precision, etc.). See the training tutorial (coming soon) for more details.
-
-Once your model is fine-tuned, you can save it with its tokenizer the following way:
-
-::
-
-    tokenizer.save_pretrained(save_directory)
-    model.save_pretrained(save_directory)
-
-You can then load this model back using the :func:`~transformers.AutoModel.from_pretrained` method by passing the
-directory name instead of the model name. One cool feature of 🤗 Transformers is that you can easily switch between
-PyTorch and TensorFlow: any model saved as before can be loaded back either in PyTorch or TensorFlow. If you are
-loading a saved PyTorch model in a TensorFlow model, use :func:`~transformers.TFAutoModel.from_pretrained` like this:
-
-::
-
-    tokenizer = AutoTokenizer.from_pretrained(save_directory)
-    model = TFAutoModel.from_pretrained(save_directory, from_pt=True)
-
-and if you are loading a saved TensorFlow model in a PyTorch model, you should use the following code:
-
-::
-
-    tokenizer = AutoTokenizer.from_pretrained(save_directory)
-    model = AutoModel.from_pretrained(save_directory, from_tf=True)
-
-Lastly, you can also ask the model to return all hidden states and all attention weights if you need them:
-
-
-::
-
-    ## PYTORCH CODE
-    outputs = model(**batch, output_hidden_states=True, output_attentions=True)
-    all_hidden_states, all_attentions = outputs[-2:]
-    ## TENSORFLOW CODE
-    outputs = model(batch, output_hidden_states=True, output_attentions=True)
-    all_hidden_states, all_attentions = outputs[-2:]
-
-Accessing the code
-^^^^^^^^^^^^^^^^^^
-
-The :obj:`AutoModel` and :obj:`AutoTokenizer` classes are just shortcuts that will automatically work with any
-pretrained model. Behind the scenes, the library has one model class per combination of architecture plus class, so the
-code is easy to access and tweak if you need to.
-
-In our previous example, the model was called "distilbert-base-uncased-finetuned-sst-2-english", which means it's
-using the :doc:`DistilBERT </model_doc/distilbert>` architecture. The model automatically created is then a
-:class:`~transformers.DistilBertForSequenceClassification`. You can look at its documentation for all details relevant
-to that specific model, or browse the source code. This is how you would directly instantiate model and tokenizer
-without the auto magic:
-
-::
-
-    ## PYTORCH CODE
-    from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
-    model_name = "distilbert-base-uncased-finetuned-sst-2-english"
-    model = DistilBertForSequenceClassification.from_pretrained(model_name)
-    tokenizer = DistilBertTokenizer.from_pretrained(model_name)
-    ## TENSORFLOW CODE
-    from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
-    model_name = "distilbert-base-uncased-finetuned-sst-2-english"
-    model = TFDistilBertForSequenceClassification.from_pretrained(model_name)
-    tokenizer = DistilBertTokenizer.from_pretrained(model_name)
-
-Customizing the model
-^^^^^^^^^^^^^^^^^^^^^
-
-If you want to change how the model itself is built, you can define your custom configuration class. Each architecture
-comes with its own relevant configuration (in the case of DistilBERT, :class:`~transformers.DistilBertConfig`) which
-allows you to specify any of the hidden dimension, dropout rate etc. If you do core modifications, like changing the
-hidden size, you won't be able to use a pretrained model anymore and will need to train from scratch. You would then
-instantiate the model directly from this configuration.
-
-Here we use the predefined vocabulary of DistilBERT (hence load the tokenizer with the
-:func:`~transformers.DistilBertTokenizer.from_pretrained` method) and initialize the model from scratch (hence
-instantiate the model from the configuration instead of using the
-:func:`~transformers.DistilBertForSequenceClassification.from_pretrained` method).
-
-::
-
-    ## PYTORCH CODE
-    from transformers import DistilBertConfig, DistilBertTokenizer, DistilBertForSequenceClassification
-    config = DistilBertConfig(n_heads=8, dim=512, hidden_dim=4*512)
-    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
-    model = DistilBertForSequenceClassification(config)
-    ## TENSORFLOW CODE
-    from transformers import DistilBertConfig, DistilBertTokenizer, TFDistilBertForSequenceClassification
-    config = DistilBertConfig(n_heads=8, dim=512, hidden_dim=4*512)
-    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
-    model = TFDistilBertForSequenceClassification(config)
-
-For something that only changes the head of the model (for instance, the number of labels), you can still use a
-pretrained model for the body. For instance, let's define a classifier for 10 different labels using a pretrained body.
-We could create a configuration with all the default values and just change the number of labels, but more easily, you
-can directly pass any argument a configuration would take to the :func:`from_pretrained` method and it will update the
-default configuration with it:
-
-::
-
-    ## PYTORCH CODE
-    from transformers import DistilBertConfig, DistilBertTokenizer, DistilBertForSequenceClassification
-    model_name = "distilbert-base-uncased"
-    model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=10)
-    tokenizer = DistilBertTokenizer.from_pretrained(model_name)
-    ## TENSORFLOW CODE
-    from transformers import DistilBertConfig, DistilBertTokenizer, TFDistilBertForSequenceClassification
-    model_name = "distilbert-base-uncased"
-    model = TFDistilBertForSequenceClassification.from_pretrained(model_name, num_labels=10)
-    tokenizer = DistilBertTokenizer.from_pretrained(model_name)
+Quick tour
+==========
+
+Let's have a quick look at the 🤗 Transformers library features. The library downloads pretrained models for
+Natural Language Understanding (NLU) tasks, such as analyzing the sentiment of a text, and Natural Language Generation (NLG),
+such as completing a prompt with new text or translating into another language.
+
+First we will see how to easily leverage the pipeline API to quickly use those pretrained models at inference. Then, we
+will dig a little bit more and see how the library gives you access to those models and helps you preprocess your data.
+
+.. note::
+
+    All code examples presented in the documentation have a switch on the top left for PyTorch versus TensorFlow. If
+    not, the code is expected to work for both backends without any change needed.
+
+Getting started on a task with a pipeline
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The easiest way to use a pretrained model on a given task is to use :func:`~transformers.pipeline`. 🤗 Transformers
+provides the following tasks out of the box:
+
+- Sentiment analysis: is a text positive or negative?
+- Text generation (in English): provide a prompt and the model will generate what follows.
+- Named entity recognition (NER): in an input sentence, label each word with the entity it represents (person, place,
+  etc.)
+- Question answering: provide the model with some context and a question, extract the answer from the context.
+- Filling masked text: given a text with masked words (e.g., replaced by ``[MASK]``), fill the blanks.
+- Summarization: generate a summary of a long text.
+- Translation: translate a text into another language.
+- Feature extraction: return a tensor representation of the text.
+
+Let's see how this works for sentiment analysis (the other tasks are all covered in the
+:doc:`task summary </task_summary>`):
+
+.. code-block::
+
+    >>> from transformers import pipeline
+    >>> classifier = pipeline('sentiment-analysis')
+
+When typing this command for the first time, a pretrained model and its tokenizer are downloaded and cached. We will
+look at both later on, but as an introduction the tokenizer's job is to preprocess the text for the model, which is
+then responsible for making predictions. The pipeline groups all of that together, and post-processes the predictions to
+make them readable. For instance:
+
+
+.. code-block::
+
+    >>> classifier('We are very happy to show you the 🤗 Transformers library.')
+    [{'label': 'POSITIVE', 'score': 0.9997795224189758}]
+
+That's encouraging! You can use it on a list of sentences, which will be preprocessed then fed to the model as a
+`batch`, returning a list of dictionaries like this one:
+
+.. code-block::
+
+    >>> results = classifier(["We are very happy to show you the 🤗 Transformers library.",
+    ...            "We hope you don't hate it."])
+    >>> for result in results:
+    ...     print(f"label: {result['label']}, with score: {round(result['score'], 4)}")
+    label: POSITIVE, with score: 0.9998
+    label: NEGATIVE, with score: 0.5309
+
+You can see the second sentence has been classified as negative (it needs to be positive or negative) but its score is
+fairly neutral.
+
+By default, the model downloaded for this pipeline is called "distilbert-base-uncased-finetuned-sst-2-english". We can
+look at its `model page <https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english>`__ to get more
+information about it. It uses the :doc:`DistilBERT architecture </model_doc/distilbert>` and has been fine-tuned on a
+dataset called SST-2 for the sentiment analysis task.
+
+Let's say we want to use another model; for instance, one that has been trained on French data. We can search through
+the `model hub <https://huggingface.co/models>`__ that gathers models pretrained on a lot of data by research labs, but
+also community models (usually fine-tuned versions of those big models on a specific dataset). Applying the tags
+"French" and "text-classification" gives back a suggestion "nlptown/bert-base-multilingual-uncased-sentiment". Let's
+see how we can use it.
+
+You can directly pass the name of the model to use to :func:`~transformers.pipeline`:
+
+.. code-block::
+
+    >>> classifier = pipeline('sentiment-analysis', model="nlptown/bert-base-multilingual-uncased-sentiment")
+
+This classifier can now deal with texts in English, French, but also Dutch, German, Italian and Spanish! You can also
+replace that name by a local folder where you have saved a pretrained model (see below). You can also pass a model
+object and its associated tokenizer.
+
+We will need two classes for this. The first is :class:`~transformers.AutoTokenizer`, which we will use to download the
+tokenizer associated with the model we picked and instantiate it. The second is
+:class:`~transformers.AutoModelForSequenceClassification` (or
+:class:`~transformers.TFAutoModelForSequenceClassification` if you are using TensorFlow), which we will use to download
+the model itself. Note that if we were using the library on another task, the class of the model would change. The
+:doc:`task summary </task_summary>` tutorial summarizes which class is used for which task.
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> from transformers import AutoTokenizer, AutoModelForSequenceClassification
+    >>> ## TENSORFLOW CODE
+    >>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
+
+Now, to download the models and tokenizer we found previously, we just have to use the
+:func:`~transformers.AutoModelForSequenceClassification.from_pretrained` method (feel free to replace ``model_name`` by
+any other model from the model hub):
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
+    >>> model = AutoModelForSequenceClassification.from_pretrained(model_name)
+    >>> tokenizer = AutoTokenizer.from_pretrained(model_name)
+    >>> pipe = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)
+    >>> ## TENSORFLOW CODE
+    >>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
+    >>> # This model only exists in PyTorch, so we use the `from_pt` flag to import that model in TensorFlow.
+    >>> model = TFAutoModelForSequenceClassification.from_pretrained(model_name, from_pt=True) 
+    >>> tokenizer = AutoTokenizer.from_pretrained(model_name)
+    >>> classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)
+
+If you don't find a model that has been pretrained on some data similar to yours, you will need to fine-tune a
+pretrained model on your data. We provide :doc:`example scripts </examples>` to do so. Once you're done, don't forget
+to share your fine-tuned model on the hub with the community, using :doc:`this tutorial </model_sharing>`.
+
+.. _pretrained-model:
+
+Under the hood: pretrained models
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Let's now see what happens beneath the hood when using those pipelines. As we saw, the model and tokenizer are created
+using the :obj:`from_pretrained` method:
+
+::
+
+    >>> ## PYTORCH CODE
+    >>> from transformers import AutoTokenizer, AutoModelForSequenceClassification
+    >>> model_name = "distilbert-base-uncased-finetuned-sst-2-english"
+    >>> pt_model = AutoModelForSequenceClassification.from_pretrained(model_name)
+    >>> tokenizer = AutoTokenizer.from_pretrained(model_name)
+    >>> ## TENSORFLOW CODE
+    >>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
+    >>> model_name = "distilbert-base-uncased-finetuned-sst-2-english"
+    >>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(model_name)
+    >>> tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+Using the tokenizer
+^^^^^^^^^^^^^^^^^^^
+
+We mentioned the tokenizer is responsible for the preprocessing of your texts. First, it will split a given text in
+words (or part of words, punctuation symbols, etc.) usually called `tokens`. There are multiple rules that can govern
+that process, which is why we need to instantiate the tokenizer using the name of the model, to make sure we use the
+same rules as when the model was pretrained.
+
+The second step is to convert those `tokens` into numbers, to be able to build a tensor out of them and feed them to
+the model. To do this, the tokenizer has a `vocab`, which is the part we download when we instantiate it with the
+:obj:`from_pretrained` method, since we need to use the same `vocab` as when the model was pretrained.
+
+To apply these steps on a given text, we can just feed it to our tokenizer:
+
+.. code-block::
+
+    >>> inputs = tokenizer("We are very happy to show you the 🤗 Transformers library.")
+
+This returns a dictionary mapping strings to lists of ints. It contains the `ids of the tokens <glossary.html#input-ids>`__,
+as mentioned before, but also additional arguments that will be useful to the model. Here for instance, we also have an
+`attention mask <glossary.html#attention-mask>`__ that the model will use to have a better understanding of the sequence:
+
+
+.. code-block::
+
+    >>> print(inputs)
+    {'input_ids': [101, 2057, 2024, 2200, 3407, 2000, 2265, 2017, 1996, 100, 19081, 3075, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
+
+You can pass a list of sentences directly to your tokenizer. If your goal is to send them through your model as a
+batch, you probably want to pad them all to the same length, truncate them to the maximum length the model can accept
+and get tensors back. You can specify all of that to the tokenizer:
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> pt_batch = tokenizer(
+    ...     ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."],
+    ...     padding=True,
+    ...     truncation=True,
+    ...     return_tensors="pt"
+    ... )
+    >>> ## TENSORFLOW CODE
+    >>> tf_batch = tokenizer(
+    ...     ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."],
+    ...     padding=True,
+    ...     truncation=True,
+    ...     return_tensors="tf"
+    ... )
+
+The padding is automatically applied on the side the model expects it (in this case, on the right), with the
+padding token the model was pretrained with. The attention mask is also adapted to take the padding into account:
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> for key, value in pt_batch.items():
+    ...     print(f"{key}: {value.numpy().tolist()}")
+    input_ids: [[101, 2057, 2024, 2200, 3407, 2000, 2265, 2017, 1996, 100, 19081, 3075, 1012, 102], [101, 2057, 3246, 2017, 2123, 1005, 1056, 5223, 2009, 1012, 102, 0, 0, 0]]
+    attention_mask: [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]]
+    >>> ## TENSORFLOW CODE
+    >>> for key, value in tf_batch.items():
+    ...     print(f"{key}: {value.numpy().tolist()}")
+    input_ids: [[101, 2057, 2024, 2200, 3407, 2000, 2265, 2017, 1996, 100, 19081, 3075, 1012, 102], [101, 2057, 3246, 2017, 2123, 1005, 1056, 5223, 2009, 1012, 102, 0, 0, 0]]
+    attention_mask: [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]]
+
+You can learn more about tokenizers :doc:`here <preprocessing>`.
+
+Using the model
+^^^^^^^^^^^^^^^
+
+Once your input has been preprocessed by the tokenizer, you can directly send it to the model. As we mentioned, it will
+contain all the relevant information the model needs. If you're using a TensorFlow model, you can directly pass the
+dictionary mapping keys to tensors; for a PyTorch model, you need to unpack the dictionary by adding :obj:`**`.
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> pt_outputs = pt_model(**pt_batch)
+    >>> ## TENSORFLOW CODE
+    >>> tf_outputs = tf_model(tf_batch)
+
+In 🤗 Transformers, all outputs are tuples (with only one element potentially). Here, we get a tuple with just the
+final activations of the model.
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> print(pt_outputs)
+    (tensor([[-4.0833,  4.3364],
+            [ 0.0818, -0.0418]], grad_fn=<AddmmBackward>),)
+    >>> ## TENSORFLOW CODE
+    >>> print(tf_outputs)
+    (<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
+    array([[-4.0832963 ,  4.3364134 ],
+           [ 0.08181238, -0.04178794]], dtype=float32)>,)
+
+.. note::
+
+    All 🤗 Transformers models (PyTorch or TensorFlow) return the activations of the model *before* the final
+    activation function (like SoftMax) since this final activation function is often fused with the loss.
+
+Let's apply the SoftMax activation to get predictions.
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> import torch.nn.functional as F
+    >>> pt_predictions = F.softmax(pt_outputs[0], dim=-1)
+    >>> ## TENSORFLOW CODE
+    >>> import tensorflow as tf
+    >>> tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)
+
+We can see we get the numbers from before:
+
+.. code-block::
+
+    >>> ## TENSORFLOW CODE
+    >>> print(tf_predictions)
+    tf.Tensor(
+    [[2.2042994e-04 9.9977952e-01]
+     [5.3086078e-01 4.6913919e-01]], shape=(2, 2), dtype=float32)
+    >>> ## PYTORCH CODE
+    >>> print(pt_predictions)
+    tensor([[2.2043e-04, 9.9978e-01],
+            [5.3086e-01, 4.6914e-01]], grad_fn=<SoftmaxBackward>)
+
+If you provide labels to the model, it will return a tuple with the loss and the final activations.
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> import torch
+    >>> pt_outputs = pt_model(**pt_batch, labels = torch.tensor([1, 0]))
+    >>> ## TENSORFLOW CODE
+    >>> import tensorflow as tf
+    >>> tf_outputs = tf_model(tf_batch, labels = tf.constant([1, 0]))
+
+Models are standard `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__ or
+`tf.keras.Model <https://www.tensorflow.org/api_docs/python/tf/keras/Model>`__ so you can use them in your usual
+training loop. 🤗 Transformers also provides a :class:`~transformers.Trainer` (or :class:`~transformers.TFTrainer` if
+you are using TensorFlow) class to help with your training (taking care of things such as distributed training, mixed
+precision, etc.). See the training tutorial (coming soon) for more details.
+
+Once your model is fine-tuned, you can save it with its tokenizer the following way:
+
+::
+
+    tokenizer.save_pretrained(save_directory)
+    model.save_pretrained(save_directory)
+
+You can then load this model back using the :func:`~transformers.AutoModel.from_pretrained` method by passing the
+directory name instead of the model name. One cool feature of 🤗 Transformers is that you can easily switch between
+PyTorch and TensorFlow: any model saved as before can be loaded back either in PyTorch or TensorFlow. If you are
+loading a saved PyTorch model in a TensorFlow model, use :func:`~transformers.TFAutoModel.from_pretrained` like this:
+
+::
+
+    tokenizer = AutoTokenizer.from_pretrained(save_directory)
+    model = TFAutoModel.from_pretrained(save_directory, from_pt=True)
+
+and if you are loading a saved TensorFlow model in a PyTorch model, you should use the following code:
+
+::
+
+    tokenizer = AutoTokenizer.from_pretrained(save_directory)
+    model = AutoModel.from_pretrained(save_directory, from_tf=True)
+
+Lastly, you can also ask the model to return all hidden states and all attention weights if you need them:
+
+
+::
+
+    >>> ## PYTORCH CODE
+    >>> pt_outputs = pt_model(**pt_batch, output_hidden_states=True, output_attentions=True)
+    >>> all_hidden_states, all_attentions = pt_outputs[-2:]
+    >>> ## TENSORFLOW CODE
+    >>> tf_outputs = tf_model(tf_batch, output_hidden_states=True, output_attentions=True)
+    >>> all_hidden_states, all_attentions = tf_outputs[-2:]
+
+Accessing the code
+^^^^^^^^^^^^^^^^^^
+
+The :obj:`AutoModel` and :obj:`AutoTokenizer` classes are just shortcuts that will automatically work with any
+pretrained model. Behind the scenes, the library has one model class per combination of architecture plus class, so the
+code is easy to access and tweak if you need to.
+
+In our previous example, the model was called "distilbert-base-uncased-finetuned-sst-2-english", which means it's
+using the :doc:`DistilBERT </model_doc/distilbert>` architecture. The model automatically created is then a
+:class:`~transformers.DistilBertForSequenceClassification`. You can look at its documentation for all details relevant
+to that specific model, or browse the source code. This is how you would directly instantiate model and tokenizer
+without the auto magic:
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
+    >>> model_name = "distilbert-base-uncased-finetuned-sst-2-english"
+    >>> model = DistilBertForSequenceClassification.from_pretrained(model_name)
+    >>> tokenizer = DistilBertTokenizer.from_pretrained(model_name)
+    >>> ## TENSORFLOW CODE
+    >>> from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
+    >>> model_name = "distilbert-base-uncased-finetuned-sst-2-english"
+    >>> model = TFDistilBertForSequenceClassification.from_pretrained(model_name)
+    >>> tokenizer = DistilBertTokenizer.from_pretrained(model_name)
+
+Customizing the model
+^^^^^^^^^^^^^^^^^^^^^
+
+If you want to change how the model itself is built, you can define your custom configuration class. Each architecture
+comes with its own relevant configuration (in the case of DistilBERT, :class:`~transformers.DistilBertConfig`) which
+allows you to specify any of the hidden dimension, dropout rate etc. If you do core modifications, like changing the
+hidden size, you won't be able to use a pretrained model anymore and will need to train from scratch. You would then
+instantiate the model directly from this configuration.
+
+Here we use the predefined vocabulary of DistilBERT (hence load the tokenizer with the
+:func:`~transformers.DistilBertTokenizer.from_pretrained` method) and initialize the model from scratch (hence
+instantiate the model from the configuration instead of using the
+:func:`~transformers.DistilBertForSequenceClassification.from_pretrained` method).
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> from transformers import DistilBertConfig, DistilBertTokenizer, DistilBertForSequenceClassification
+    >>> config = DistilBertConfig(n_heads=8, dim=512, hidden_dim=4*512)
+    >>> tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+    >>> model = DistilBertForSequenceClassification(config)
+    >>> ## TENSORFLOW CODE
+    >>> from transformers import DistilBertConfig, DistilBertTokenizer, TFDistilBertForSequenceClassification
+    >>> config = DistilBertConfig(n_heads=8, dim=512, hidden_dim=4*512)
+    >>> tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+    >>> model = TFDistilBertForSequenceClassification(config)
+
+For something that only changes the head of the model (for instance, the number of labels), you can still use a
+pretrained model for the body. For instance, let's define a classifier for 10 different labels using a pretrained body.
+We could create a configuration with all the default values and just change the number of labels, but more easily, you
+can directly pass any argument a configuration would take to the :func:`from_pretrained` method and it will update the
+default configuration with it:
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> from transformers import DistilBertConfig, DistilBertTokenizer, DistilBertForSequenceClassification
+    >>> model_name = "distilbert-base-uncased"
+    >>> model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=10)
+    >>> tokenizer = DistilBertTokenizer.from_pretrained(model_name)
+    >>> ## TENSORFLOW CODE
+    >>> from transformers import DistilBertConfig, DistilBertTokenizer, TFDistilBertForSequenceClassification
+    >>> model_name = "distilbert-base-uncased"
+    >>> model = TFDistilBertForSequenceClassification.from_pretrained(model_name, num_labels=10)
+    >>> tokenizer = DistilBertTokenizer.from_pretrained(model_name)
diff --git a/docs/source/task_summary.rst b/docs/source/task_summary.rst
index 712d24e8fb..d1157ccccb 100644
--- a/docs/source/task_summary.rst
+++ b/docs/source/task_summary.rst
@@ -50,21 +50,21 @@ a model on a GLUE sequence classification task, you may leverage the
 Here is an example using the pipelines do to sentiment analysis: identifying if a sequence is positive or negative.
 It leverages a fine-tuned model on sst2, which is a GLUE task.
 
-::
-
-    from transformers import pipeline
-
-    nlp = pipeline("sentiment-analysis")
-
-    print(nlp("I hate you"))
-    print(nlp("I love you"))
-
 This returns a label ("POSITIVE" or "NEGATIVE") alongside a score, as follows:
 
-::
+.. code-block::
 
-    [{'label': 'NEGATIVE', 'score': 0.9991129}]
-    [{'label': 'POSITIVE', 'score': 0.99986565}]
+    >>> from transformers import pipeline
+
+    >>> nlp = pipeline("sentiment-analysis")
+
+    >>> result = nlp("I hate you")[0]
+    >>> print(f"label: {result['label']}, with score: {round(result['score'], 4)}")
+    label: NEGATIVE, with score: 0.9991
+
+    >>> result = nlp("I love you")[0]
+    >>> print(f"label: {result['label']}, with score: {round(result['score'], 4)}")
+    label: POSITIVE, with score: 0.9999
 
 
 Here is an example of doing a sequence classification using a model to determine if two sequences are paraphrases
@@ -80,76 +80,72 @@ of each other. The process is the following:
 - Compute the softmax of the result to get probabilities over the classes
 - Print the results
 
-::
+.. code-block::
 
-    ## PYTORCH CODE
-    from transformers import AutoTokenizer, AutoModelForSequenceClassification
-    import torch
+    >>> ## PYTORCH CODE
+    >>> from transformers import AutoTokenizer, AutoModelForSequenceClassification
+    >>> import torch
 
-    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
-    model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc")
+    >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
+    >>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc")
 
-    classes = ["not paraphrase", "is paraphrase"]
+    >>> classes = ["not paraphrase", "is paraphrase"]
 
-    sequence_0 = "The company HuggingFace is based in New York City"
-    sequence_1 = "Apples are especially bad for your health"
-    sequence_2 = "HuggingFace's headquarters are situated in Manhattan"
+    >>> sequence_0 = "The company HuggingFace is based in New York City"
+    >>> sequence_1 = "Apples are especially bad for your health"
+    >>> sequence_2 = "HuggingFace's headquarters are situated in Manhattan"
 
-    paraphrase = tokenizer.encode_plus(sequence_0, sequence_2, return_tensors="pt")
-    not_paraphrase = tokenizer.encode_plus(sequence_0, sequence_1, return_tensors="pt")
+    >>> paraphrase = tokenizer.encode_plus(sequence_0, sequence_2, return_tensors="pt")
+    >>> not_paraphrase = tokenizer.encode_plus(sequence_0, sequence_1, return_tensors="pt")
 
-    paraphrase_classification_logits = model(**paraphrase)[0]
-    not_paraphrase_classification_logits = model(**not_paraphrase)[0]
+    >>> paraphrase_classification_logits = model(**paraphrase)[0]
+    >>> not_paraphrase_classification_logits = model(**not_paraphrase)[0]
 
-    paraphrase_results = torch.softmax(paraphrase_classification_logits, dim=1).tolist()[0]
-    not_paraphrase_results = torch.softmax(not_paraphrase_classification_logits, dim=1).tolist()[0]
+    >>> paraphrase_results = torch.softmax(paraphrase_classification_logits, dim=1).tolist()[0]
+    >>> not_paraphrase_results = torch.softmax(not_paraphrase_classification_logits, dim=1).tolist()[0]
 
-    print("Should be paraphrase")
-    for i in range(len(classes)):
-        print(f"{classes[i]}: {round(paraphrase_results[i] * 100)}%")
-
-    print("\nShould not be paraphrase")
-    for i in range(len(classes)):
-        print(f"{classes[i]}: {round(not_paraphrase_results[i] * 100)}%")
-    ## TENSORFLOW CODE
-    from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
-    import tensorflow as tf
-
-    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
-    model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc")
-
-    classes = ["not paraphrase", "is paraphrase"]
-
-    sequence_0 = "The company HuggingFace is based in New York City"
-    sequence_1 = "Apples are especially bad for your health"
-    sequence_2 = "HuggingFace's headquarters are situated in Manhattan"
-
-    paraphrase = tokenizer.encode_plus(sequence_0, sequence_2, return_tensors="tf")
-    not_paraphrase = tokenizer.encode_plus(sequence_0, sequence_1, return_tensors="tf")
-
-    paraphrase_classification_logits = model(paraphrase)[0]
-    not_paraphrase_classification_logits = model(not_paraphrase)[0]
-
-    paraphrase_results = tf.nn.softmax(paraphrase_classification_logits, axis=1).numpy()[0]
-    not_paraphrase_results = tf.nn.softmax(not_paraphrase_classification_logits, axis=1).numpy()[0]
-
-    print("Should be paraphrase")
-    for i in range(len(classes)):
-        print(f"{classes[i]}: {round(paraphrase_results[i] * 100)}%")
-
-    print("\nShould not be paraphrase")
-    for i in range(len(classes)):
-        print(f"{classes[i]}: {round(not_paraphrase_results[i] * 100)}%")
-
-This outputs the following results:
-
-::
-
-    Should be paraphrase
+    >>> # Should be paraphrase
+    >>> for i in range(len(classes)):
+    ...     print(f"{classes[i]}: {int(round(paraphrase_results[i] * 100))}%")
     not paraphrase: 10%
     is paraphrase: 90%
 
-    Should not be paraphrase
+    >>> # Should not be paraphrase
+    >>> for i in range(len(classes)):
+    ...     print(f"{classes[i]}: {int(round(not_paraphrase_results[i] * 100))}%")
+    not paraphrase: 94%
+    is paraphrase: 6%
+    >>> ## TENSORFLOW CODE
+    >>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
+    >>> import tensorflow as tf
+
+    >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
+    >>> model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc")
+
+    >>> classes = ["not paraphrase", "is paraphrase"]
+
+    >>> sequence_0 = "The company HuggingFace is based in New York City"
+    >>> sequence_1 = "Apples are especially bad for your health"
+    >>> sequence_2 = "HuggingFace's headquarters are situated in Manhattan"
+
+    >>> paraphrase = tokenizer.encode_plus(sequence_0, sequence_2, return_tensors="tf")
+    >>> not_paraphrase = tokenizer.encode_plus(sequence_0, sequence_1, return_tensors="tf")
+
+    >>> paraphrase_classification_logits = model(paraphrase)[0]
+    >>> not_paraphrase_classification_logits = model(not_paraphrase)[0]
+
+    >>> paraphrase_results = tf.nn.softmax(paraphrase_classification_logits, axis=1).numpy()[0]
+    >>> not_paraphrase_results = tf.nn.softmax(not_paraphrase_classification_logits, axis=1).numpy()[0]
+
+    >>> # Should be paraphrase
+    >>> for i in range(len(classes)):
+    ...     print(f"{classes[i]}: {int(round(paraphrase_results[i] * 100))}%")
+    not paraphrase: 10%
+    is paraphrase: 90%
+
+    >>> # Should not be paraphrase
+    >>> for i in range(len(classes)):
+    ...     print(f"{classes[i]}: {int(round(not_paraphrase_results[i] * 100))}%")
     not paraphrase: 94%
     is paraphrase: 6%
 
@@ -163,28 +159,30 @@ a model on a SQuAD task, you may leverage the `run_squad.py`.
 Here is an example using the pipelines do to question answering: extracting an answer from a text given a question.
 It leverages a fine-tuned model on SQuAD.
 
-::
+.. code-block::
 
-    from transformers import pipeline
+    >>> from transformers import pipeline
 
-    nlp = pipeline("question-answering")
+    >>> nlp = pipeline("question-answering")
 
-    context = r"""
-    Extractive Question Answering is the task of extracting an answer from a text given a question. An example of a
-    question answering dataset is the SQuAD dataset, which is entirely based on that task. If you would like to fine-tune
-    a model on a SQuAD task, you may leverage the `run_squad.py`.
-    """
-
-    print(nlp(question="What is extractive question answering?", context=context))
-    print(nlp(question="What is a good example of a question answering dataset?", context=context))
+    >>> context = r"""
+    ... Extractive Question Answering is the task of extracting an answer from a text given a question. An example of a
+    ... question answering dataset is the SQuAD dataset, which is entirely based on that task. If you would like to fine-tune
+    ... a model on a SQuAD task, you may leverage the examples/question-answering/run_squad.py script.
+    ... """
 
 This returns an answer extracted from the text, a confidence score, alongside "start" and "end" values which
 are the positions of the extracted answer in the text.
 
-::
+.. code-block::
 
-    {'score': 0.622232091629833, 'start': 34, 'end': 96, 'answer': 'the task of extracting an answer from a text given a question.'}
-    {'score': 0.5115299158662765, 'start': 147, 'end': 161, 'answer': 'SQuAD dataset,'}
+    >>> result = nlp(question="What is extractive question answering?", context=context)
+    >>> print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")
+    Answer: 'the task of extracting an answer from a text given a question.', score: 0.6226, start: 34, end: 96
+
+    >>> result = nlp(question="What is a good example of a question answering dataset?", context=context)
+    >>> print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")
+    Answer: 'SQuAD dataset,', score: 0.5053, start: 147, end: 161
 
 
 Here is an example of question answering using a model and a tokenizer. The process is the following:
@@ -200,92 +198,91 @@ Here is an example of question answering using a model and a tokenizer. The proc
 - Fetch the tokens from the identified start and stop values, convert those tokens to a string.
 - Print the results
 
-::
+.. code-block::
 
-    ## PYTORCH CODE
-    from transformers import AutoTokenizer, AutoModelForQuestionAnswering
-    import torch
+    >>> ## PYTORCH CODE
+    >>> from transformers import AutoTokenizer, AutoModelForQuestionAnswering
+    >>> import torch
 
-    tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
-    model = AutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
+    >>> tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
+    >>> model = AutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
 
-    text = r"""
-    🤗 Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose
-    architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural
-    Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between
-    TensorFlow 2.0 and PyTorch.
-    """
+    >>> text = r"""
+    ... 🤗 Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose
+    ... architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural
+    ... Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between
+    ... TensorFlow 2.0 and PyTorch.
+    ... """
 
-    questions = [
-        "How many pretrained models are available in 🤗 Transformers?",
-        "What does 🤗 Transformers provide?",
-        "🤗 Transformers provides interoperability between which frameworks?",
-    ]
-
-    for question in questions:
-        inputs = tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors="pt")
-        input_ids = inputs["input_ids"].tolist()[0]
-
-        text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
-        answer_start_scores, answer_end_scores = model(**inputs)
-
-        answer_start = torch.argmax(
-            answer_start_scores
-        )  # Get the most likely beginning of answer with the argmax of the score
-        answer_end = torch.argmax(answer_end_scores) + 1  # Get the most likely end of answer with the argmax of the score
-
-        answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
-
-        print(f"Question: {question}")
-        print(f"Answer: {answer}\n")
-    ## TENSORFLOW CODE
-    from transformers import AutoTokenizer, TFAutoModelForQuestionAnswering
-    import tensorflow as tf
-
-    tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
-    model = TFAutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
-
-    text = r"""
-    🤗 Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose
-    architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural
-    Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between
-    TensorFlow 2.0 and PyTorch.
-    """
-
-    questions = [
-        "How many pretrained models are available in 🤗 Transformers?",
-        "What does 🤗 Transformers provide?",
-        "🤗 Transformers provides interoperability between which frameworks?",
-    ]
-
-    for question in questions:
-        inputs = tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors="tf")
-        input_ids = inputs["input_ids"].numpy()[0]
-
-        text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
-        answer_start_scores, answer_end_scores = model(inputs)
-
-        answer_start = tf.argmax(
-            answer_start_scores, axis=1
-        ).numpy()[0]  # Get the most likely beginning of answer with the argmax of the score
-        answer_end = (
-            tf.argmax(answer_end_scores, axis=1) + 1
-        ).numpy()[0]  # Get the most likely end of answer with the argmax of the score
-        answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
-
-        print(f"Question: {question}")
-        print(f"Answer: {answer}\n")
-
-This outputs the questions followed by the predicted answers:
-
-::
+    >>> questions = [
+    ...     "How many pretrained models are available in 🤗 Transformers?",
+    ...     "What does 🤗 Transformers provide?",
+    ...     "🤗 Transformers provides interoperability between which frameworks?",
+    ... ]
 
+    >>> for question in questions:
+    ...     inputs = tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors="pt")
+    ...     input_ids = inputs["input_ids"].tolist()[0]
+    ...
+    ...     text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
+    ...     answer_start_scores, answer_end_scores = model(**inputs)
+    ...
+    ...     answer_start = torch.argmax(
+    ...         answer_start_scores
+    ...     )  # Get the most likely beginning of answer with the argmax of the score
+    ...     answer_end = torch.argmax(answer_end_scores) + 1  # Get the most likely end of answer with the argmax of the score
+    ...
+    ...     answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
+    ...
+    ...     print(f"Question: {question}")
+    ...     print(f"Answer: {answer}")
     Question: How many pretrained models are available in 🤗 Transformers?
     Answer: over 32 +
-
     Question: What does 🤗 Transformers provide?
     Answer: general - purpose architectures
+    Question: 🤗 Transformers provides interoperability between which frameworks?
+    Answer: tensorflow 2 . 0 and pytorch
+    >>> ## TENSORFLOW CODE
+    >>> from transformers import AutoTokenizer, TFAutoModelForQuestionAnswering
+    >>> import tensorflow as tf
 
+    >>> tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
+    >>> model = TFAutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
+
+    >>> text = r"""
+    ... 🤗 Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose
+    ... architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural
+    ... Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between
+    ... TensorFlow 2.0 and PyTorch.
+    ... """
+
+    >>> questions = [
+    ...     "How many pretrained models are available in 🤗 Transformers?",
+    ...     "What does 🤗 Transformers provide?",
+    ...     "🤗 Transformers provides interoperability between which frameworks?",
+    ... ]
+
+    >>> for question in questions:
+    ...     inputs = tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors="tf")
+    ...     input_ids = inputs["input_ids"].numpy()[0]
+    ...
+    ...     text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
+    ...     answer_start_scores, answer_end_scores = model(inputs)
+    ...
+    ...     answer_start = tf.argmax(
+    ...         answer_start_scores, axis=1
+    ...     ).numpy()[0]  # Get the most likely beginning of answer with the argmax of the score
+    ...     answer_end = (
+    ...         tf.argmax(answer_end_scores, axis=1) + 1
+    ...     ).numpy()[0]  # Get the most likely end of answer with the argmax of the score
+    ...     answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
+    ...
+    ...     print(f"Question: {question}")
+    ...     print(f"Answer: {answer}")
+    Question: How many pretrained models are available in 🤗 Transformers?
+    Answer: over 32 +
+    Question: What does 🤗 Transformers provide?
+    Answer: general - purpose architectures
     Question: 🤗 Transformers provides interoperability between which frameworks?
     Answer: tensorflow 2 . 0 and pytorch
 
@@ -313,25 +310,44 @@ see `Lewis, Lui, Goyal et al. <https://arxiv.org/abs/1910.13461>`__, part 4.2).
 
 Here is an example of using pipelines to replace a mask from a sequence:
 
-::
+.. code-block::
 
-    from transformers import pipeline
+    >>> from transformers import pipeline
 
-    nlp = pipeline("fill-mask")
-    print(nlp(f"HuggingFace is creating a {nlp.tokenizer.mask_token} that the community uses to solve NLP tasks."))
+    >>> nlp = pipeline("fill-mask")
 
 This outputs the sequences with the mask filled, the confidence score as well as the token id in the tokenizer
 vocabulary:
 
-::
+.. code-block::
 
-    [
-        {'sequence': '<s> HuggingFace is creating a tool that the community uses to solve NLP tasks.</s>', 'score': 0.15627853572368622, 'token': 3944},
-        {'sequence': '<s> HuggingFace is creating a framework that the community uses to solve NLP tasks.</s>', 'score': 0.11690319329500198, 'token': 7208},
-        {'sequence': '<s> HuggingFace is creating a library that the community uses to solve NLP tasks.</s>', 'score': 0.058063216507434845, 'token': 5560},
-        {'sequence': '<s> HuggingFace is creating a database that the community uses to solve NLP tasks.</s>', 'score': 0.04211743175983429, 'token': 8503},
-        {'sequence': '<s> HuggingFace is creating a prototype that the community uses to solve NLP tasks.</s>', 'score': 0.024718601256608963, 'token': 17715}
-    ]
+    >>> from pprint import pprint
+    >>> pprint(nlp(f"HuggingFace is creating a {nlp.tokenizer.mask_token} that the community uses to solve NLP tasks."))
+    [{'score': 0.1792745739221573,
+      'sequence': '<s>HuggingFace is creating a tool that the community uses to '
+                  'solve NLP tasks.</s>',
+      'token': 3944,
+      'token_str': 'Ġtool'},
+     {'score': 0.11349421739578247,
+      'sequence': '<s>HuggingFace is creating a framework that the community uses '
+                  'to solve NLP tasks.</s>',
+      'token': 7208,
+      'token_str': 'Ġframework'},
+     {'score': 0.05243554711341858,
+      'sequence': '<s>HuggingFace is creating a library that the community uses to '
+                  'solve NLP tasks.</s>',
+      'token': 5560,
+      'token_str': 'Ġlibrary'},
+     {'score': 0.03493533283472061,
+      'sequence': '<s>HuggingFace is creating a database that the community uses '
+                  'to solve NLP tasks.</s>',
+      'token': 8503,
+      'token_str': 'Ġdatabase'},
+     {'score': 0.02860250137746334,
+      'sequence': '<s>HuggingFace is creating a prototype that the community uses '
+                  'to solve NLP tasks.</s>',
+      'token': 17715,
+      'token_str': 'Ġprototype'}]
 
 Here is an example doing masked language modeling using a model and a tokenizer. The process is the following:
 
@@ -345,51 +361,48 @@ Here is an example doing masked language modeling using a model and a tokenizer.
 - Retrieve the top 5 tokens using the PyTorch :obj:`topk` or TensorFlow :obj:`top_k` methods.
 - Replace the mask token by the tokens and print the results
 
-::
+.. code-block::
 
-    ## PYTORCH CODE
-    from transformers import AutoModelWithLMHead, AutoTokenizer
-    import torch
+    >>> ## PYTORCH CODE
+    >>> from transformers import AutoModelWithLMHead, AutoTokenizer
+    >>> import torch
 
-    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
-    model = AutoModelWithLMHead.from_pretrained("distilbert-base-cased")
+    >>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
+    >>> model = AutoModelWithLMHead.from_pretrained("distilbert-base-cased")
 
-    sequence = f"Distilled models are smaller than the models they mimic. Using them instead of the large versions would help {tokenizer.mask_token} our carbon footprint."
+    >>> sequence = f"Distilled models are smaller than the models they mimic. Using them instead of the large versions would help {tokenizer.mask_token} our carbon footprint."
 
-    input = tokenizer.encode(sequence, return_tensors="pt")
-    mask_token_index = torch.where(input == tokenizer.mask_token_id)[1]
+    >>> input = tokenizer.encode(sequence, return_tensors="pt")
+    >>> mask_token_index = torch.where(input == tokenizer.mask_token_id)[1]
 
-    token_logits = model(input)[0]
-    mask_token_logits = token_logits[0, mask_token_index, :]
+    >>> token_logits = model(input)[0]
+    >>> mask_token_logits = token_logits[0, mask_token_index, :]
 
-    top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
+    >>> top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
+    >>> ## TENSORFLOW CODE
+    >>> from transformers import TFAutoModelWithLMHead, AutoTokenizer
+    >>> import tensorflow as tf
 
-    for token in top_5_tokens:
-        print(sequence.replace(tokenizer.mask_token, tokenizer.decode([token])))
-    ## TENSORFLOW CODE
-    from transformers import TFAutoModelWithLMHead, AutoTokenizer
-    import tensorflow as tf
+    >>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
+    >>> model = TFAutoModelWithLMHead.from_pretrained("distilbert-base-cased")
 
-    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
-    model = TFAutoModelWithLMHead.from_pretrained("distilbert-base-cased")
+    >>> sequence = f"Distilled models are smaller than the models they mimic. Using them instead of the large versions would help {tokenizer.mask_token} our carbon footprint."
 
-    sequence = f"Distilled models are smaller than the models they mimic. Using them instead of the large versions would help {tokenizer.mask_token} our carbon footprint."
+    >>> input = tokenizer.encode(sequence, return_tensors="tf")
+    >>> mask_token_index = tf.where(input == tokenizer.mask_token_id)[0, 1]
 
-    input = tokenizer.encode(sequence, return_tensors="tf")
-    mask_token_index = tf.where(input == tokenizer.mask_token_id)[0, 1]
+    >>> token_logits = model(input)[0]
+    >>> mask_token_logits = token_logits[0, mask_token_index, :]
 
-    token_logits = model(input)[0]
-    mask_token_logits = token_logits[0, mask_token_index, :]
+    >>> top_5_tokens = tf.math.top_k(mask_token_logits, 5).indices.numpy()
 
-    top_5_tokens = tf.math.top_k(mask_token_logits, 5).indices.numpy()
-
-    for token in top_5_tokens:
-        print(sequence.replace(tokenizer.mask_token, tokenizer.decode([token])))
 
 This prints five sequences, with the top 5 tokens predicted by the model:
 
-::
+.. code-block::
 
+    >>> for token in top_5_tokens:
+    ...     print(sequence.replace(tokenizer.mask_token, tokenizer.decode([token])))
     Distilled models are smaller than the models they mimic. Using them instead of the large versions would help reduce our carbon footprint.
     Distilled models are smaller than the models they mimic. Using them instead of the large versions would help increase our carbon footprint.
     Distilled models are smaller than the models they mimic. Using them instead of the large versions would help decrease our carbon footprint.
@@ -408,65 +421,63 @@ Usually, the next token is predicted by sampling from the logits of the last hid
 
 Here is an example using the tokenizer and model and leveraging the :func:`~transformers.PreTrainedModel.top_k_top_p_filtering` method to sample the next token following an input sequence of tokens.
 
-::
+.. code-block::
 
-    ## PYTORCH CODE
-    from transformers import AutoModelWithLMHead, AutoTokenizer, top_k_top_p_filtering
-    import torch
-    from torch.nn import functional as F
+    >>> ## PYTORCH CODE
+    >>> from transformers import AutoModelWithLMHead, AutoTokenizer, top_k_top_p_filtering
+    >>> import torch
+    >>> from torch.nn import functional as F
 
+    >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
+    >>> model = AutoModelWithLMHead.from_pretrained("gpt2")
 
-    tokenizer = AutoTokenizer.from_pretrained("gpt2")
-    model = AutoModelWithLMHead.from_pretrained("gpt2")
+    >>> sequence = f"Hugging Face is based in DUMBO, New York City, and "
 
-    sequence = f"Hugging Face is based in DUMBO, New York City, and "
+    >>> input_ids = tokenizer.encode(sequence, return_tensors="pt")
 
-    input_ids = tokenizer.encode(sequence, return_tensors="pt")
+    >>> # get logits of last hidden state
+    >>> next_token_logits = model(input_ids)[0][:, -1, :]
 
-    # get logits of last hidden state
-    next_token_logits = model(input_ids)[0][:, -1, :]
+    >>> # filter
+    >>> filtered_next_token_logits = top_k_top_p_filtering(next_token_logits, top_k=50, top_p=1.0)
 
-    # filter
-    filtered_next_token_logits = top_k_top_p_filtering(next_token_logits, top_k=50, top_p=1.0)
+    >>> # sample
+    >>> probs = F.softmax(filtered_next_token_logits, dim=-1)
+    >>> next_token = torch.multinomial(probs, num_samples=1)
 
-    # sample
-    probs = F.softmax(filtered_next_token_logits, dim=-1)
-    next_token = torch.multinomial(probs, num_samples=1)
+    >>> generated = torch.cat([input_ids, next_token], dim=-1)
 
-    generated = torch.cat([input_ids, next_token], dim=-1)
+    >>> resulting_string = tokenizer.decode(generated.tolist()[0])
+    >>> ## TENSORFLOW CODE
+    >>> from transformers import TFAutoModelWithLMHead, AutoTokenizer, tf_top_k_top_p_filtering
+    >>> import tensorflow as tf
 
-    resulting_string = tokenizer.decode(generated.tolist()[0])
-    print(resulting_string)
-    ## TENSORFLOW CODE
-    from transformers import TFAutoModelWithLMHead, AutoTokenizer, tf_top_k_top_p_filtering
-    import tensorflow as tf
+    >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
+    >>> model = TFAutoModelWithLMHead.from_pretrained("gpt2")
 
-    tokenizer = AutoTokenizer.from_pretrained("gpt2")
-    model = TFAutoModelWithLMHead.from_pretrained("gpt2")
+    >>> sequence = f"Hugging Face is based in DUMBO, New York City, and "
 
-    sequence = f"Hugging Face is based in DUMBO, New York City, and "
+    >>> input_ids = tokenizer.encode(sequence, return_tensors="tf")
 
-    input_ids = tokenizer.encode(sequence, return_tensors="tf")
+    >>> # get logits of last hidden state
+    >>> next_token_logits = model(input_ids)[0][:, -1, :]
 
-    # get logits of last hidden state
-    next_token_logits = model(input_ids)[0][:, -1, :]
+    >>> # filter
+    >>> filtered_next_token_logits = tf_top_k_top_p_filtering(next_token_logits, top_k=50, top_p=1.0)
 
-    # filter
-    filtered_next_token_logits = tf_top_k_top_p_filtering(next_token_logits, top_k=50, top_p=1.0)
+    >>> # sample
+    >>> next_token = tf.random.categorical(filtered_next_token_logits, dtype=tf.int32, num_samples=1)
 
-    # sample
-    next_token = tf.random.categorical(filtered_next_token_logits, dtype=tf.int32, num_samples=1)
+    >>> generated = tf.concat([input_ids, next_token], axis=1)
 
-    generated = tf.concat([input_ids, next_token], axis=1)
-
-    resulting_string = tokenizer.decode(generated.numpy().tolist()[0])
-    print(resulting_string)
+    >>> resulting_string = tokenizer.decode(generated.numpy().tolist()[0])
 
 
 This outputs a (hopefully) coherent next token following the original sequence, which in our case is the word *has*:
 
-::
+.. code-block::
 
+    >>> print(resulting_string)
     Hugging Face is based in DUMBO, New York City, and has
 
 In the next section, we show how this functionality is leveraged in :func:`~transformers.PreTrainedModel.generate` to generate multiple tokens up to a user-defined length.
@@ -476,12 +487,14 @@ Text Generation
 
 In text generation (*a.k.a* *open-ended text generation*) the goal is to create a coherent portion of text that is a continuation from the given context. As an example, it is shown how *GPT-2* can be used in pipelines to generate text. By default, all models apply *Top-K* sampling when used in pipelines as configured in their respective configurations (see `gpt-2 config <https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json>`_ for example).
 
-::
+.. code-block::
 
-    from transformers import pipeline
+    >>> from transformers import pipeline
+
+    >>> text_generator = pipeline("text-generation")
+    >>> print(text_generator("As far as I am concerned, I will", max_length=50, do_sample=False))
+    [{'generated_text': 'As far as I am concerned, I will be the first to admit that I am not a fan of the idea of a "free market." I think that the idea of a free market is a bit of a stretch. I think that the idea'}]
 
-    text_generator = pipeline("text-generation")
-    print(text_generator("As far as I am concerned, I will", max_length=50))
 
 
 Here the model generates a random text with a total maximal length of *50* tokens from context *"As far as I am concerned, I will"*.
@@ -489,58 +502,59 @@ The default arguments of ``PreTrainedModel.generate()`` can directly be override
 
 Here is an example for text generation using XLNet and its tokenizer.
 
-::
+.. code-block::
 
-    ## PYTORCH CODE
-    from transformers import AutoModelWithLMHead, AutoTokenizer
+    >>> ## PYTORCH CODE
+    >>> from transformers import AutoModelWithLMHead, AutoTokenizer
 
-    model = AutoModelWithLMHead.from_pretrained("xlnet-base-cased")
-    tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")
+    >>> model = AutoModelWithLMHead.from_pretrained("xlnet-base-cased")
+    >>> tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")
 
-    # Padding text helps XLNet with short prompts - proposed by Aman Rusia in https://github.com/rusiaaman/XLNet-gen#methodology
-    PADDING_TEXT = """In 1991, the remains of Russian Tsar Nicholas II and his family
-    (except for Alexei and Maria) are discovered.
-    The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the
-    remainder of the story. 1883 Western Siberia,
-    a young Grigori Rasputin is asked by his father and a group of men to perform magic.
-    Rasputin has a vision and denounces one of the men as a horse thief. Although his
-    father initially slaps him for making such an accusation, Rasputin watches as the
-    man is chased outside and beaten. Twenty years later, Rasputin sees a vision of
-    the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous,
-    with people, even a bishop, begging for his blessing. <eod> </s> <eos>""" 
+    >>> # Padding text helps XLNet with short prompts - proposed by Aman Rusia in https://github.com/rusiaaman/XLNet-gen#methodology
+    >>> PADDING_TEXT = """In 1991, the remains of Russian Tsar Nicholas II and his family
+    ... (except for Alexei and Maria) are discovered.
+    ... The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the
+    ... remainder of the story. 1883 Western Siberia,
+    ... a young Grigori Rasputin is asked by his father and a group of men to perform magic.
+    ... Rasputin has a vision and denounces one of the men as a horse thief. Although his
+    ... father initially slaps him for making such an accusation, Rasputin watches as the
+    ... man is chased outside and beaten. Twenty years later, Rasputin sees a vision of
+    ... the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous,
+    ... with people, even a bishop, begging for his blessing. <eod> </s> <eos>"""
 
-    prompt = "Today the weather is really nice and I am planning on "
-    inputs = tokenizer.encode(PADDING_TEXT + prompt, add_special_tokens=False, return_tensors="pt")
-    
-    prompt_length = len(tokenizer.decode(inputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True))
-    outputs = model.generate(inputs, max_length=250, do_sample=True, top_p=0.95, top_k=60)
-    generated = prompt + tokenizer.decode(outputs[0])[prompt_length:]
+    >>> prompt = "Today the weather is really nice and I am planning on "
+    >>> inputs = tokenizer.encode(PADDING_TEXT + prompt, add_special_tokens=False, return_tensors="pt")
 
-    print(generated)
-    ## TENSORFLOW CODE
-    from transformers import TFAutoModelWithLMHead, AutoTokenizer
+    >>> prompt_length = len(tokenizer.decode(inputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True))
+    >>> outputs = model.generate(inputs, max_length=250, do_sample=True, top_p=0.95, top_k=60)
+    >>> generated = prompt + tokenizer.decode(outputs[0])[prompt_length:]
 
-    model = TFAutoModelWithLMHead.from_pretrained("xlnet-base-cased")
-    tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")
+    >>> ## TENSORFLOW CODE
+    >>> from transformers import TFAutoModelWithLMHead, AutoTokenizer
 
-    # Padding text helps XLNet with short prompts - proposed by Aman Rusia in https://github.com/rusiaaman/XLNet-gen#methodology
-    PADDING_TEXT = """In 1991, the remains of Russian Tsar Nicholas II and his family
-    (except for Alexei and Maria) are discovered.
-    The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the
-    remainder of the story. 1883 Western Siberia,
-    a young Grigori Rasputin is asked by his father and a group of men to perform magic.
-    Rasputin has a vision and denounces one of the men as a horse thief. Although his
-    father initially slaps him for making such an accusation, Rasputin watches as the
-    man is chased outside and beaten. Twenty years later, Rasputin sees a vision of
-    the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous,
-    with people, even a bishop, begging for his blessing. <eod> </s> <eos>""" 
+    >>> model = TFAutoModelWithLMHead.from_pretrained("xlnet-base-cased")
+    >>> tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")
 
-    prompt = "Today the weather is really nice and I am planning on "
-    inputs = tokenizer.encode(PADDING_TEXT + prompt, add_special_tokens=False, return_tensors="tf")
+    >>> # Padding text helps XLNet with short prompts - proposed by Aman Rusia in https://github.com/rusiaaman/XLNet-gen#methodology
+    >>> PADDING_TEXT = """In 1991, the remains of Russian Tsar Nicholas II and his family
+    ... (except for Alexei and Maria) are discovered.
+    ... The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the
+    ... remainder of the story. 1883 Western Siberia,
+    ... a young Grigori Rasputin is asked by his father and a group of men to perform magic.
+    ... Rasputin has a vision and denounces one of the men as a horse thief. Although his
+    ... father initially slaps him for making such an accusation, Rasputin watches as the
+    ... man is chased outside and beaten. Twenty years later, Rasputin sees a vision of
+    ... the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous,
+    ... with people, even a bishop, begging for his blessing. <eod> </s> <eos>"""
 
-    prompt_length = len(tokenizer.decode(inputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True))
-    outputs = model.generate(inputs, max_length=250, do_sample=True, top_p=0.95, top_k=60)
-    generated = prompt + tokenizer.decode(outputs[0])[prompt_length:]
+    >>> prompt = "Today the weather is really nice and I am planning on "
+    >>> inputs = tokenizer.encode(PADDING_TEXT + prompt, add_special_tokens=False, return_tensors="tf")
+
+    >>> prompt_length = len(tokenizer.decode(inputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True))
+    >>> outputs = model.generate(inputs, max_length=250, do_sample=True, top_p=0.95, top_k=60)
+    >>> generated = prompt + tokenizer.decode(outputs[0])[prompt_length:]
+
+.. code-block::
 
     print(generated)
 
@@ -575,21 +589,22 @@ of 9 classes:
 It leverages a fine-tuned model on CoNLL-2003, fine-tuned by `@stefan-it <https://github.com/stefan-it>`__ from
 `dbmdz <https://github.com/dbmdz>`__.
 
-::
+.. code-block::
 
-    from transformers import pipeline
+    >>> from transformers import pipeline
 
-    nlp = pipeline("ner")
+    >>> nlp = pipeline("ner")
 
-    sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very" \
-               "close to the Manhattan Bridge which is visible from the window."
+    >>> sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very" \
+    ...            "close to the Manhattan Bridge which is visible from the window."
 
-    print(nlp(sequence))
 
 This outputs a list of all words that have been identified as an entity from the 9 classes defined above. Here are the
 expected results:
 
-::
+.. code-block::
+
+    print(nlp(sequence))
 
     [
         {'word': 'Hu', 'score': 0.9995632767677307, 'entity': 'I-ORG'},
@@ -623,75 +638,73 @@ Here is an example doing named entity recognition using a model and a tokenizer.
   for each token.
 - Zip together each token with its prediction and print it.
 
-::
+.. code-block::
 
-    ## PYTORCH CODE
-    from transformers import AutoModelForTokenClassification, AutoTokenizer
-    import torch
+    >>> ## PYTORCH CODE
+    >>> from transformers import AutoModelForTokenClassification, AutoTokenizer
+    >>> import torch
 
-    model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
-    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+    >>> model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
+    >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
 
-    label_list = [
-        "O",       # Outside of a named entity
-        "B-MISC",  # Beginning of a miscellaneous entity right after another miscellaneous entity
-        "I-MISC",  # Miscellaneous entity
-        "B-PER",   # Beginning of a person's name right after another person's name
-        "I-PER",   # Person's name
-        "B-ORG",   # Beginning of an organisation right after another organisation
-        "I-ORG",   # Organisation
-        "B-LOC",   # Beginning of a location right after another location
-        "I-LOC"    # Location
-    ]
+    >>> label_list = [
+    ...     "O",       # Outside of a named entity
+    ...     "B-MISC",  # Beginning of a miscellaneous entity right after another miscellaneous entity
+    ...     "I-MISC",  # Miscellaneous entity
+    ...     "B-PER",   # Beginning of a person's name right after another person's name
+    ...     "I-PER",   # Person's name
+    ...     "B-ORG",   # Beginning of an organisation right after another organisation
+    ...     "I-ORG",   # Organisation
+    ...     "B-LOC",   # Beginning of a location right after another location
+    ...     "I-LOC"    # Location
+    ... ]
 
-    sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very" \
-               "close to the Manhattan Bridge."
+    >>> sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very" \
+    ...            "close to the Manhattan Bridge."
 
-    # Bit of a hack to get the tokens with the special tokens
-    tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(sequence)))
-    inputs = tokenizer.encode(sequence, return_tensors="pt")
+    >>> # Bit of a hack to get the tokens with the special tokens
+    >>> tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(sequence)))
+    >>> inputs = tokenizer.encode(sequence, return_tensors="pt")
 
-    outputs = model(inputs)[0]
-    predictions = torch.argmax(outputs, dim=2)
+    >>> outputs = model(inputs)[0]
+    >>> predictions = torch.argmax(outputs, dim=2)
+    >>> ## TENSORFLOW CODE
+    >>> from transformers import TFAutoModelForTokenClassification, AutoTokenizer
+    >>> import tensorflow as tf
 
-    print([(token, label_list[prediction]) for token, prediction in zip(tokens, predictions[0].tolist())])
-    ## TENSORFLOW CODE
-    from transformers import TFAutoModelForTokenClassification, AutoTokenizer
-    import tensorflow as tf
+    >>> model = TFAutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
+    >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
 
-    model = TFAutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
-    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+    >>> label_list = [
+    ...     "O",       # Outside of a named entity
+    ...     "B-MISC",  # Beginning of a miscellaneous entity right after another miscellaneous entity
+    ...     "I-MISC",  # Miscellaneous entity
+    ...     "B-PER",   # Beginning of a person's name right after another person's name
+    ...     "I-PER",   # Person's name
+    ...     "B-ORG",   # Beginning of an organisation right after another organisation
+    ...     "I-ORG",   # Organisation
+    ...     "B-LOC",   # Beginning of a location right after another location
+    ...     "I-LOC"    # Location
+    ... ]
 
-    label_list = [
-        "O",       # Outside of a named entity
-        "B-MISC",  # Beginning of a miscellaneous entity right after another miscellaneous entity
-        "I-MISC",  # Miscellaneous entity
-        "B-PER",   # Beginning of a person's name right after another person's name
-        "I-PER",   # Person's name
-        "B-ORG",   # Beginning of an organisation right after another organisation
-        "I-ORG",   # Organisation
-        "B-LOC",   # Beginning of a location right after another location
-        "I-LOC"    # Location
-    ]
+    >>> sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very" \
+    ...            "close to the Manhattan Bridge."
 
-    sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very" \
-               "close to the Manhattan Bridge."
+    >>> # Bit of a hack to get the tokens with the special tokens
+    >>> tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(sequence)))
+    >>> inputs = tokenizer.encode(sequence, return_tensors="tf")
 
-    # Bit of a hack to get the tokens with the special tokens
-    tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(sequence)))
-    inputs = tokenizer.encode(sequence, return_tensors="tf")
+    >>> outputs = model(inputs)[0]
+    >>> predictions = tf.argmax(outputs, axis=2)
 
-    outputs = model(inputs)[0]
-    predictions = tf.argmax(outputs, axis=2)
-
-    print([(token, label_list[prediction]) for token, prediction in zip(tokens, predictions[0].numpy())])
 
 This outputs a list of each token mapped to their prediction. Differently from the pipeline, here every token has
 a prediction as we didn't remove the "0" class which means that no particular entity was found on that token. The
 following array should be the output:
 
-::
+.. code-block::
 
+    >>> print([(token, label_list[prediction]) for token, prediction in zip(tokens, predictions[0].numpy())])
     [('[CLS]', 'O'), ('Hu', 'I-ORG'), ('##gging', 'I-ORG'), ('Face', 'I-ORG'), ('Inc', 'I-ORG'), ('.', 'O'), ('is', 'O'), ('a', 'O'), ('company', 'O'), ('based', 'O'), ('in', 'O'), ('New', 'I-LOC'), ('York', 'I-LOC'), ('City', 'I-LOC'), ('.', 'O'), ('Its', 'O'), ('headquarters', 'O'), ('are', 'O'), ('in', 'O'), ('D', 'I-LOC'), ('##UM', 'I-LOC'), ('##BO', 'I-LOC'), (',', 'O'), ('therefore', 'O'), ('very', 'O'), ('##c', 'O'), ('##lose', 'O'), ('to', 'O'), ('the', 'O'), ('Manhattan', 'I-LOC'), ('Bridge', 'I-LOC'), ('.', 'O'), ('[SEP]', 'O')]
 
 Summarization
@@ -705,41 +718,40 @@ If you would like to fine-tune a model on a summarization task, you may leverage
 Here is an example using the pipelines do to summarization. 
 It leverages a Bart model that was fine-tuned on the CNN / Daily Mail data set.
 
-::
+.. code-block::
 
-    from transformers import pipeline
+    >>> from transformers import pipeline
 
-    summarizer = pipeline("summarization")
+    >>> summarizer = pipeline("summarization")
 
-    ARTICLE = """ New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York. 
-    A year later, she got married again in Westchester County, but to a different man and without divorcing her first husband. 
-    Only 18 days after that marriage, she got hitched yet again. Then, Barrientos declared "I do" five more times, sometimes only within two weeks of each other. 
-    In 2010, she married once more, this time in the Bronx. In an application for a marriage license, she stated it was her "first and only" marriage. 
-    Barrientos, now 39, is facing two criminal counts of "offering a false instrument for filing in the first degree," referring to her false statements on the 
-    2010 marriage license application, according to court documents. 
-    Prosecutors said the marriages were part of an immigration scam. 
-    On Friday, she pleaded not guilty at State Supreme Court in the Bronx, according to her attorney, Christopher Wright, who declined to comment further. 
-    After leaving court, Barrientos was arrested and charged with theft of service and criminal trespass for allegedly sneaking into the New York subway through an emergency exit, said Detective 
-    Annette Markowski, a police spokeswoman. In total, Barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002. 
-    All occurred either in Westchester County, Long Island, New Jersey or the Bronx. She is believed to still be married to four men, and at one time, she was married to eight men at once, prosecutors say. 
-    Prosecutors said the immigration scam involved some of her husbands, who filed for permanent residence status shortly after the marriages. 
-    Any divorces happened only after such filings were approved. It was unclear whether any of the men will be prosecuted. 
-    The case was referred to the Bronx District Attorney\'s Office by Immigration and Customs Enforcement and the Department of Homeland Security\'s 
-    Investigation Division. Seven of the men are from so-called "red-flagged" countries, including Egypt, Turkey, Georgia, Pakistan and Mali. 
-    Her eighth husband, Rashid Rajput, was deported in 2006 to his native Pakistan after an investigation by the Joint Terrorism Task Force. 
-    If convicted, Barrientos faces up to four years in prison.  Her next court appearance is scheduled for May 18.
-    """
-    
-    print(summarizer(ARTICLE, max_length=130, min_length=30))
+    >>> ARTICLE = """ New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York.
+    ... A year later, she got married again in Westchester County, but to a different man and without divorcing her first husband.
+    ... Only 18 days after that marriage, she got hitched yet again. Then, Barrientos declared "I do" five more times, sometimes only within two weeks of each other.
+    ... In 2010, she married once more, this time in the Bronx. In an application for a marriage license, she stated it was her "first and only" marriage.
+    ... Barrientos, now 39, is facing two criminal counts of "offering a false instrument for filing in the first degree," referring to her false statements on the
+    ... 2010 marriage license application, according to court documents.
+    ... Prosecutors said the marriages were part of an immigration scam.
+    ... On Friday, she pleaded not guilty at State Supreme Court in the Bronx, according to her attorney, Christopher Wright, who declined to comment further.
+    ... After leaving court, Barrientos was arrested and charged with theft of service and criminal trespass for allegedly sneaking into the New York subway through an emergency exit, said Detective
+    ... Annette Markowski, a police spokeswoman. In total, Barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002.
+    ... All occurred either in Westchester County, Long Island, New Jersey or the Bronx. She is believed to still be married to four men, and at one time, she was married to eight men at once, prosecutors say.
+    ... Prosecutors said the immigration scam involved some of her husbands, who filed for permanent residence status shortly after the marriages.
+    ... Any divorces happened only after such filings were approved. It was unclear whether any of the men will be prosecuted.
+    ... The case was referred to the Bronx District Attorney\'s Office by Immigration and Customs Enforcement and the Department of Homeland Security\'s
+    ... Investigation Division. Seven of the men are from so-called "red-flagged" countries, including Egypt, Turkey, Georgia, Pakistan and Mali.
+    ... Her eighth husband, Rashid Rajput, was deported in 2006 to his native Pakistan after an investigation by the Joint Terrorism Task Force.
+    ... If convicted, Barrientos faces up to four years in prison.  Her next court appearance is scheduled for May 18.
+    ... """
 
 Because the summarization pipeline depends on the ``PretrainedModel.generate()`` method, we can override the default arguments 
 of ``PretrainedModel.generate()`` directly in the pipeline as is shown for ``max_length`` and ``min_length`` above.
 This outputs the following summary:
 
-::
+.. code-block::
+
+    >>> print(summarizer(ARTICLE, max_length=130, min_length=30, do_sample=False))
+    [{'summary_text': 'Liana Barrientos, 39, is charged with two counts of "offering a false instrument for filing in the first degree" In total, she has been married 10 times, with nine of her marriages occurring between 1999 and 2002. She is believed to still be married to four men.'}]
 
-  Liana Barrientos has been married 10 times, sometimes within two weeks of each other. Prosecutors say the marriages were part of an immigration scam. She pleaded not guilty at State Supreme Court in the Bronx on Friday.
-  
 Here is an example doing summarization using a model and a tokenizer. The process is the following:
 
 - Instantiate a tokenizer and a model from the checkpoint name. Summarization is usually done using an encoder-decoder model, such as ``Bart`` or ``T5``.
@@ -748,29 +760,26 @@ Here is an example doing summarization using a model and a tokenizer. The proces
 - Add the T5 specific prefix "summarize: ".
 
 Here Google`s T5 model is used that was only pre-trained on a multi-task mixed data set (including CNN / Daily Mail), but nevertheless yields very good results.
-::
+.. code-block::
 
-    ## PYTORCH CODE
-    from transformers import AutoModelWithLMHead, AutoTokenizer
+    >>> ## PYTORCH CODE
+    >>> from transformers import AutoModelWithLMHead, AutoTokenizer
 
-    model = AutoModelWithLMHead.from_pretrained("t5-base")
-    tokenizer = AutoTokenizer.from_pretrained("t5-base")
+    >>> model = AutoModelWithLMHead.from_pretrained("t5-base")
+    >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")
 
-    # T5 uses a max_length of 512 so we cut the article to 512 tokens.
-    inputs = tokenizer.encode("summarize: " + ARTICLE, return_tensors="pt", max_length=512)
-    outputs = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
-    print(outputs)
-    
-    ## TENSORFLOW CODE
-    from transformers import TFAutoModelWithLMHead, AutoTokenizer
+    >>> # T5 uses a max_length of 512 so we cut the article to 512 tokens.
+    >>> inputs = tokenizer.encode("summarize: " + ARTICLE, return_tensors="pt", max_length=512)
+    >>> outputs = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
+    >>> ## TENSORFLOW CODE
+    >>> from transformers import TFAutoModelWithLMHead, AutoTokenizer
 
-    model = TFAutoModelWithLMHead.from_pretrained("t5-base")
-    tokenizer = AutoTokenizer.from_pretrained("t5-base")
+    >>> model = TFAutoModelWithLMHead.from_pretrained("t5-base")
+    >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")
 
-    # T5 uses a max_length of 512 so we cut the article to 512 tokens.
-    inputs = tokenizer.encode("summarize: " + ARTICLE, return_tensors="tf", max_length=512)
-    outputs = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
-    print(outputs)
+    >>> # T5 uses a max_length of 512 so we cut the article to 512 tokens.
+    >>> inputs = tokenizer.encode("summarize: " + ARTICLE, return_tensors="tf", max_length=512)
+    >>> outputs = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
 
 Translation
 ----------------------------------------------------
@@ -784,12 +793,13 @@ Here is an example using the pipelines do to translation.
 It leverages a T5 model that was only pre-trained on a multi-task mixture dataset (including WMT), but yields impressive 
 translation results nevertheless.
 
-::
+.. code-block::
 
-    from transformers import pipeline
+    >>> from transformers import pipeline
 
-    translator = pipeline("translation_en_to_de")
-    print(translator("Hugging Face is a technology company based in New York and Paris", max_length=40))
+    >>> translator = pipeline("translation_en_to_de")
+    >>> print(translator("Hugging Face is a technology company based in New York and Paris", max_length=40))
+    [{'translation_text': 'Hugging Face ist ein Technologieunternehmen mit Sitz in New York und Paris.'}]
 
 Because the translation pipeline depends on the ``PretrainedModel.generate()`` method, we can override the default arguments 
 of ``PretrainedModel.generate()`` directly in the pipeline as is shown for ``max_length`` above.
@@ -806,26 +816,30 @@ Here is an example doing translation using a model and a tokenizer. The process
 - Leverage the ``PretrainedModel.generate()`` method.
 - Add the T5 specific prefix "translate English to German: "
 
-::
+.. code-block::
 
-    ## PYTORCH CODE
-    from transformers import AutoModelWithLMHead, AutoTokenizer
+    >>> ## PYTORCH CODE
+    >>> from transformers import AutoModelWithLMHead, AutoTokenizer
 
-    model = AutoModelWithLMHead.from_pretrained("t5-base")
-    tokenizer = AutoTokenizer.from_pretrained("t5-base")
+    >>> model = AutoModelWithLMHead.from_pretrained("t5-base")
+    >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")
 
-    inputs = tokenizer.encode("translate English to German: Hugging Face is a technology company based in New York and Paris", return_tensors="pt")
-    outputs = model.generate(inputs, max_length=40, num_beams=4, early_stopping=True)
+    >>> inputs = tokenizer.encode("translate English to German: Hugging Face is a technology company based in New York and Paris", return_tensors="pt")
+    >>> outputs = model.generate(inputs, max_length=40, num_beams=4, early_stopping=True)
 
-    print(outputs)
-    
-    ## TENSORFLOW CODE
-    from transformers import TFAutoModelWithLMHead, AutoTokenizer
+    >>> print(outputs)
+    tensor([[    0, 11560,  3896,  8881,   229,   236,     3, 14366, 15377,   181,
+             11216,    16,   368,  1060,    64,  1919,     5]])
+    >>> ## TENSORFLOW CODE
+    >>> from transformers import TFAutoModelWithLMHead, AutoTokenizer
 
-    model = TFAutoModelWithLMHead.from_pretrained("t5-base")
-    tokenizer = AutoTokenizer.from_pretrained("t5-base")
+    >>> model = TFAutoModelWithLMHead.from_pretrained("t5-base")
+    >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")
 
-    inputs = tokenizer.encode("translate English to German: Hugging Face is a technology company based in New York and Paris", return_tensors="tf")
-    outputs = model.generate(inputs, max_length=40, num_beams=4, early_stopping=True)
+    >>> inputs = tokenizer.encode("translate English to German: Hugging Face is a technology company based in New York and Paris", return_tensors="tf")
+    >>> outputs = model.generate(inputs, max_length=40, num_beams=4, early_stopping=True)
 
-    print(outputs)
+    >>> print(outputs)
+    tf.Tensor(
+    [[    0 11560  3896  8881   229   236     3 14366 15377   181 11216    16
+        368  1060    64  1919     5]], shape=(1, 17), dtype=int32)
diff --git a/setup.py b/setup.py
index 1ab51ee97e..1fa483c329 100644
--- a/setup.py
+++ b/setup.py
@@ -86,7 +86,7 @@ extras["all"] = extras["serving"] + ["tensorflow", "torch"]
 
 extras["testing"] = ["pytest", "pytest-xdist", "timeout-decorator", "psutil"]
 # sphinx-rtd-theme==0.5.0 introduced big changes in the style.
-extras["docs"] = ["recommonmark", "sphinx", "sphinx-markdown-tables", "sphinx-rtd-theme==0.4.3"]
+extras["docs"] = ["recommonmark", "sphinx", "sphinx-markdown-tables", "sphinx-rtd-theme==0.4.3", "sphinx-copybutton"]
 extras["quality"] = [
     "black",
     "isort @ git+git://github.com/timothycrosley/isort.git@e63ae06ec7d70b06df9e528357650281a3d3ec22#egg=isort",
diff --git a/src/transformers/configuration_albert.py b/src/transformers/configuration_albert.py
index 899d5da388..5dc43019f8 100644
--- a/src/transformers/configuration_albert.py
+++ b/src/transformers/configuration_albert.py
@@ -81,22 +81,22 @@ class AlbertConfig(PretrainedConfig):
 
         Example::
 
-            from transformers import AlbertConfig, AlbertModel
-            # Initializing an ALBERT-xxlarge style configuration
-            albert_xxlarge_configuration = AlbertConfig()
+            >>> from transformers import AlbertConfig, AlbertModel
+            >>> # Initializing an ALBERT-xxlarge style configuration
+            >>> albert_xxlarge_configuration = AlbertConfig()
 
-            # Initializing an ALBERT-base style configuration
-            albert_base_configuration = AlbertConfig(
-                hidden_size=768,
-                num_attention_heads=12,
-                intermediate_size=3072,
-            )
+            >>> # Initializing an ALBERT-base style configuration
+            >>> albert_base_configuration = AlbertConfig(
+            ...     hidden_size=768,
+            ...     num_attention_heads=12,
+            ...     intermediate_size=3072,
+            ... )
 
-            # Initializing a model from the ALBERT-base style configuration
-            model = AlbertModel(albert_xxlarge_configuration)
+            >>> # Initializing a model from the ALBERT-base style configuration
+            >>> model = AlbertModel(albert_base_configuration)
 
-            # Accessing the model configuration
-            configuration = model.config
+            >>> # Accessing the model configuration
+            >>> configuration = model.config
     """
 
     model_type = "albert"
diff --git a/src/transformers/configuration_bart.py b/src/transformers/configuration_bart.py
index be87589923..398ff7c4ed 100644
--- a/src/transformers/configuration_bart.py
+++ b/src/transformers/configuration_bart.py
@@ -73,9 +73,13 @@ class BartConfig(PretrainedConfig):
     ):
         r"""
             :class:`~transformers.BartConfig` is the configuration class for `BartModel`.
-            Examples:
-                config = BartConfig.from_pretrained('bart-large')
-                model = BartModel(config)
+
+            Examples::
+
+                >>> from transformers import BartConfig, BartModel
+
+                >>> config = BartConfig.from_pretrained('facebook/bart-large')
+                >>> model = BartModel(config)
         """
         if "hidden_size" in common_kwargs:
             raise ValueError("hidden size is called d_model")
diff --git a/src/transformers/configuration_bert.py b/src/transformers/configuration_bert.py
index b1beceb215..8e815837bc 100644
--- a/src/transformers/configuration_bert.py
+++ b/src/transformers/configuration_bert.py
@@ -95,16 +95,16 @@ class BertConfig(PretrainedConfig):
 
         Example::
 
-            from transformers import BertModel, BertConfig
+            >>> from transformers import BertModel, BertConfig
 
-            # Initializing a BERT bert-base-uncased style configuration
-            configuration = BertConfig()
+            >>> # Initializing a BERT bert-base-uncased style configuration
+            >>> configuration = BertConfig()
 
-            # Initializing a model from the bert-base-uncased style configuration
-            model = BertModel(configuration)
+            >>> # Initializing a model from the bert-base-uncased style configuration
+            >>> model = BertModel(configuration)
 
-            # Accessing the model configuration
-            configuration = model.config
+            >>> # Accessing the model configuration
+            >>> configuration = model.config
     """
     model_type = "bert"
 
diff --git a/src/transformers/configuration_ctrl.py b/src/transformers/configuration_ctrl.py
index 54f5931fd2..1059525691 100644
--- a/src/transformers/configuration_ctrl.py
+++ b/src/transformers/configuration_ctrl.py
@@ -66,16 +66,16 @@ class CTRLConfig(PretrainedConfig):
 
         Example::
 
-            from transformers import CTRLModel, CTRLConfig
+            >>> from transformers import CTRLModel, CTRLConfig
 
-            # Initializing a CTRL configuration
-            configuration = CTRLConfig()
+            >>> # Initializing a CTRL configuration
+            >>> configuration = CTRLConfig()
 
-            # Initializing a model from the configuration
-            model = CTRLModel(configuration)
+            >>> # Initializing a model from the configuration
+            >>> model = CTRLModel(configuration)
 
-            # Accessing the model configuration
-            configuration = model.config
+            >>> # Accessing the model configuration
+            >>> configuration = model.config
     """
 
     model_type = "ctrl"
diff --git a/src/transformers/configuration_distilbert.py b/src/transformers/configuration_distilbert.py
index 3f74e4f2c1..9bd9baf228 100644
--- a/src/transformers/configuration_distilbert.py
+++ b/src/transformers/configuration_distilbert.py
@@ -80,16 +80,16 @@ class DistilBertConfig(PretrainedConfig):
 
         Example::
 
-            from transformers import DistilBertModel, DistilBertConfig
+            >>> from transformers import DistilBertModel, DistilBertConfig
 
-            # Initializing a DistilBERT configuration
-            configuration = DistilBertConfig()
+            >>> # Initializing a DistilBERT configuration
+            >>> configuration = DistilBertConfig()
 
-            # Initializing a model from the configuration
-            model = DistilBertModel(configuration)
+            >>> # Initializing a model from the configuration
+            >>> model = DistilBertModel(configuration)
 
-            # Accessing the model configuration
-            configuration = model.config
+            >>> # Accessing the model configuration
+            >>> configuration = model.config
     """
     model_type = "distilbert"
 
diff --git a/src/transformers/configuration_electra.py b/src/transformers/configuration_electra.py
index 9aafb85958..be8c7d6d8b 100644
--- a/src/transformers/configuration_electra.py
+++ b/src/transformers/configuration_electra.py
@@ -101,16 +101,16 @@ class ElectraConfig(PretrainedConfig):
 
         Example::
 
-            from transformers import ElectraModel, ElectraConfig
+            >>> from transformers import ElectraModel, ElectraConfig
 
-            # Initializing a ELECTRA electra-base-uncased style configuration
-            configuration = ElectraConfig()
+            >>> # Initializing a ELECTRA electra-base-uncased style configuration
+            >>> configuration = ElectraConfig()
 
-            # Initializing a model from the electra-base-uncased style configuration
-            model = ElectraModel(configuration)
+            >>> # Initializing a model from the electra-base-uncased style configuration
+            >>> model = ElectraModel(configuration)
 
-            # Accessing the model configuration
-            configuration = model.config
+            >>> # Accessing the model configuration
+            >>> configuration = model.config
     """
     model_type = "electra"
 
diff --git a/src/transformers/configuration_encoder_decoder.py b/src/transformers/configuration_encoder_decoder.py
index 08956f3ade..261fa505e9 100644
--- a/src/transformers/configuration_encoder_decoder.py
+++ b/src/transformers/configuration_encoder_decoder.py
@@ -42,20 +42,20 @@ class EncoderDecoderConfig(PretrainedConfig):
 
         Example::
 
-            from transformers import BertConfig, EncoderDecoderConfig, EncoderDecoderModel
+            >>> from transformers import BertConfig, EncoderDecoderConfig, EncoderDecoderModel
 
-            # Initializing a BERT bert-base-uncased style configuration
-            config_encoder = BertConfig()
-            config_decoder = BertConfig()
+            >>> # Initializing a BERT bert-base-uncased style configuration
+            >>> config_encoder = BertConfig()
+            >>> config_decoder = BertConfig()
 
-            config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
+            >>> config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
 
-            # Initializing a Bert2Bert model from the bert-base-uncased style configurations
-            model = EncoderDecoderModel(config=config)
+            >>> # Initializing a Bert2Bert model from the bert-base-uncased style configurations
+            >>> model = EncoderDecoderModel(config=config)
 
-            # Accessing the model configuration
-            config_encoder = model.config.encoder
-            config_decoder  = model.config.decoder
+            >>> # Accessing the model configuration
+            >>> config_encoder = model.config.encoder
+            >>> config_decoder = model.config.decoder
     """
     model_type = "encoder_decoder"
 
diff --git a/src/transformers/configuration_gpt2.py b/src/transformers/configuration_gpt2.py
index 0d282637dd..814846cbde 100644
--- a/src/transformers/configuration_gpt2.py
+++ b/src/transformers/configuration_gpt2.py
@@ -100,16 +100,16 @@ class GPT2Config(PretrainedConfig):
 
         Example::
 
-            from transformers import GPT2Model, GPT2Config
+            >>> from transformers import GPT2Model, GPT2Config
 
-            # Initializing a GPT2 configuration
-            configuration = GPT2Config()
+            >>> # Initializing a GPT2 configuration
+            >>> configuration = GPT2Config()
 
-            # Initializing a model from the configuration
-            model = GPT2Model(configuration)
+            >>> # Initializing a model from the configuration
+            >>> model = GPT2Model(configuration)
 
-            # Accessing the model configuration
-            configuration = model.config
+            >>> # Accessing the model configuration
+            >>> configuration = model.config
     """
 
     model_type = "gpt2"
diff --git a/src/transformers/configuration_longformer.py b/src/transformers/configuration_longformer.py
index e8daf49e11..a9081cf7a9 100644
--- a/src/transformers/configuration_longformer.py
+++ b/src/transformers/configuration_longformer.py
@@ -49,16 +49,16 @@ class LongformerConfig(RobertaConfig):
 
         Example::
 
-            from transformers import LongformerConfig, LongformerModel
+            >>> from transformers import LongformerConfig, LongformerModel
 
-            # Initializing a Longformer configuration
-            configuration = LongformerConfig()
+            >>> # Initializing a Longformer configuration
+            >>> configuration = LongformerConfig()
 
-            # Initializing a model from the configuration
-            model = LongformerModel(configuration)
+            >>> # Initializing a model from the configuration
+            >>> model = LongformerModel(configuration)
 
-            # Accessing the model configuration
-            configuration = model.config
+            >>> # Accessing the model configuration
+            >>> configuration = model.config
     """
     model_type = "longformer"
 
diff --git a/src/transformers/configuration_mobilebert.py b/src/transformers/configuration_mobilebert.py
index cfb16baf2c..d70c2f3e7b 100644
--- a/src/transformers/configuration_mobilebert.py
+++ b/src/transformers/configuration_mobilebert.py
@@ -85,16 +85,16 @@ class MobileBertConfig(PretrainedConfig):
 
         Example:
 
-            from transformers import MobileBertModel, MobileBertConfig
+            >>> from transformers import MobileBertModel, MobileBertConfig
 
-            # Initializing a MobileBERT configuration
-            configuration = MobileBertConfig()
+            >>> # Initializing a MobileBERT configuration
+            >>> configuration = MobileBertConfig()
 
-            # Initializing a model from the configuration above
-            model = MobileBertModel(configuration)
+            >>> # Initializing a model from the configuration above
+            >>> model = MobileBertModel(configuration)
 
-            # Accessing the model configuration
-            configuration = model.config
+            >>> # Accessing the model configuration
+            >>> configuration = model.config
 
         Attributes:
             pretrained_config_archive_map (Dict[str, str]):
diff --git a/src/transformers/configuration_openai.py b/src/transformers/configuration_openai.py
index 39080196a1..deeed60ea7 100644
--- a/src/transformers/configuration_openai.py
+++ b/src/transformers/configuration_openai.py
@@ -98,16 +98,16 @@ class OpenAIGPTConfig(PretrainedConfig):
 
         Example::
 
-            from transformers import OpenAIGPTConfig, OpenAIGPTModel
+            >>> from transformers import OpenAIGPTConfig, OpenAIGPTModel
 
-            # Initializing a GPT configuration
-            configuration = OpenAIGPTConfig()
+            >>> # Initializing a GPT configuration
+            >>> configuration = OpenAIGPTConfig()
 
-            # Initializing a model from the configuration
-            model = OpenAIGPTModel(configuration)
+            >>> # Initializing a model from the configuration
+            >>> model = OpenAIGPTModel(configuration)
 
-            # Accessing the model configuration
-            configuration = model.config
+            >>> # Accessing the model configuration
+            >>> configuration = model.config
     """
 
     model_type = "openai-gpt"
diff --git a/src/transformers/configuration_reformer.py b/src/transformers/configuration_reformer.py
index 89daed5b69..55e12b02ab 100644
--- a/src/transformers/configuration_reformer.py
+++ b/src/transformers/configuration_reformer.py
@@ -125,16 +125,16 @@ class ReformerConfig(PretrainedConfig):
 
         Example::
 
-            from transformers import ReformerModel, ReformerConfig
+            >>> from transformers import ReformerModel, ReformerConfig
 
-            # Initializing a Reformer configuration
-            configuration = ReformerConfig()
+            >>> # Initializing a Reformer configuration
+            >>> configuration = ReformerConfig()
 
-            # Initializing a Reformer model
-            model = ReformerModel(configuration)
+            >>> # Initializing a Reformer model
+            >>> model = ReformerModel(configuration)
 
-            # Accessing the model configuration
-            configuration = model.config
+            >>> # Accessing the model configuration
+            >>> configuration = model.config
     """
     model_type = "reformer"
 
diff --git a/src/transformers/configuration_roberta.py b/src/transformers/configuration_roberta.py
index a0ac8dcc96..439888e269 100644
--- a/src/transformers/configuration_roberta.py
+++ b/src/transformers/configuration_roberta.py
@@ -49,16 +49,16 @@ class RobertaConfig(BertConfig):
 
         Example::
 
-            from transformers import RobertaConfig, RobertaModel
+            >>> from transformers import RobertaConfig, RobertaModel
 
-            # Initializing a RoBERTa configuration
-            configuration = RobertaConfig()
+            >>> # Initializing a RoBERTa configuration
+            >>> configuration = RobertaConfig()
 
-            # Initializing a model from the configuration
-            model = RobertaModel(configuration)
+            >>> # Initializing a model from the configuration
+            >>> model = RobertaModel(configuration)
 
-            # Accessing the model configuration
-            configuration = model.config
+            >>> # Accessing the model configuration
+            >>> configuration = model.config
     """
     model_type = "roberta"
 
diff --git a/src/transformers/configuration_transfo_xl.py b/src/transformers/configuration_transfo_xl.py
index 8e26616696..c3c6a22b82 100644
--- a/src/transformers/configuration_transfo_xl.py
+++ b/src/transformers/configuration_transfo_xl.py
@@ -100,16 +100,16 @@ class TransfoXLConfig(PretrainedConfig):
 
         Example::
 
-            from transformers import TransfoXLConfig, TransfoXLModel
+            >>> from transformers import TransfoXLConfig, TransfoXLModel
 
-            # Initializing a Transformer XL configuration
-            configuration = TransfoXLConfig()
+            >>> # Initializing a Transformer XL configuration
+            >>> configuration = TransfoXLConfig()
 
-            # Initializing a model from the configuration
-            model = TransfoXLModel(configuration)
+            >>> # Initializing a model from the configuration
+            >>> model = TransfoXLModel(configuration)
 
-            # Accessing the model configuration
-            configuration = model.config
+            >>> # Accessing the model configuration
+            >>> configuration = model.config
     """
 
     model_type = "transfo-xl"
diff --git a/src/transformers/configuration_xlm.py b/src/transformers/configuration_xlm.py
index 1655119f41..d87321a14f 100644
--- a/src/transformers/configuration_xlm.py
+++ b/src/transformers/configuration_xlm.py
@@ -142,16 +142,16 @@ class XLMConfig(PretrainedConfig):
 
         Example::
 
-            from transformers import XLMConfig, XLMModel
+            >>> from transformers import XLMConfig, XLMModel
 
-            # Initializing a XLM configuration
-            configuration = XLMConfig()
+            >>> # Initializing a XLM configuration
+            >>> configuration = XLMConfig()
 
-            # Initializing a model from the configuration
-            model = XLMModel(configuration)
+            >>> # Initializing a model from the configuration
+            >>> model = XLMModel(configuration)
 
-            # Accessing the model configuration
-            configuration = model.config
+            >>> # Accessing the model configuration
+            >>> configuration = model.config
     """
 
     model_type = "xlm"
diff --git a/src/transformers/configuration_xlnet.py b/src/transformers/configuration_xlnet.py
index 2c17696805..355ae36b62 100644
--- a/src/transformers/configuration_xlnet.py
+++ b/src/transformers/configuration_xlnet.py
@@ -113,16 +113,16 @@ class XLNetConfig(PretrainedConfig):
 
         Example::
 
-            from transformers import XLNetConfig, XLNetModel
+            >>> from transformers import XLNetConfig, XLNetModel
 
-            # Initializing a XLNet configuration
-            configuration = XLNetConfig()
+            >>> # Initializing a XLNet configuration
+            >>> configuration = XLNetConfig()
 
-            # Initializing a model from the configuration
-            model = XLNetModel(configuration)
+            >>> # Initializing a model from the configuration
+            >>> model = XLNetModel(configuration)
 
-            # Accessing the model configuration
-            configuration = model.config
+            >>> # Accessing the model configuration
+            >>> configuration = model.config
     """
 
     model_type = "xlnet"
diff --git a/src/transformers/data/processors/squad.py b/src/transformers/data/processors/squad.py
index b717774a61..9ca27d7ff4 100644
--- a/src/transformers/data/processors/squad.py
+++ b/src/transformers/data/processors/squad.py
@@ -488,11 +488,11 @@ class SquadProcessor(DataProcessor):
 
         Examples::
 
-            import tensorflow_datasets as tfds
-            dataset = tfds.load("squad")
+            >>> import tensorflow_datasets as tfds
+            >>> dataset = tfds.load("squad")
 
-            training_examples = get_examples_from_dataset(dataset, evaluate=False)
-            evaluation_examples = get_examples_from_dataset(dataset, evaluate=True)
+            >>> training_examples = get_examples_from_dataset(dataset, evaluate=False)
+            >>> evaluation_examples = get_examples_from_dataset(dataset, evaluate=True)
         """
 
         if evaluate:
diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py
index 19b9bc25eb..b2a3c4f611 100644
--- a/src/transformers/file_utils.py
+++ b/src/transformers/file_utils.py
@@ -186,6 +186,263 @@ def add_end_docstrings(*docstr):
     return docstring_decorator
 
 
+PT_TOKEN_CLASSIFICATION_SAMPLE = r"""
+    Example::
+
+        >>> from transformers import {tokenizer_class}, {model_class}
+        >>> import torch
+
+        >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
+        >>> model = {model_class}.from_pretrained('{checkpoint}')
+
+        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+        >>> labels = torch.tensor([1] * inputs["input_ids"].size(1)).unsqueeze(0)  # Batch size 1
+
+        >>> outputs = model(**inputs, labels=labels)
+        >>> loss, scores = outputs[:2]
+"""
+
+PT_QUESTION_ANSWERING_SAMPLE = r"""
+    Example::
+
+        >>> from transformers import {tokenizer_class}, {model_class}
+        >>> import torch
+
+        >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
+        >>> model = {model_class}.from_pretrained('{checkpoint}')
+
+        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+        >>> start_positions = torch.tensor([1])
+        >>> end_positions = torch.tensor([3])
+
+        >>> outputs = model(**inputs, start_positions=start_positions, end_positions=end_positions)
+        >>> loss, start_scores, end_scores = outputs[:3]
+"""
+
+PT_SEQUENCE_CLASSIFICATION_SAMPLE = r"""
+    Example::
+
+        >>> from transformers import {tokenizer_class}, {model_class}
+        >>> import torch
+
+        >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
+        >>> model = {model_class}.from_pretrained('{checkpoint}')
+
+        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+        >>> labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(**inputs, labels=labels)
+        >>> loss, logits = outputs[:2]
+"""
+
+PT_MASKED_LM_SAMPLE = r"""
+    Example::
+
+        >>> from transformers import {tokenizer_class}, {model_class}
+        >>> import torch
+
+        >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
+        >>> model = {model_class}.from_pretrained('{checkpoint}')
+
+        >>> input_ids = tokenizer("Hello, my dog is cute", return_tensors="pt")["input_ids"]
+
+        >>> outputs = model(input_ids, labels=input_ids)
+        >>> loss, prediction_scores = outputs[:2]
+"""
+
+PT_BASE_MODEL_SAMPLE = r"""
+    Example::
+
+        >>> from transformers import {tokenizer_class}, {model_class}
+        >>> import torch
+
+        >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
+        >>> model = {model_class}.from_pretrained('{checkpoint}')
+
+        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+        >>> outputs = model(**inputs)
+
+        >>> last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+"""
+
+PT_MULTIPLE_CHOICE_SAMPLE = r"""
+    Example::
+
+        >>> from transformers import {tokenizer_class}, {model_class}
+        >>> import torch
+
+        >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
+        >>> model = {model_class}.from_pretrained('{checkpoint}')
+
+        >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
+        >>> choice0 = "It is eaten with a fork and a knife."
+        >>> choice1 = "It is eaten while held in the hand."
+        >>> labels = torch.tensor(0).unsqueeze(0)  # choice0 is correct (according to Wikipedia ;)), batch size 1
+
+        >>> encoding = tokenizer([[prompt, choice0], [prompt, choice1]], return_tensors='pt', pad_to_max_length=True)
+        >>> outputs = model(**{{k: v.unsqueeze(0) for k,v in encoding.items()}}, labels=labels)  # batch size is 1
+
+        >>> # the linear classifier still needs to be trained
+        >>> loss, logits = outputs[:2]
+"""
+
+PT_CAUSAL_LM_SAMPLE = r"""
+    Example::
+
+        >>> import torch
+        >>> from transformers import {tokenizer_class}, {model_class}
+
+        >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
+        >>> model = {model_class}.from_pretrained('{checkpoint}')
+
+        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+        >>> outputs = model(**inputs, labels=inputs["input_ids"])
+        >>> loss, logits = outputs[:2]
+"""
+
+TF_TOKEN_CLASSIFICATION_SAMPLE = r"""
+    Example::
+
+        >>> from transformers import {tokenizer_class}, {model_class}
+        >>> import tensorflow as tf
+
+        >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
+        >>> model = {model_class}.from_pretrained('{checkpoint}')
+
+        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
+        >>> input_ids = inputs["input_ids"]
+        >>> inputs["labels"] = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
+
+        >>> outputs = model(inputs)
+        >>> loss, scores = outputs[:2]
+"""
+
+TF_QUESTION_ANSWERING_SAMPLE = r"""
+    Example::
+
+        >>> from transformers import {tokenizer_class}, {model_class}
+        >>> import tensorflow as tf
+
+        >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
+        >>> model = {model_class}.from_pretrained('{checkpoint}')
+
+        >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
+        >>> input_dict = tokenizer(question, text, return_tensors='tf')
+        >>> start_scores, end_scores = model(input_dict)
+
+        >>> all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
+        >>> answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
+"""
+
+TF_SEQUENCE_CLASSIFICATION_SAMPLE = r"""
+    Example::
+
+        >>> from transformers import {tokenizer_class}, {model_class}
+        >>> import tensorflow as tf
+
+        >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
+        >>> model = {model_class}.from_pretrained('{checkpoint}')
+
+        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
+        >>> inputs["labels"] = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
+
+        >>> outputs = model(inputs)
+        >>> loss, logits = outputs[:2]
+"""
+
+TF_MASKED_LM_SAMPLE = r"""
+    Example::
+
+        >>> from transformers import {tokenizer_class}, {model_class}
+        >>> import tensorflow as tf
+
+        >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
+        >>> model = {model_class}.from_pretrained('{checkpoint}')
+
+        >>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
+        >>> outputs = model(input_ids)
+        >>> prediction_scores = outputs[0]
+"""
+
+TF_BASE_MODEL_SAMPLE = r"""
+    Example::
+
+        >>> from transformers import {tokenizer_class}, {model_class}
+        >>> import tensorflow as tf
+
+        >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
+        >>> model = {model_class}.from_pretrained('{checkpoint}')
+
+        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
+        >>> outputs = model(inputs)
+
+        >>> last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+"""
+
+TF_MULTIPLE_CHOICE_SAMPLE = r"""
+    Example::
+
+        >>> from transformers import {tokenizer_class}, {model_class}
+        >>> import tensorflow as tf
+
+        >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
+        >>> model = {model_class}.from_pretrained('{checkpoint}')
+
+        >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
+        >>> choice0 = "It is eaten with a fork and a knife."
+        >>> choice1 = "It is eaten while held in the hand."
+
+        >>> encoding = tokenizer([[prompt, choice0], [prompt, choice1]], return_tensors='tf', pad_to_max_length=True)
+        >>> inputs = {{k: tf.expand_dims(v, 0) for k, v in encoding.items()}}
+        >>> outputs = model(inputs)  # batch size is 1
+
+        >>> # the linear classifier still needs to be trained
+        >>> logits = outputs[0]
+"""
+
+TF_CAUSAL_LM_SAMPLE = r"""
+    Example::
+
+        >>> from transformers import {tokenizer_class}, {model_class}
+        >>> import tensorflow as tf
+
+        >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
+        >>> model = {model_class}.from_pretrained('{checkpoint}')
+
+        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
+        >>> outputs = model(inputs)
+        >>> logits = outputs[0]
+"""
+
+
+def add_code_sample_docstrings(*docstr, tokenizer_class=None, checkpoint=None):
+    """Decorator factory: append ``docstr`` and a usage example to the decorated method's docstring.
+
+    The example is picked from the first ``__qualname__`` component (``TF`` prefix -> TensorFlow; task by substring).
+    """
+    # (marker, PyTorch sample, TensorFlow sample), scanned in order; the first marker found in the name wins.
+    samples = (
+        ("SequenceClassification", PT_SEQUENCE_CLASSIFICATION_SAMPLE, TF_SEQUENCE_CLASSIFICATION_SAMPLE),
+        ("QuestionAnswering", PT_QUESTION_ANSWERING_SAMPLE, TF_QUESTION_ANSWERING_SAMPLE),
+        ("TokenClassification", PT_TOKEN_CLASSIFICATION_SAMPLE, TF_TOKEN_CLASSIFICATION_SAMPLE),
+        ("MultipleChoice", PT_MULTIPLE_CHOICE_SAMPLE, TF_MULTIPLE_CHOICE_SAMPLE),
+        ("MaskedLM", PT_MASKED_LM_SAMPLE, TF_MASKED_LM_SAMPLE),
+        ("LMHead", PT_CAUSAL_LM_SAMPLE, TF_CAUSAL_LM_SAMPLE),
+        ("Model", PT_BASE_MODEL_SAMPLE, TF_BASE_MODEL_SAMPLE),
+    )
+
+    def docstring_decorator(fn):
+        model_class = fn.__qualname__.split(".")[0]
+        column = 2 if model_class.startswith("TF") else 1
+        code_sample = next((row[column] for row in samples if row[0] in model_class), None)
+        if code_sample is None:
+            raise ValueError(f"Docstring can't be built for model {model_class}")
+        built_doc = code_sample.format(model_class=model_class, tokenizer_class=tokenizer_class, checkpoint=checkpoint)
+        fn.__doc__ = (fn.__doc__ or "") + "".join(docstr) + built_doc
+        return fn
+
+    return docstring_decorator
+
+
 def is_remote_url(url_or_filename):
     parsed = urlparse(url_or_filename)
     return parsed.scheme in ("http", "https")
diff --git a/src/transformers/modeling_albert.py b/src/transformers/modeling_albert.py
index fb86c3d62b..cc8390a149 100644
--- a/src/transformers/modeling_albert.py
+++ b/src/transformers/modeling_albert.py
@@ -24,13 +24,15 @@ import torch.nn as nn
 from torch.nn import CrossEntropyLoss, MSELoss
 
 from .configuration_albert import AlbertConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_bert import ACT2FN, BertEmbeddings, BertSelfAttention, prune_linear_layer
 from .modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices
 
 
 logger = logging.getLogger(__name__)
 
+_TOKENIZER_FOR_DOC = "AlbertTokenizer"
+
 
 ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
     "albert-base-v1",
@@ -485,6 +487,7 @@ class AlbertModel(AlbertPreTrainedModel):
             self.encoder.albert_layer_groups[group_idx].albert_layers[inner_group_idx].attention.prune_heads(heads)
 
     @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
     def forward(
         self,
         input_ids=None,
@@ -521,18 +524,6 @@ class AlbertModel(AlbertPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Example::
-
-        from transformers import AlbertModel, AlbertTokenizer
-        import torch
-
-        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
-        model = AlbertModel.from_pretrained('albert-base-v2')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
         """
 
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@@ -657,16 +648,16 @@ class AlbertForPreTraining(AlbertPreTrainedModel):
 
     Examples::
 
-        from transformers import AlbertTokenizer, AlbertForPreTraining
-        import torch
+        >>> from transformers import AlbertTokenizer, AlbertForPreTraining
+        >>> import torch
 
-        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
-        model = AlbertForPreTraining.from_pretrained('albert-base-v2')
+        >>> tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
+        >>> model = AlbertForPreTraining.from_pretrained('albert-base-v2')
 
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids)
 
-        prediction_scores, sop_scores = outputs[:2]
+        >>> prediction_scores, sop_scores = outputs[:2]
 
         """
 
@@ -763,6 +754,7 @@ class AlbertForMaskedLM(AlbertPreTrainedModel):
         return self.predictions.decoder
 
     @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
     def forward(
         self,
         input_ids=None,
@@ -802,18 +794,6 @@ class AlbertForMaskedLM(AlbertPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Example::
-
-        from transformers import AlbertTokenizer, AlbertForMaskedLM
-        import torch
-
-        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
-        model = AlbertForMaskedLM.from_pretrained('albert-base-v2')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=input_ids)
-        loss, prediction_scores = outputs[:2]
-
         """
         if "masked_lm_labels" in kwargs:
             warnings.warn(
@@ -863,6 +843,7 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel):
         self.init_weights()
 
     @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
     def forward(
         self,
         input_ids=None,
@@ -899,19 +880,6 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-        Examples::
-
-            from transformers import AlbertTokenizer, AlbertForSequenceClassification
-            import torch
-
-            tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
-            model = AlbertForSequenceClassification.from_pretrained('albert-base-v2')
-            input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-            labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
-            outputs = model(input_ids, labels=labels)
-            loss, logits = outputs[:2]
-
         """
 
         outputs = self.albert(
@@ -962,6 +930,7 @@ class AlbertForTokenClassification(AlbertPreTrainedModel):
         self.init_weights()
 
     @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
     def forward(
         self,
         input_ids=None,
@@ -996,21 +965,6 @@ class AlbertForTokenClassification(AlbertPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        from transformers import AlbertTokenizer, AlbertForTokenClassification
-        import torch
-
-        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
-        model = AlbertForTokenClassification.from_pretrained('albert-base-v2')
-
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
-
-        loss, scores = outputs[:2]
-
         """
 
         outputs = self.albert(
@@ -1062,6 +1016,7 @@ class AlbertForQuestionAnswering(AlbertPreTrainedModel):
         self.init_weights()
 
     @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
     def forward(
         self,
         input_ids=None,
@@ -1104,21 +1059,6 @@ class AlbertForQuestionAnswering(AlbertPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        # The checkpoint albert-base-v2 is not fine-tuned for question answering. Please see the
-        # examples/question-answering/run_squad.py example to see how to fine-tune a model to a question answering task.
-
-        from transformers import AlbertTokenizer, AlbertForQuestionAnswering
-        import torch
-
-        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
-        model = AlbertForQuestionAnswering.from_pretrained('albert-base-v2')
-        question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
-        input_dict = tokenizer.encode_plus(question, text, return_tensors='pt')
-        start_scores, end_scores = model(**input_dict)
-
         """
 
         outputs = self.albert(
@@ -1176,6 +1116,7 @@ class AlbertForMultipleChoice(AlbertPreTrainedModel):
         self.init_weights()
 
     @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
     def forward(
         self,
         input_ids=None,
@@ -1213,25 +1154,6 @@ class AlbertForMultipleChoice(AlbertPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        from transformers import AlbertTokenizer, AlbertForMultipleChoice
-        import torch
-
-        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
-        model = AlbertForMultipleChoice.from_pretrained('albert-base-v2')
-
-        prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
-        choice0 = "It is eaten with a fork and a knife."
-        choice1 = "It is eaten while held in the hand."
-        labels = torch.tensor(0).unsqueeze(0)  # choice0 is correct (according to Wikipedia ;)), batch size 1
-
-        encoding = tokenizer.batch_encode_plus([[prompt, choice0], [prompt, choice1]], return_tensors='pt', pad_to_max_length=True)
-        outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels) # batch size is 1
-
-        # the linear classifier still needs to be trained
-        loss, logits = outputs[:2]
         """
         num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
 
diff --git a/src/transformers/modeling_auto.py b/src/transformers/modeling_auto.py
index e06b6b8a07..fff71ef686 100644
--- a/src/transformers/modeling_auto.py
+++ b/src/transformers/modeling_auto.py
@@ -392,8 +392,8 @@ class AutoModel:
 
         Examples::
 
-            config = BertConfig.from_pretrained('bert-base-uncased')    # Download configuration from S3 and cache.
-            model = AutoModel.from_config(config)  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+            >>> config = BertConfig.from_pretrained('bert-base-uncased')    # Download configuration from S3 and cache.
+            >>> model = AutoModel.from_config(config)  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
         """
         for config_class, model_class in MODEL_MAPPING.items():
             if isinstance(config, config_class):
@@ -480,8 +480,7 @@ class AutoModel:
         Examples::
 
             model = AutoModel.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
-            model = AutoModel.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-            assert model.config.output_attention == True
+            assert model.config.output_attentions == True
             # Loading from a TF checkpoint file instead of a PyTorch model (slower)
             config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
             model = AutoModel.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
@@ -547,8 +546,8 @@ class AutoModelForPreTraining:
 
         Examples::
 
-            config = BertConfig.from_pretrained('bert-base-uncased')    # Download configuration from S3 and cache.
-            model = AutoModelForPreTraining.from_config(config)  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+            >>> config = BertConfig.from_pretrained('bert-base-uncased')    # Download configuration from S3 and cache.
+            >>> model = AutoModelForPreTraining.from_config(config)  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
         """
         for config_class, model_class in MODEL_FOR_PRETRAINING_MAPPING.items():
             if isinstance(config, config_class):
diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py
index 8f090f5b17..76f94d016e 100644
--- a/src/transformers/modeling_bart.py
+++ b/src/transformers/modeling_bart.py
@@ -27,12 +27,19 @@ from torch.nn import CrossEntropyLoss
 
 from .activations import ACT2FN
 from .configuration_bart import BartConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import (
+    add_code_sample_docstrings,
+    add_end_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_callable,
+)
 from .modeling_utils import PreTrainedModel
 
 
 logger = logging.getLogger(__name__)
 
+_TOKENIZER_FOR_DOC = "BartTokenizer"
+
 
 BART_PRETRAINED_MODEL_ARCHIVE_LIST = [
     "facebook/bart-large",
@@ -56,14 +63,17 @@ BART_START_DOCSTRING = r"""
 
 """
 BART_GENERATION_EXAMPLE = r"""
-    Examples::
+    Summarization example::
 
         from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig
+
         # see ``examples/summarization/bart/run_eval.py`` for a longer example
         model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
         tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
+
         ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs."
-        inputs = tokenizer.batch_encode_plus([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='pt')
+        inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='pt')
+
         # Generate Summary
         summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=5, early_stopping=True)
         print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids])
@@ -807,6 +817,7 @@ class BartModel(PretrainedBartModel):
         self.init_weights()
 
     @add_start_docstrings_to_callable(BART_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="facebook/bart-large")
     def forward(
         self,
         input_ids,
@@ -883,8 +894,7 @@ class BartModel(PretrainedBartModel):
 
 
 @add_start_docstrings(
-    "The BART Model with a language modeling head. Can be used for summarization.",
-    BART_START_DOCSTRING + BART_GENERATION_EXAMPLE,
+    "The BART Model with a language modeling head. Can be used for summarization.", BART_START_DOCSTRING
 )
 class BartForConditionalGeneration(PretrainedBartModel):
     base_model_prefix = "model"
@@ -911,6 +921,7 @@ class BartForConditionalGeneration(PretrainedBartModel):
         self.register_buffer("final_logits_bias", new_bias)
 
     @add_start_docstrings_to_callable(BART_INPUTS_DOCSTRING)
+    @add_end_docstrings(BART_GENERATION_EXAMPLE)
     def forward(
         self,
         input_ids,
@@ -951,18 +962,21 @@ class BartForConditionalGeneration(PretrainedBartModel):
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
 
-    Examples::
+    Conditional generation example::
 
             # Mask filling only works for bart-large
             from transformers import BartTokenizer, BartForConditionalGeneration
-            tokenizer = BartTokenizer.from_pretrained('bart-large')
+            tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
             TXT = "My friends are <mask> but they eat too many carbs."
-            model = BartForConditionalGeneration.from_pretrained('bart-large')
-            input_ids = tokenizer.batch_encode_plus([TXT], return_tensors='pt')['input_ids']
+
+            model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')
+            input_ids = tokenizer([TXT], return_tensors='pt')['input_ids']
             logits = model(input_ids)[0]
+
             masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
             probs = logits[0, masked_index].softmax(dim=0)
             values, predictions = probs.topk(5)
+
             tokenizer.decode(predictions).split()
             # ['good', 'great', 'all', 'really', 'very']
         """
@@ -1068,6 +1082,7 @@ class BartForSequenceClassification(PretrainedBartModel):
         self.model._init_weights(self.classification_head.out_proj)
 
     @add_start_docstrings_to_callable(BART_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="facebook/bart-large")
     def forward(
         self,
         input_ids,
@@ -1088,32 +1103,19 @@ class BartForSequenceClassification(PretrainedBartModel):
 
     Returns:
         :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BartConfig`) and inputs:
-        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
-            Classification loss (cross entropy)
-        logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
-            Classification (or regression if config.num_labels==1) scores (before SoftMax).
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-
-    Examples::
-
-        from transformers import BartTokenizer, BartForSequenceClassification
-        import torch
-
-        tokenizer = BartTokenizer.from_pretrained('bart-large')
-        model = BartForSequenceClassification.from_pretrained('bart-large')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute",
-        add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, logits = outputs[:2]
-
+            loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
+                Classification loss (cross entropy)
+            logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
+                Classification (or regression if config.num_labels==1) scores (before SoftMax).
+            hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+                Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+                of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+                Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+            attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
+                Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
+                Attention weights after the attention softmax,
+                used to compute the weighted average in the
+                self-attention heads.
         """
         if labels is not None:
             use_cache = False
@@ -1161,6 +1163,7 @@ class BartForQuestionAnswering(PretrainedBartModel):
         self.model._init_weights(self.qa_outputs)
 
     @add_start_docstrings_to_callable(BART_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="facebook/bart-large")
     def forward(
         self,
         input_ids,
@@ -1200,25 +1203,6 @@ class BartForQuestionAnswering(PretrainedBartModel):
             Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        # The checkpoint bart-large is not fine-tuned for question answering. Please see the
-        # examples/question-answering/run_squad.py example to see how to fine-tune a model to a question answering task.
-
-        from transformers import BartTokenizer, BartForQuestionAnswering
-        import torch
-
-        tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
-        model = BartForQuestionAnswering.from_pretrained('facebook/bart-large')
-
-        question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
-        input_ids = tokenizer.encode(question, text)
-        start_scores, end_scores = model(torch.tensor([input_ids]))
-
-        all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
-        answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])
-
         """
         if start_positions is not None and end_positions is not None:
             use_cache = False
@@ -1259,7 +1243,7 @@ class BartForQuestionAnswering(PretrainedBartModel):
             total_loss = (start_loss + end_loss) / 2
             outputs = (total_loss,) + outputs
 
-        return outputs  # (loss), start_logits, end_logits, (hidden_states), (attentions)
+        return outputs  # (loss), start_logits, end_logits, encoder_outputs, (hidden_states), (attentions)
 
 
 class SinusoidalPositionalEmbedding(nn.Embedding):
diff --git a/src/transformers/modeling_bert.py b/src/transformers/modeling_bert.py
index a8bcc76c5a..75a7345c8e 100644
--- a/src/transformers/modeling_bert.py
+++ b/src/transformers/modeling_bert.py
@@ -28,12 +28,14 @@ from torch.nn import CrossEntropyLoss, MSELoss
 
 from .activations import gelu, gelu_new, swish
 from .configuration_bert import BertConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
 
 
 logger = logging.getLogger(__name__)
 
+_TOKENIZER_FOR_DOC = "BertTokenizer"
+
 BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
     "bert-base-uncased",
     "bert-large-uncased",
@@ -664,6 +666,7 @@ class BertModel(BertPreTrainedModel):
             self.encoder.layer[layer].attention.prune_heads(heads)
 
     @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased")
     def forward(
         self,
         input_ids=None,
@@ -702,20 +705,6 @@ class BertModel(BertPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        from transformers import BertModel, BertTokenizer
-        import torch
-
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = BertModel.from_pretrained('bert-base-uncased')
-
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
         """
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -851,16 +840,16 @@ class BertForPreTraining(BertPreTrainedModel):
 
     Examples::
 
-        from transformers import BertTokenizer, BertForPreTraining
-        import torch
+        >>> from transformers import BertTokenizer, BertForPreTraining
+        >>> import torch
 
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = BertForPreTraining.from_pretrained('bert-base-uncased')
+        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        >>> model = BertForPreTraining.from_pretrained('bert-base-uncased')
 
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
+        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+        >>> outputs = model(**inputs)
 
-        prediction_scores, seq_relationship_scores = outputs[:2]
+        >>> prediction_scores, seq_relationship_scores = outputs[:2]
 
         """
         if "masked_lm_labels" in kwargs:
@@ -958,19 +947,20 @@ class BertLMHeadModel(BertPreTrainedModel):
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
 
-        Examples::
+    Example::
 
-            from transformers import BertTokenizer, BertLMHeadModel
-            import torch
+        >>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig
+        >>> import torch
 
-            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-            model = BertLMHeadModel.from_pretrained('bert-base-uncased', is_decoder=True)
+        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
+        >>> config = BertConfig.from_pretrained("bert-base-cased")
+        >>> config.is_decoder = True
+        >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config)
 
-            input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-            outputs = model(input_ids, labels=input_ids)
-
-            loss, prediction_scores = outputs[:2]
+        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+        >>> outputs = model(**inputs)
 
+        >>> prediction_scores = outputs[0]  # With no labels passed, the prediction scores are the first element of the output tuple
         """
 
         outputs = self.bert(
@@ -1028,6 +1018,7 @@ class BertForMaskedLM(BertPreTrainedModel):
         return self.cls.predictions.decoder
 
     @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased")
     def forward(
         self,
         input_ids=None,
@@ -1069,20 +1060,6 @@ class BertForMaskedLM(BertPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-        Examples::
-
-            from transformers import BertTokenizer, BertForMaskedLM
-            import torch
-
-            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-            model = BertForMaskedLM.from_pretrained('bert-base-uncased')
-
-            input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-            outputs = model(input_ids, labels=input_ids)
-
-            loss, prediction_scores = outputs[:2]
-
         """
         if "masked_lm_labels" in kwargs:
             warnings.warn(
@@ -1185,18 +1162,18 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
 
     Examples::
 
-        from transformers import BertTokenizer, BertForNextSentencePrediction
-        import torch
+        >>> from transformers import BertTokenizer, BertForNextSentencePrediction
+        >>> import torch
 
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
+        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        >>> model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
 
-        prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
-        next_sentence = "The sky is blue due to the shorter wavelength of blue light."
-        encoding = tokenizer.encode_plus(prompt, next_sentence, return_tensors='pt')
+        >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
+        >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
+        >>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt')
 
-        loss, logits = model(**encoding, next_sentence_label=torch.LongTensor([1]))
-        assert logits[0, 0] < logits[0, 1] # next sentence was random
+        >>> loss, logits = model(**encoding, next_sentence_label=torch.LongTensor([1]))
+        >>> assert logits[0, 0] < logits[0, 1] # next sentence was random
         """
 
         outputs = self.bert(
@@ -1240,6 +1217,7 @@ class BertForSequenceClassification(BertPreTrainedModel):
         self.init_weights()
 
     @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased")
     def forward(
         self,
         input_ids=None,
@@ -1276,21 +1254,6 @@ class BertForSequenceClassification(BertPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        from transformers import BertTokenizer, BertForSequenceClassification
-        import torch
-
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
-
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
-
-        loss, logits = outputs[:2]
-
         """
 
         outputs = self.bert(
@@ -1340,6 +1303,7 @@ class BertForMultipleChoice(BertPreTrainedModel):
         self.init_weights()
 
     @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased")
     def forward(
         self,
         input_ids=None,
@@ -1377,25 +1341,6 @@ class BertForMultipleChoice(BertPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        from transformers import BertTokenizer, BertForMultipleChoice
-        import torch
-
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = BertForMultipleChoice.from_pretrained('bert-base-uncased')
-
-        prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
-        choice0 = "It is eaten with a fork and a knife."
-        choice1 = "It is eaten while held in the hand."
-        labels = torch.tensor(0).unsqueeze(0)  # choice0 is correct (according to Wikipedia ;)), batch size 1
-
-        encoding = tokenizer.batch_encode_plus([[prompt, choice0], [prompt, choice1]], return_tensors='pt', pad_to_max_length=True)
-        outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels) # batch size is 1
-
-        # the linear classifier still needs to be trained
-        loss, logits = outputs[:2]
         """
         num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
 
@@ -1453,6 +1398,7 @@ class BertForTokenClassification(BertPreTrainedModel):
         self.init_weights()
 
     @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased")
     def forward(
         self,
         input_ids=None,
@@ -1487,21 +1433,6 @@ class BertForTokenClassification(BertPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        from transformers import BertTokenizer, BertForTokenClassification
-        import torch
-
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = BertForTokenClassification.from_pretrained('bert-base-uncased')
-
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
-
-        loss, scores = outputs[:2]
-
         """
 
         outputs = self.bert(
@@ -1554,6 +1485,7 @@ class BertForQuestionAnswering(BertPreTrainedModel):
         self.init_weights()
 
     @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased")
     def forward(
         self,
         input_ids=None,
@@ -1596,25 +1528,6 @@ class BertForQuestionAnswering(BertPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        from transformers import BertTokenizer, BertForQuestionAnswering
-        import torch
-
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
-
-        question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
-        encoding = tokenizer.encode_plus(question, text)
-        input_ids, token_type_ids = encoding["input_ids"], encoding["token_type_ids"]
-        start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids]))
-
-        all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
-        answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])
-
-        assert answer == "a nice puppet"
-
         """
 
         outputs = self.bert(
diff --git a/src/transformers/modeling_camembert.py b/src/transformers/modeling_camembert.py
index 00dd5a74d0..5ee148bc36 100644
--- a/src/transformers/modeling_camembert.py
+++ b/src/transformers/modeling_camembert.py
@@ -31,6 +31,8 @@ from .modeling_roberta import (
 
 logger = logging.getLogger(__name__)
 
+_TOKENIZER_FOR_DOC = "CamembertTokenizer"
+
 CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
     "camembert-base",
     "Musixmatch/umberto-commoncrawl-cased-v1",
diff --git a/src/transformers/modeling_ctrl.py b/src/transformers/modeling_ctrl.py
index 719b8ccdf7..3f11109a4d 100644
--- a/src/transformers/modeling_ctrl.py
+++ b/src/transformers/modeling_ctrl.py
@@ -24,12 +24,14 @@ import torch.nn as nn
 from torch.nn import CrossEntropyLoss
 
 from .configuration_ctrl import CTRLConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_utils import Conv1D, PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
 
 
 logger = logging.getLogger(__name__)
 
+_TOKENIZER_FOR_DOC = "CTRLTokenizer"
+
 CTRL_PRETRAINED_MODEL_ARCHIVE_LIST = [
     "ctrl"
     # See all CTRL models at https://huggingface.co/models?filter=ctrl
@@ -326,6 +328,7 @@ class CTRLModel(CTRLPreTrainedModel):
             self.h[layer].multi_head_attention.prune_heads(heads)
 
     @add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="ctrl")
     def forward(
         self,
         input_ids=None,
@@ -358,20 +361,6 @@ class CTRLModel(CTRLPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        from transformers import CTRLTokenizer, CTRLModel
-        import torch
-
-        tokenizer = CTRLTokenizer.from_pretrained('ctrl')
-        model = CTRLModel.from_pretrained('ctrl')
-
-        input_ids = torch.tensor(tokenizer.encode("Links Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
         """
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         use_cache = use_cache if use_cache is not None else self.config.use_cache
@@ -510,6 +499,7 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
         return {"input_ids": input_ids, "past": past, "use_cache": kwargs["use_cache"]}
 
     @add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="ctrl")
     def forward(
         self,
         input_ids=None,
@@ -552,19 +542,6 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        import torch
-        from transformers import CTRLTokenizer, CTRLLMHeadModel
-
-        tokenizer = CTRLTokenizer.from_pretrained('ctrl')
-        model = CTRLLMHeadModel.from_pretrained('ctrl')
-
-        input_ids = torch.tensor(tokenizer.encode("Links Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=input_ids)
-        loss, logits = outputs[:2]
-
         """
         transformer_outputs = self.transformer(
             input_ids,
diff --git a/src/transformers/modeling_distilbert.py b/src/transformers/modeling_distilbert.py
index b3ea7039cd..398b0e2958 100644
--- a/src/transformers/modeling_distilbert.py
+++ b/src/transformers/modeling_distilbert.py
@@ -30,12 +30,13 @@ from torch.nn import CrossEntropyLoss
 
 from .activations import gelu
 from .configuration_distilbert import DistilBertConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
 
 
 logger = logging.getLogger(__name__)
 
+_TOKENIZER_FOR_DOC = "DistilBertTokenizer"
 
 DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
     "distilbert-base-uncased",
@@ -409,6 +410,7 @@ class DistilBertModel(DistilBertPreTrainedModel):
             self.transformer.layer[layer].attention.prune_heads(heads)
 
     @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
     def forward(
         self,
         input_ids=None,
@@ -434,20 +436,6 @@ class DistilBertModel(DistilBertPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        from transformers import DistilBertTokenizer, DistilBertModel
-        import torch
-
-        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
-        model = DistilBertModel.from_pretrained('distilbert-base-cased')
-
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
         """
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -506,6 +494,7 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
         return self.vocab_projector
 
     @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
     def forward(
         self,
         input_ids=None,
@@ -544,17 +533,6 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
 
-    Examples::
-
-        from transformers import DistilBertTokenizer, DistilBertForMaskedLM
-        import torch
-
-        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
-        model = DistilBertForMaskedLM.from_pretrained('distilbert-base-cased')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=input_ids)
-        loss, prediction_scores = outputs[:2]
-
         """
         if "masked_lm_labels" in kwargs:
             warnings.warn(
@@ -604,6 +582,7 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
         self.init_weights()
 
     @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
     def forward(
         self,
         input_ids=None,
@@ -639,18 +618,6 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
 
-    Examples::
-
-        from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
-        import torch
-
-        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
-        model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-cased')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, logits = outputs[:2]
-
         """
         distilbert_output = self.distilbert(
             input_ids=input_ids,
@@ -697,6 +664,7 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
         self.init_weights()
 
     @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
     def forward(
         self,
         input_ids=None,
@@ -737,20 +705,6 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering
-        import torch
-
-        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
-        model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-cased')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        start_positions = torch.tensor([1])
-        end_positions = torch.tensor([3])
-        outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
-        loss, start_scores, end_scores = outputs[:3]
-
         """
         distilbert_output = self.distilbert(
             input_ids=input_ids,
@@ -806,6 +760,7 @@ class DistilBertForTokenClassification(DistilBertPreTrainedModel):
         self.init_weights()
 
     @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
     def forward(
         self,
         input_ids=None,
@@ -838,19 +793,6 @@ class DistilBertForTokenClassification(DistilBertPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        from transformers import DistilBertTokenizer, DistilBertForTokenClassification
-        import torch
-
-        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
-        model = DistilBertForTokenClassification.from_pretrained('distilbert-base-cased')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, scores = outputs[:2]
-
         """
 
         outputs = self.distilbert(
@@ -940,22 +882,23 @@ class DistilBertForMultipleChoice(DistilBertPreTrainedModel):
 
     Examples::
 
-        from transformers import DistilBertTokenizer, DistilBertForMultipleChoice
-        import torch
+        >>> from transformers import DistilBertTokenizer, DistilBertForMultipleChoice
+        >>> import torch
 
-        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
-        model = DistilBertForMultipleChoice.from_pretrained('distilbert-base-cased')
+        >>> tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
+        >>> model = DistilBertForMultipleChoice.from_pretrained('distilbert-base-cased')
 
-        prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
-        choice0 = "It is eaten with a fork and a knife."
-        choice1 = "It is eaten while held in the hand."
-        labels = torch.tensor(0).unsqueeze(0)  # choice0 is correct (according to Wikipedia ;)), batch size 1
+        >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
+        >>> choice0 = "It is eaten with a fork and a knife."
+        >>> choice1 = "It is eaten while held in the hand."
+        >>> labels = torch.tensor(0).unsqueeze(0)  # choice0 is correct (according to Wikipedia ;)), batch size 1
 
-        encoding = tokenizer.batch_encode_plus([[prompt, choice0], [prompt, choice1]], return_tensors='pt', pad_to_max_length=True)
-        outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels) # batch size is 1
+        >>> encoding = tokenizer.batch_encode_plus([[prompt, choice0], [prompt, choice1]], return_tensors='pt', pad_to_max_length=True)
+        >>> outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels) # batch size is 1
+
+        >>> # the linear classifier still needs to be trained
+        >>> loss, logits = outputs[:2]
 
-        # the linear classifier still needs to be trained
-        loss, logits = outputs[:2]
         """
         num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
 
diff --git a/src/transformers/modeling_electra.py b/src/transformers/modeling_electra.py
index 74426cc89b..e08e487153 100644
--- a/src/transformers/modeling_electra.py
+++ b/src/transformers/modeling_electra.py
@@ -8,13 +8,14 @@ from torch.nn import CrossEntropyLoss, MSELoss
 
 from .activations import get_activation
 from .configuration_electra import ElectraConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_bert import BertEmbeddings, BertEncoder, BertLayerNorm, BertPreTrainedModel
 from .modeling_utils import SequenceSummary
 
 
 logger = logging.getLogger(__name__)
 
+_TOKENIZER_FOR_DOC = "ElectraTokenizer"
 
 ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST = [
     "google/electra-small-generator",
@@ -264,6 +265,7 @@ class ElectraModel(ElectraPreTrainedModel):
             self.encoder.layer[layer].attention.prune_heads(heads)
 
     @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator")
     def forward(
         self,
         input_ids=None,
@@ -291,20 +293,6 @@ class ElectraModel(ElectraPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        from transformers import ElectraModel, ElectraTokenizer
-        import torch
-
-        tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
-        model = ElectraModel.from_pretrained('google/electra-small-discriminator')
-
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
         """
 
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@@ -383,6 +371,7 @@ class ElectraForSequenceClassification(ElectraPreTrainedModel):
         self.init_weights()
 
     @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator")
     def forward(
         self,
         input_ids=None,
@@ -419,21 +408,6 @@ class ElectraForSequenceClassification(ElectraPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        from transformers import BertTokenizer, BertForSequenceClassification
-        import torch
-
-        tokenizer = ElectraTokenizer.from_pretrained('bert-base-uncased')
-        model = ElectraForSequenceClassification.from_pretrained('bert-base-uncased')
-
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
-
-        loss, logits = outputs[:2]
-
         """
         discriminator_hidden_states = self.electra(
             input_ids,
@@ -521,16 +495,14 @@ class ElectraForPreTraining(ElectraPreTrainedModel):
 
     Examples::
 
-        from transformers import ElectraTokenizer, ElectraForPreTraining
-        import torch
+        >>> from transformers import ElectraTokenizer, ElectraForPreTraining
+        >>> import torch
 
-        tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
-        model = ElectraForPreTraining.from_pretrained('google/electra-small-discriminator')
+        >>> tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
+        >>> model = ElectraForPreTraining.from_pretrained('google/electra-small-discriminator')
 
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-
-        prediction_scores, seq_relationship_scores = outputs[:2]
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
+        >>> scores = model(input_ids)[0]
 
         """
 
@@ -589,6 +561,7 @@ class ElectraForMaskedLM(ElectraPreTrainedModel):
         return self.generator_lm_head
 
     @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-generator")
     def forward(
         self,
         input_ids=None,
@@ -628,20 +601,6 @@ class ElectraForMaskedLM(ElectraPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-        Examples::
-
-            from transformers import ElectraTokenizer, ElectraForMaskedLM
-            import torch
-
-            tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-generator')
-            model = ElectraForMaskedLM.from_pretrained('google/electra-small-generator')
-
-            input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-            outputs = model(input_ids, labels=input_ids)
-
-            loss, prediction_scores = outputs[:2]
-
         """
         if "masked_lm_labels" in kwargs:
             warnings.warn(
@@ -696,6 +655,7 @@ class ElectraForTokenClassification(ElectraPreTrainedModel):
         self.init_weights()
 
     @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator")
     def forward(
         self,
         input_ids=None,
@@ -730,21 +690,6 @@ class ElectraForTokenClassification(ElectraPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        from transformers import ElectraTokenizer, ElectraForTokenClassification
-        import torch
-
-        tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
-        model = ElectraForTokenClassification.from_pretrained('google/electra-small-discriminator')
-
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
-
-        loss, scores = outputs[:2]
-
         """
 
         discriminator_hidden_states = self.electra(
@@ -802,6 +747,7 @@ class ElectraForQuestionAnswering(ElectraPreTrainedModel):
         self.init_weights()
 
     @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator")
     def forward(
         self,
         input_ids=None,
@@ -844,23 +790,6 @@ class ElectraForQuestionAnswering(ElectraPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        from transformers import ElectraTokenizer, ElectraForQuestionAnswering
-        import torch
-
-        tokenizer = ElectraTokenizer.from_pretrained('google/electra-base-discriminator')
-        model = ElectraForQuestionAnswering.from_pretrained('google/electra-base-discriminator')
-
-        question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
-        encoding = tokenizer.encode_plus(question, text, return_tensors='pt')
-        input_ids, token_type_ids = encoding['input_ids'], encoding['token_type_ids']
-        start_scores, end_scores = model(input_ids, token_type_ids=token_type_ids)
-
-        all_tokens = tokenizer.convert_ids_to_tokens(input_ids.squeeze(0))
-        answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])
-
         """
 
         discriminator_hidden_states = self.electra(
@@ -918,6 +847,7 @@ class ElectraForMultipleChoice(ElectraPreTrainedModel):
         self.init_weights()
 
     @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator")
     def forward(
         self,
         input_ids=None,
@@ -954,25 +884,6 @@ class ElectraForMultipleChoice(ElectraPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        from transformers import ElectraTokenizer, ElectraForMultipleChoice
-        import torch
-
-        tokenizer = ElectraTokenizer.from_pretrained('google/electra-base-discriminator')
-        model = ElectraForMultipleChoice.from_pretrained('google/electra-base-discriminator')
-
-        prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
-        choice0 = "It is eaten with a fork and a knife."
-        choice1 = "It is eaten while held in the hand."
-        labels = torch.tensor(0) # choice0 is correct (according to Wikipedia ;))
-
-        encoding = tokenizer.batch_encode_plus([[prompt, choice0], [prompt, choice1]], return_tensors='pt', pad_to_max_length=True)
-        outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels) # batch size is 1
-
-        # the linear classifier still needs to be trained
-        loss, logits = outputs[:2]
         """
         num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
 
diff --git a/src/transformers/modeling_encoder_decoder.py b/src/transformers/modeling_encoder_decoder.py
index 668be50478..73954fdcb2 100644
--- a/src/transformers/modeling_encoder_decoder.py
+++ b/src/transformers/modeling_encoder_decoder.py
@@ -126,9 +126,8 @@ class EncoderDecoderModel(PreTrainedModel):
 
         Examples::
 
-            from transformers import EncoderDecoder
-
-            model = EncoderDecoder.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased') # initialize Bert2Bert
+            >>> from transformers import EncoderDecoderModel
+            >>> model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased') # initialize Bert2Bert
         """
 
         kwargs_encoder = {
@@ -244,21 +243,21 @@ class EncoderDecoderModel(PreTrainedModel):
 
         Examples::
 
-            from transformers import EncoderDecoderModel, BertTokenizer
-            import torch
+            >>> from transformers import EncoderDecoderModel, BertTokenizer
+            >>> import torch
 
-            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-            model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased') # initialize Bert2Bert
+            >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+            >>> model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased') # initialize Bert2Bert
 
-            # forward
-            input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-            outputs = model(input_ids=input_ids, decoder_input_ids=input_ids)
+            >>> # forward
+            >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
+            >>> outputs = model(input_ids=input_ids, decoder_input_ids=input_ids)
 
-            # training
-            loss, outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, lm_labels=input_ids)[:2]
+            >>> # training
+            >>> loss, outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, labels=input_ids)[:2]
 
-            # generation
-            generated = model.generate(input_ids, decoder_start_token_id=model.config.decoder.pad_token_id)
+            >>> # generation
+            >>> generated = model.generate(input_ids, decoder_start_token_id=model.config.decoder.pad_token_id)
 
         """
 
diff --git a/src/transformers/modeling_flaubert.py b/src/transformers/modeling_flaubert.py
index b565d7dd92..1fed5b6853 100644
--- a/src/transformers/modeling_flaubert.py
+++ b/src/transformers/modeling_flaubert.py
@@ -22,7 +22,7 @@ import torch
 from torch.nn import functional as F
 
 from .configuration_flaubert import FlaubertConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_xlm import (
     XLMForQuestionAnswering,
     XLMForQuestionAnsweringSimple,
@@ -35,6 +35,8 @@ from .modeling_xlm import (
 
 logger = logging.getLogger(__name__)
 
+_TOKENIZER_FOR_DOC = "FlaubertTokenizer"
+
 FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
     "flaubert/flaubert_small_cased",
     "flaubert/flaubert_base_uncased",
@@ -119,6 +121,7 @@ class FlaubertModel(XLMModel):
         self.pre_norm = getattr(config, "pre_norm", False)
 
     @add_start_docstrings_to_callable(FLAUBERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="flaubert/flaubert_base_cased")
     def forward(
         self,
         input_ids=None,
@@ -149,18 +152,6 @@ class FlaubertModel(XLMModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        from transformers import FlaubertTokenizer, FlaubertModel
-        import torch
-
-        tokenizer = FlaubertTokenizer.from_pretrained('flaubert-base-cased')
-        model = FlaubertModel.from_pretrained('flaubert-base-cased')
-        input_ids = torch.tensor(tokenizer.encode("Le chat mange une pomme.", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
         """
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
diff --git a/src/transformers/modeling_gpt2.py b/src/transformers/modeling_gpt2.py
index 36bd0bc7b3..8bcb8876a9 100644
--- a/src/transformers/modeling_gpt2.py
+++ b/src/transformers/modeling_gpt2.py
@@ -26,7 +26,7 @@ from torch.nn import CrossEntropyLoss
 
 from .activations import ACT2FN
 from .configuration_gpt2 import GPT2Config
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_utils import (
     Conv1D,
     PreTrainedModel,
@@ -38,6 +38,8 @@ from .modeling_utils import (
 
 logger = logging.getLogger(__name__)
 
+_TOKENIZER_FOR_DOC = "GPT2Tokenizer"
+
 GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [
     "gpt2",
     "gpt2-medium",
@@ -370,6 +372,7 @@ class GPT2Model(GPT2PreTrainedModel):
             self.h[layer].attn.prune_heads(heads)
 
     @add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="gpt2")
     def forward(
         self,
         input_ids=None,
@@ -403,18 +406,6 @@ class GPT2Model(GPT2PreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        from transformers import GPT2Tokenizer, GPT2Model
-        import torch
-
-        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-        model = GPT2Model.from_pretrained('gpt2')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
         """
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -553,6 +544,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
         return {"input_ids": input_ids, "past": past, "use_cache": kwargs["use_cache"]}
 
     @add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="gpt2")
     def forward(
         self,
         input_ids=None,
@@ -595,19 +587,6 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        import torch
-        from transformers import GPT2Tokenizer, GPT2LMHeadModel
-
-        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-        model = GPT2LMHeadModel.from_pretrained('gpt2')
-
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=input_ids)
-        loss, logits = outputs[:2]
-
         """
         transformer_outputs = self.transformer(
             input_ids,
@@ -721,26 +700,26 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
 
     Examples::
 
-        import torch
-        from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel
+        >>> import torch
+        >>> from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel
 
-        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-        model = GPT2DoubleHeadsModel.from_pretrained('gpt2')
+        >>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+        >>> model = GPT2DoubleHeadsModel.from_pretrained('gpt2')
 
-        # Add a [CLS] to the vocabulary (we should train it also!)
-        tokenizer.add_special_tokens({'cls_token': '[CLS]'})
-        model.resize_token_embeddings(len(tokenizer))  # Update the model embeddings with the new vocabulary size
-        print(tokenizer.cls_token_id, len(tokenizer))  # The newly token the last token of the vocabulary
+        >>> # Add a [CLS] to the vocabulary (we should train it also!)
+        >>> num_added_tokens = tokenizer.add_special_tokens({'cls_token': '[CLS]'})
 
-        choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
-        encoded_choices = [tokenizer.encode(s) for s in choices]
-        cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]
+        >>> embedding_layer = model.resize_token_embeddings(len(tokenizer))  # Update the model embeddings with the new vocabulary size
 
-        input_ids = torch.tensor(encoded_choices).unsqueeze(0)  # Batch size: 1, number of choices: 2
-        mc_token_ids = torch.tensor([cls_token_location])  # Batch size: 1
+        >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
+        >>> encoded_choices = [tokenizer.encode(s) for s in choices]
+        >>> cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]
 
-        outputs = model(input_ids, mc_token_ids=mc_token_ids)
-        lm_prediction_scores, mc_prediction_scores = outputs[:2]
+        >>> input_ids = torch.tensor(encoded_choices).unsqueeze(0)  # Batch size: 1, number of choices: 2
+        >>> mc_token_ids = torch.tensor([cls_token_location])  # Batch size: 1
+
+        >>> outputs = model(input_ids, mc_token_ids=mc_token_ids)
+        >>> lm_prediction_scores, mc_prediction_scores = outputs[:2]
 
         """
         if "lm_labels" in kwargs:
diff --git a/src/transformers/modeling_longformer.py b/src/transformers/modeling_longformer.py
index 9b52794201..7d2a6978b5 100644
--- a/src/transformers/modeling_longformer.py
+++ b/src/transformers/modeling_longformer.py
@@ -24,13 +24,15 @@ from torch.nn import CrossEntropyLoss, MSELoss
 from torch.nn import functional as F
 
 from .configuration_longformer import LongformerConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_bert import BertPreTrainedModel
 from .modeling_roberta import RobertaLMHead, RobertaModel
 
 
 logger = logging.getLogger(__name__)
 
+_TOKENIZER_FOR_DOC = "LongformerTokenizer"
+
 LONGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
     "allenai/longformer-base-4096",
     "allenai/longformer-large-4096",
@@ -609,22 +611,22 @@ class LongformerModel(RobertaModel):
 
     Examples::
 
-        import torch
-        from transformers import LongformerModel, LongformerTokenizer
+        >>> import torch
+        >>> from transformers import LongformerModel, LongformerTokenizer
 
-        model = LongformerModel.from_pretrained('allenai/longformer-base-4096')
-        tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
+        >>> model = LongformerModel.from_pretrained('allenai/longformer-base-4096')
+        >>> tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
 
-        SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000)  # long input document
-        input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0)  # batch of size 1
+        >>> SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000)  # long input document
+        >>> input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0)  # batch of size 1
 
-        # Attention mask values -- 0: no attention, 1: local attention, 2: global attention
-        attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device) # initialize to local attention
-        attention_mask[:, [1, 4, 21,]] = 2  # Set global attention based on the task. For example,
-                                            # classification: the <s> token
-                                            # QA: question tokens
-                                            # LM: potentially on the beginning of sentences and paragraphs
-        sequence_output, pooled_output = model(input_ids, attention_mask=attention_mask)
+        >>> # Attention mask values -- 0: no attention, 1: local attention, 2: global attention
+        >>> attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device) # initialize to local attention
+        >>> attention_mask[:, [1, 4, 21,]] = 2  # Set global attention based on the task. For example,
+        ...                                     # classification: the <s> token
+        ...                                     # QA: question tokens
+        ...                                     # LM: potentially on the beginning of sentences and paragraphs
+        >>> sequence_output, pooled_output = model(input_ids, attention_mask=attention_mask)
         """
 
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@@ -743,18 +745,18 @@ class LongformerForMaskedLM(BertPreTrainedModel):
 
     Examples::
 
-        import torch
-        from transformers import LongformerForMaskedLM, LongformerTokenizer
+        >>> import torch
+        >>> from transformers import LongformerForMaskedLM, LongformerTokenizer
 
-        model = LongformerForMaskedLM.from_pretrained('allenai/longformer-base-4096')
-        tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
+        >>> model = LongformerForMaskedLM.from_pretrained('allenai/longformer-base-4096')
+        >>> tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
 
-        SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000)  # long input document
-        input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0)  # batch of size 1
+        >>> SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000)  # long input document
+        >>> input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0)  # batch of size 1
 
-        attention_mask = None  # default is local attention everywhere, which is a good choice for MaskedLM
-                               # check ``LongformerModel.forward`` for more details how to set `attention_mask`
-        loss, prediction_scores = model(input_ids, attention_mask=attention_mask, labels=input_ids)
+        >>> attention_mask = None  # default is local attention everywhere, which is a good choice for MaskedLM
+        ...                        # check ``LongformerModel.forward`` for more details on how to set `attention_mask`
+        >>> loss, prediction_scores = model(input_ids, attention_mask=attention_mask, labels=input_ids)
         """
 
         if "masked_lm_labels" in kwargs:
@@ -807,6 +809,7 @@ class LongformerForSequenceClassification(BertPreTrainedModel):
         self.init_weights()
 
     @add_start_docstrings_to_callable(LONGFORMER_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="allenai/longformer-base-4096")
     def forward(
         self,
         input_ids=None,
@@ -843,19 +846,6 @@ class LongformerForSequenceClassification(BertPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        from transformers import LongformerTokenizer, LongformerForSequenceClassification
-        import torch
-
-        tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
-        model = LongformerForSequenceClassification.from_pretrained('allenai/longformer-base-4096')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, logits = outputs[:2]
-
         """
 
         if global_attention_mask is None:
@@ -973,25 +963,25 @@ class LongformerForQuestionAnswering(BertPreTrainedModel):
 
     Examples::
 
-        from transformers import LongformerTokenizer, LongformerForQuestionAnswering
-        import torch
+        >>> from transformers import LongformerTokenizer, LongformerForQuestionAnswering
+        >>> import torch
 
-        tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa")
-        model = LongformerForQuestionAnswering.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa")
+        >>> tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa")
+        >>> model = LongformerForQuestionAnswering.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa")
 
-        question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
-        encoding = tokenizer.encode_plus(question, text, return_tensors="pt")
-        input_ids = encoding["input_ids"]
+        >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
+        >>> encoding = tokenizer.encode_plus(question, text, return_tensors="pt")
+        >>> input_ids = encoding["input_ids"]
 
-        # default is local attention everywhere
-        # the forward method will automatically set global attention on question tokens
-        attention_mask = encoding["attention_mask"]
+        >>> # default is local attention everywhere
+        >>> # the forward method will automatically set global attention on question tokens
+        >>> attention_mask = encoding["attention_mask"]
 
-        start_scores, end_scores = model(input_ids, attention_mask=attention_mask)
-        all_tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())
+        >>> start_scores, end_scores = model(input_ids, attention_mask=attention_mask)
+        >>> all_tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())
 
-        answer_tokens = all_tokens[torch.argmax(start_scores) :torch.argmax(end_scores)+1]
-        answer = tokenizer.decode(tokenizer.convert_tokens_to_ids(answer_tokens)) # remove space prepending space token
+        >>> answer_tokens = all_tokens[torch.argmax(start_scores) :torch.argmax(end_scores)+1]
+        >>> answer = tokenizer.decode(tokenizer.convert_tokens_to_ids(answer_tokens)) # decode back to text, dropping the tokens' leading-space marker
 
         """
 
@@ -1060,6 +1050,7 @@ class LongformerForTokenClassification(BertPreTrainedModel):
         self.init_weights()
 
     @add_start_docstrings_to_callable(LONGFORMER_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="allenai/longformer-base-4096")
     def forward(
         self,
         input_ids=None,
@@ -1094,19 +1085,6 @@ class LongformerForTokenClassification(BertPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        from transformers import LongformerTokenizer, LongformerForTokenClassification
-        import torch
-
-        tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
-        model = LongformerForTokenClassification.from_pretrained('allenai/longformer-base-4096')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, scores = outputs[:2]
-
         """
 
         outputs = self.longformer(
@@ -1163,6 +1141,7 @@ class LongformerForMultipleChoice(BertPreTrainedModel):
         self.init_weights()
 
     @add_start_docstrings_to_callable(LONGFORMER_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="allenai/longformer-base-4096")
     def forward(
         self,
         input_ids=None,
@@ -1200,23 +1179,6 @@ class LongformerForMultipleChoice(BertPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        from transformers import LongformerTokenizer, LongformerForMultipleChoice
-        import torch
-
-        tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
-        model = LongformerForMultipleChoice.from_pretrained('allenai/longformer-base-4096')
-        # context = "The dog is cute" | choice = "the dog" / "the cat"
-        choices = [("The dog is cute", "the dog"), ("The dog is cute", "the cat")]
-        input_ids = torch.tensor([tokenizer.encode(s[0], s[1], add_special_tokens=True) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
-        labels = torch.tensor(1).unsqueeze(0)  # Batch size 1
-
-        # global attention is automatically put on "the dog" and "the cat"
-        outputs = model(input_ids, labels=labels)
-        loss, classification_scores = outputs[:2]
-
         """
         num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
 
diff --git a/src/transformers/modeling_marian.py b/src/transformers/modeling_marian.py
index e8598ca1f6..1b4989ab1b 100644
--- a/src/transformers/modeling_marian.py
+++ b/src/transformers/modeling_marian.py
@@ -31,18 +31,18 @@ class MarianMTModel(BartForConditionalGeneration):
 
     Examples::
 
-        from transformers import MarianTokenizer, MarianMTModel
-        from typing import List
-        src = 'fr'  # source language
-        trg = 'en'  # target language
-        sample_text = "où est l'arrêt de bus ?"
-        mname = f'Helsinki-NLP/opus-mt-{src}-{trg}'
+        >>> from transformers import MarianTokenizer, MarianMTModel
+        >>> from typing import List
+        >>> src = 'fr'  # source language
+        >>> trg = 'en'  # target language
+        >>> sample_text = "où est l'arrêt de bus ?"
+        >>> mname = f'Helsinki-NLP/opus-mt-{src}-{trg}'
 
-        model = MarianMTModel.from_pretrained(mname)
-        tok = MarianTokenizer.from_pretrained(mname)
-        batch = tok.prepare_translation_batch(src_texts=[sample_text])  # don't need tgt_text for inference
-        gen = model.generate(**batch)  # for forward pass: model(**batch)
-        words: List[str] = tok.batch_decode(gen, skip_special_tokens=True)  # returns "Where is the the bus stop ?"
+        >>> model = MarianMTModel.from_pretrained(mname)
+        >>> tok = MarianTokenizer.from_pretrained(mname)
+        >>> batch = tok.prepare_translation_batch(src_texts=[sample_text])  # don't need tgt_text for inference
+        >>> gen = model.generate(**batch)  # for forward pass: model(**batch)
+        >>> words: List[str] = tok.batch_decode(gen, skip_special_tokens=True)  # returns "Where is the bus stop ?"
 
     """
 
diff --git a/src/transformers/modeling_mobilebert.py b/src/transformers/modeling_mobilebert.py
index fa91d77b72..5165d3fa2b 100644
--- a/src/transformers/modeling_mobilebert.py
+++ b/src/transformers/modeling_mobilebert.py
@@ -34,11 +34,14 @@ from transformers.modeling_bert import BertIntermediate
 
 from .activations import gelu, gelu_new, swish
 from .configuration_mobilebert import MobileBertConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
 
 
 logger = logging.getLogger(__name__)
+
+_TOKENIZER_FOR_DOC = "MobileBertTokenizer"
+
 MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST = ["google/mobilebert-uncased"]
 
 
@@ -747,6 +750,7 @@ class MobileBertModel(MobileBertPreTrainedModel):
             self.encoder.layer[layer].attention.prune_heads(heads)
 
     @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
     def forward(
         self,
         input_ids=None,
@@ -785,20 +789,6 @@ class MobileBertModel(MobileBertPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        from transformers import MobileBertModel, MobileBertTokenizer
-        import torch
-
-        tokenizer = MobileBertTokenizer.from_pretrained(model_name_or_path)
-        model = MobileBertModel.from_pretrained(model_name_or_path)
-
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
         """
 
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@@ -951,13 +941,17 @@ class MobileBertForPreTraining(MobileBertPreTrainedModel):
             heads.
 
     Examples::
-        from transformers import MobileBertTokenizer, MobileBertForPreTraining
-        import torch
-        tokenizer = MobileBertTokenizer.from_pretrained(model_name_or_path)
-        model = MobileBertForPreTraining.from_pretrained(model_name_or_path)
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-        prediction_scores, seq_relationship_scores = outputs[:2]
+
+        >>> from transformers import MobileBertTokenizer, MobileBertForPreTraining
+        >>> import torch
+
+        >>> tokenizer = MobileBertTokenizer.from_pretrained("google/mobilebert-uncased")
+        >>> model = MobileBertForPreTraining.from_pretrained("google/mobilebert-uncased")
+
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids)
+
+        >>> prediction_scores, seq_relationship_scores = outputs[:2]
 
         """
         outputs = self.mobilebert(
@@ -1022,6 +1016,7 @@ class MobileBertForMaskedLM(MobileBertPreTrainedModel):
             self._tie_or_clone_weights(output_embeddings, self.get_input_embeddings())
 
     @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
     def forward(
         self,
         input_ids=None,
@@ -1063,20 +1058,6 @@ class MobileBertForMaskedLM(MobileBertPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-        Examples::
-
-            from transformers import MobileBertTokenizer, MobileBertForMaskedLM
-            import torch
-
-            tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
-            model = MobileBertForMaskedLM.from_pretrained('mobilebert-uncased')
-
-            input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-            outputs = model(input_ids, labels=input_ids)
-
-            loss, prediction_scores = outputs[:2]
-
         """
         if "masked_lm_labels" in kwargs:
             warnings.warn(
@@ -1174,18 +1155,17 @@ class MobileBertForNextSentencePrediction(MobileBertPreTrainedModel):
 
     Examples::
 
-        from transformers import MobileBertTokenizer, MobileBertForNextSentencePrediction
-        import torch
+        >>> from transformers import MobileBertTokenizer, MobileBertForNextSentencePrediction
+        >>> import torch
 
-        tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
-        model = MobileBertForNextSentencePrediction.from_pretrained('mobilebert-uncased')
+        >>> tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
+        >>> model = MobileBertForNextSentencePrediction.from_pretrained('google/mobilebert-uncased')
 
-        prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
-        next_sentence = "The sky is blue due to the shorter wavelength of blue light."
-        encoding = tokenizer.encode_plus(prompt, next_sentence, return_tensors='pt')
+        >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
+        >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
+        >>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt')
 
-        loss, logits = model(**encoding, next_sentence_label=torch.LongTensor([1]))
-        assert logits[0, 0] < logits[0, 1] # next sentence was random
+        >>> loss, logits = model(**encoding, next_sentence_label=torch.LongTensor([1]))
         """
 
         outputs = self.mobilebert(
@@ -1228,6 +1208,7 @@ class MobileBertForSequenceClassification(MobileBertPreTrainedModel):
         self.init_weights()
 
     @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
     def forward(
         self,
         input_ids=None,
@@ -1263,20 +1244,6 @@ class MobileBertForSequenceClassification(MobileBertPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        from transformers import BertTokenizer, BertForSequenceClassification
-        import torch
-
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
-
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
-
-        loss, logits = outputs[:2]
         """
 
         outputs = self.mobilebert(
@@ -1321,6 +1288,7 @@ class MobileBertForQuestionAnswering(MobileBertPreTrainedModel):
         self.init_weights()
 
     @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
     def forward(
         self,
         input_ids=None,
@@ -1363,25 +1331,6 @@ class MobileBertForQuestionAnswering(MobileBertPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        from transformers import MobileBertTokenizer, MobileBertForQuestionAnswering
-        import torch
-
-        tokenizer = BertTokenizer.from_pretrained(model_name_or_path)
-        model = MobileBertForQuestionAnswering.from_pretrained(model_name_or_path)
-
-        question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
-        encoding = tokenizer.encode_plus(question, text)
-        input_ids, token_type_ids = encoding["input_ids"], encoding["token_type_ids"]
-        start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids]))
-
-        all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
-        answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])
-
-        assert answer == "a nice puppet"
-
         """
 
         outputs = self.mobilebert(
@@ -1439,6 +1388,7 @@ class MobileBertForMultipleChoice(MobileBertPreTrainedModel):
         self.init_weights()
 
     @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
     def forward(
         self,
         input_ids=None,
@@ -1476,25 +1426,6 @@ class MobileBertForMultipleChoice(MobileBertPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        from transformers import MobileBertTokenizer, MobileBertForMultipleChoice
-        import torch
-
-        tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
-        model = MobileBertForMultipleChoice.from_pretrained('mobilebert-uncased')
-
-        prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
-        choice0 = "It is eaten with a fork and a knife."
-        choice1 = "It is eaten while held in the hand."
-        labels = torch.tensor(0).unsqueeze(0)  # choice0 is correct (according to Wikipedia ;)), batch size 1
-
-        encoding = tokenizer.batch_encode_plus([[prompt, choice0], [prompt, choice1]], return_tensors='pt', pad_to_max_length=True)
-        outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels) # batch size is 1
-
-        # the linear classifier still needs to be trained
-        loss, logits = outputs[:2]
         """
         num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
 
@@ -1552,6 +1483,7 @@ class MobileBertForTokenClassification(MobileBertPreTrainedModel):
         self.init_weights()
 
     @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
     def forward(
         self,
         input_ids=None,
@@ -1586,21 +1518,6 @@ class MobileBertForTokenClassification(MobileBertPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        from transformers import MobileBertTokenizer, MobileBertForTokenClassification
-        import torch
-
-        tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
-        model = MobileBertForTokenClassification.from_pretrained('mobilebert-uncased')
-
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
-
-        loss, scores = outputs[:2]
-
         """
 
         outputs = self.mobilebert(
diff --git a/src/transformers/modeling_openai.py b/src/transformers/modeling_openai.py
index f8643c047c..949a6ccd7a 100644
--- a/src/transformers/modeling_openai.py
+++ b/src/transformers/modeling_openai.py
@@ -28,7 +28,7 @@ from torch.nn import CrossEntropyLoss
 
 from .activations import gelu_new, swish
 from .configuration_openai import OpenAIGPTConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_utils import (
     Conv1D,
     PreTrainedModel,
@@ -40,6 +40,8 @@ from .modeling_utils import (
 
 logger = logging.getLogger(__name__)
 
+_TOKENIZER_FOR_DOC = "OpenAIGPTTokenizer"
+
 OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST = [
     "openai-gpt",
     # See all OpenAI GPT models at https://huggingface.co/models?filter=openai-gpt
@@ -356,6 +358,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
             self.h[layer].attn.prune_heads(heads)
 
     @add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="openai-gpt")
     def forward(
         self,
         input_ids=None,
@@ -383,18 +386,6 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        from transformers import OpenAIGPTTokenizer, OpenAIGPTModel
-        import torch
-
-        tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
-        model = OpenAIGPTModel.from_pretrained('openai-gpt')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
         """
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -490,6 +481,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
         return self.lm_head
 
     @add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="openai-gpt")
     def forward(
         self,
         input_ids=None,
@@ -531,18 +523,6 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        from transformers import OpenAIGPTTokenizer, OpenAIGPTLMHeadModel
-        import torch
-
-        tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
-        model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=input_ids)
-        loss, logits = outputs[:2]
-
     """
         transformer_outputs = self.transformer(
             input_ids,
diff --git a/src/transformers/modeling_reformer.py b/src/transformers/modeling_reformer.py
index 664edb111b..7b763ebf96 100644
--- a/src/transformers/modeling_reformer.py
+++ b/src/transformers/modeling_reformer.py
@@ -29,12 +29,20 @@ from torch.nn import CrossEntropyLoss
 
 from .activations import gelu, gelu_fast, gelu_new, swish
 from .configuration_reformer import ReformerConfig
-from .file_utils import DUMMY_INPUTS, DUMMY_MASK, add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import (
+    DUMMY_INPUTS,
+    DUMMY_MASK,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_callable,
+)
 from .modeling_utils import PreTrainedModel, apply_chunking_to_forward
 
 
 logger = logging.getLogger(__name__)
 
+_TOKENIZER_FOR_DOC = "ReformerTokenizer"
+
 REFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
     "google/reformer-crime-and-punishment",
     "google/reformer-enwik8",
@@ -1543,6 +1551,7 @@ class ReformerModel(ReformerPreTrainedModel):
             self.encoder.layer[layer].attention.prune_heads(heads)
 
     @add_start_docstrings_to_callable(REFORMER_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/reformer-crime-and-punishment")
     def forward(
         self,
         input_ids=None,
@@ -1570,19 +1579,6 @@ class ReformerModel(ReformerPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        from transformers import ReformerModel, ReformerTokenizer
-        import torch
-
-        tokenizer = ReformerTokenizer.from_pretrained('google/reformer-crime-and-punishment')
-        model =  ReformerModel.from_pretrained('google/reformer-crime-and-punishment')
-
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
         """
 
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@@ -1738,6 +1734,7 @@ class ReformerModelWithLMHead(ReformerPreTrainedModel):
         pass
 
     @add_start_docstrings_to_callable(REFORMER_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/reformer-crime-and-punishment")
     def forward(
         self,
         input_ids=None,
@@ -1774,19 +1771,6 @@ class ReformerModelWithLMHead(ReformerPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        from transformers import ReformerModelWithLMHead, ReformerTokenizer
-        import torch
-
-        tokenizer = ReformerTokenizer.from_pretrained('google/reformer-crime-and-punishment')
-        model =  ReformerModelWithLMHead.from_pretrained('google/reformer-crime-and-punishment')
-
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=input_ids)
-
-        loss, prediction_scores = outputs[:2]
         """
 
         reformer_outputs = self.reformer(
diff --git a/src/transformers/modeling_roberta.py b/src/transformers/modeling_roberta.py
index 13452b46ae..7c3f08294d 100644
--- a/src/transformers/modeling_roberta.py
+++ b/src/transformers/modeling_roberta.py
@@ -24,12 +24,14 @@ import torch.nn as nn
 from torch.nn import CrossEntropyLoss, MSELoss
 
 from .configuration_roberta import RobertaConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_bert import BertEmbeddings, BertLayerNorm, BertModel, BertPreTrainedModel, gelu
 
 
 logger = logging.getLogger(__name__)
 
+_TOKENIZER_FOR_DOC = "RobertaTokenizer"
+
 ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [
     "roberta-base",
     "roberta-large",
@@ -177,6 +179,7 @@ class RobertaForMaskedLM(BertPreTrainedModel):
         return self.lm_head.decoder
 
     @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
     def forward(
         self,
         input_ids=None,
@@ -216,18 +219,6 @@ class RobertaForMaskedLM(BertPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        from transformers import RobertaTokenizer, RobertaForMaskedLM
-        import torch
-
-        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
-        model = RobertaForMaskedLM.from_pretrained('roberta-base')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=input_ids)
-        loss, prediction_scores = outputs[:2]
-
         """
         if "masked_lm_labels" in kwargs:
             warnings.warn(
@@ -304,6 +295,7 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
         self.init_weights()
 
     @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
     def forward(
         self,
         input_ids=None,
@@ -340,19 +332,6 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        from transformers import RobertaTokenizer, RobertaForSequenceClassification
-        import torch
-
-        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
-        model = RobertaForSequenceClassification.from_pretrained('roberta-base')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, logits = outputs[:2]
-
         """
         outputs = self.roberta(
             input_ids,
@@ -400,6 +379,7 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
         self.init_weights()
 
     @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
     def forward(
         self,
         input_ids=None,
@@ -437,20 +417,6 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        from transformers import RobertaTokenizer, RobertaForMultipleChoice
-        import torch
-
-        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
-        model = RobertaForMultipleChoice.from_pretrained('roberta-base')
-        choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
-        input_ids = torch.tensor([tokenizer.encode(s, add_special_tokens=True) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
-        labels = torch.tensor(1).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, classification_scores = outputs[:2]
-
         """
         num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
 
@@ -510,6 +476,7 @@ class RobertaForTokenClassification(BertPreTrainedModel):
         self.init_weights()
 
     @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
     def forward(
         self,
         input_ids=None,
@@ -544,19 +511,6 @@ class RobertaForTokenClassification(BertPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        from transformers import RobertaTokenizer, RobertaForTokenClassification
-        import torch
-
-        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
-        model = RobertaForTokenClassification.from_pretrained('roberta-base')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, scores = outputs[:2]
-
         """
 
         outputs = self.roberta(
@@ -632,6 +586,7 @@ class RobertaForQuestionAnswering(BertPreTrainedModel):
         self.init_weights()
 
     @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
     def forward(
         self,
         input_ids=None,
@@ -674,25 +629,6 @@ class RobertaForQuestionAnswering(BertPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        # The checkpoint roberta-large is not fine-tuned for question answering. Please see the
-        # examples/question-answering/run_squad.py example to see how to fine-tune a model to a question answering task.
-
-        from transformers import RobertaTokenizer, RobertaForQuestionAnswering
-        import torch
-
-        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
-        model = RobertaForQuestionAnswering.from_pretrained('roberta-base')
-
-        question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
-        input_ids = tokenizer.encode(question, text)
-        start_scores, end_scores = model(torch.tensor([input_ids]))
-
-        all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
-        answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])
-
         """
 
         outputs = self.roberta(
diff --git a/src/transformers/modeling_t5.py b/src/transformers/modeling_t5.py
index 572b8382bc..ce044efe6c 100644
--- a/src/transformers/modeling_t5.py
+++ b/src/transformers/modeling_t5.py
@@ -33,6 +33,8 @@ from .modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, p
 
 logger = logging.getLogger(__name__)
 
+_TOKENIZER_FOR_DOC = "T5Tokenizer"
+
 ####################################################
 # This dict contrains shortcut names and associated url
 # for the pretrained weights provided with the models
@@ -924,16 +926,17 @@ class T5Model(T5PreTrainedModel):
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
 
-    Examples::
+        Example::
 
-        from transformers import T5Tokenizer, T5Model
+            >>> from transformers import T5Tokenizer, T5Model
 
-        tokenizer = T5Tokenizer.from_pretrained('t5-small')
-        model = T5Model.from_pretrained('t5-small')
-        input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt")  # Batch size 1
-        outputs = model(input_ids=input_ids, decoder_input_ids=input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+            >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
+            >>> model = T5Model.from_pretrained('t5-small')
 
+            >>> input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt")  # Batch size 1
+            >>> outputs = model(input_ids=input_ids, decoder_input_ids=input_ids)
+
+            >>> last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
         """
         use_cache = use_cache if use_cache is not None else self.config.use_cache
 
@@ -1068,18 +1071,18 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
 
     Examples::
 
-        from transformers import T5Tokenizer, T5ForConditionalGeneration
+        >>> from transformers import T5Tokenizer, T5ForConditionalGeneration
 
-        tokenizer = T5Tokenizer.from_pretrained('t5-small')
-        model = T5ForConditionalGeneration.from_pretrained('t5-small')
-        input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt")  # Batch size 1
-        outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, labels=input_ids)
-        loss, prediction_scores = outputs[:2]
+        >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
+        >>> model = T5ForConditionalGeneration.from_pretrained('t5-small')
+        >>> input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt")  # Batch size 1
+        >>> outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, labels=input_ids)
+        >>> loss, prediction_scores = outputs[:2]
 
-        tokenizer = T5Tokenizer.from_pretrained('t5-small')
-        model = T5ForConditionalGeneration.from_pretrained('t5-small')
-        input_ids = tokenizer.encode("summarize: Hello, my dog is cute", return_tensors="pt")  # Batch size 1
-        outputs = model.generate(input_ids)
+        >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
+        >>> model = T5ForConditionalGeneration.from_pretrained('t5-small')
+        >>> input_ids = tokenizer.encode("summarize: Hello, my dog is cute", return_tensors="pt")  # Batch size 1
+        >>> outputs = model.generate(input_ids)
         """
 
         if "lm_labels" in kwargs:
diff --git a/src/transformers/modeling_tf_albert.py b/src/transformers/modeling_tf_albert.py
index 1f038455e8..9f988c647e 100644
--- a/src/transformers/modeling_tf_albert.py
+++ b/src/transformers/modeling_tf_albert.py
@@ -21,7 +21,12 @@ import logging
 import tensorflow as tf
 
 from .configuration_albert import AlbertConfig
-from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import (
+    MULTIPLE_CHOICE_DUMMY_INPUTS,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_callable,
+)
 from .modeling_tf_bert import ACT2FN, TFBertSelfAttention
 from .modeling_tf_utils import (
     TFMultipleChoiceLoss,
@@ -39,6 +44,8 @@ from .tokenization_utils import BatchEncoding
 
 logger = logging.getLogger(__name__)
 
+_TOKENIZER_FOR_DOC = "AlbertTokenizer"
+
 TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
     "albert-base-v1",
     "albert-large-v1",
@@ -713,6 +720,7 @@ class TFAlbertModel(TFAlbertPreTrainedModel):
         self.albert = TFAlbertMainLayer(config, name="albert")
 
     @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
     def call(self, inputs, **kwargs):
         r"""
     Returns:
@@ -737,18 +745,6 @@ class TFAlbertModel(TFAlbertPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import AlbertTokenizer, TFAlbertModel
-
-        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
-        model = TFAlbertModel.from_pretrained('albert-base-v2')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
         """
         outputs = self.albert(inputs, **kwargs)
         return outputs
@@ -837,6 +833,7 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel):
         return self.albert.embeddings
 
     @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
     def call(self, inputs, **kwargs):
         r"""
     Returns:
@@ -854,18 +851,6 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import AlbertTokenizer, TFAlbertForMaskedLM
-
-        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
-        model = TFAlbertForMaskedLM.from_pretrained('albert-base-v2')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        prediction_scores = outputs[0]
-
         """
         outputs = self.albert(inputs, **kwargs)
 
@@ -895,6 +880,7 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel, TFSequenceClass
         )
 
     @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
     def call(
         self,
         inputs=None,
@@ -930,19 +916,6 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel, TFSequenceClass
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import AlbertTokenizer, TFAlbertForSequenceClassification
-
-        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
-        model = TFAlbertForSequenceClassification.from_pretrained('albert-base-v2')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
-        labels = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, logits = outputs[:2]
-
         """
         if isinstance(inputs, (tuple, list)):
             labels = inputs[8] if len(inputs) > 8 else labels
@@ -994,6 +967,7 @@ class TFAlbertForTokenClassification(TFAlbertPreTrainedModel, TFTokenClassificat
         )
 
     @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
     def call(
         self,
         inputs=None,
@@ -1027,19 +1001,6 @@ class TFAlbertForTokenClassification(TFAlbertPreTrainedModel, TFTokenClassificat
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import AlbertTokenizer, TFAlbertForTokenClassification
-
-        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
-        model = TFAlbertForTokenClassification.from_pretrained('albert-base-v2')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, scores = outputs[:2]
-
         """
         if isinstance(inputs, (tuple, list)):
             labels = inputs[8] if len(inputs) > 8 else labels
@@ -1089,6 +1050,7 @@ class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel, TFQuestionAnsweringL
         )
 
     @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
     def call(
         self,
         inputs=None,
@@ -1130,24 +1092,6 @@ class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel, TFQuestionAnsweringL
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        # The checkpoint albert-base-v2 is not fine-tuned for question answering. Please see the
-        # examples/question-answering/run_squad.py example to see how to fine-tune a model to a question answering task.
-
-        import tensorflow as tf
-        from transformers import AlbertTokenizer, TFAlbertForQuestionAnswering
-
-        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
-        model = TFAlbertForQuestionAnswering.from_pretrained('albert-base-v2')
-        question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
-        input_dict = tokenizer.encode_plus(question, text, return_tensors='tf')
-        start_scores, end_scores = model(input_dict)
-
-        all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
-        answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
-
         """
         if isinstance(inputs, (tuple, list)):
             start_positions = inputs[8] if len(inputs) > 8 else start_positions
@@ -1213,6 +1157,7 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, TFMultipleChoiceLoss):
         return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}
 
     @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2")
     def call(
         self,
         inputs,
@@ -1249,22 +1194,6 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, TFMultipleChoiceLoss):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import AlbertTokenizer, TFAlbertForMultipleChoice
-
-        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
-        model = TFAlbertForMultipleChoice.from_pretrained('albert-base-v2')
-        choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
-
-        input_ids = tokenizer(choices, add_special_tokens=True, return_tensors='tf', truncation=True, padding=True)[None, :] # Batch size 1, 2 choices
-        labels = tf.reshape(tf.constant(1), (-1, 1))
-        outputs = model(input_ids, labels=labels)
-
-        loss, classification_scores = outputs[:2]
-
         """
         if isinstance(inputs, (tuple, list)):
             input_ids = inputs[0]
diff --git a/src/transformers/modeling_tf_bert.py b/src/transformers/modeling_tf_bert.py
index 6676b98869..55431d1fbb 100644
--- a/src/transformers/modeling_tf_bert.py
+++ b/src/transformers/modeling_tf_bert.py
@@ -22,7 +22,12 @@ import numpy as np
 import tensorflow as tf
 
 from .configuration_bert import BertConfig
-from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import (
+    MULTIPLE_CHOICE_DUMMY_INPUTS,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_callable,
+)
 from .modeling_tf_utils import (
     TFMultipleChoiceLoss,
     TFPreTrainedModel,
@@ -39,6 +44,7 @@ from .tokenization_utils import BatchEncoding
 
 logger = logging.getLogger(__name__)
 
+_TOKENIZER_FOR_DOC = "BertTokenizer"
 
 TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
     "bert-base-uncased",
@@ -704,6 +710,7 @@ class TFBertModel(TFBertPreTrainedModel):
         self.bert = TFBertMainLayer(config, name="bert")
 
     @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased")
     def call(self, inputs, **kwargs):
         r"""
     Returns:
@@ -728,18 +735,6 @@ class TFBertModel(TFBertPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import BertTokenizer, TFBertModel
-
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = TFBertModel.from_pretrained('bert-base-uncased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
         """
         outputs = self.bert(inputs, **kwargs)
         return outputs
@@ -819,6 +814,7 @@ class TFBertForMaskedLM(TFBertPreTrainedModel):
         return self.bert.embeddings
 
     @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased")
     def call(self, inputs, **kwargs):
         r"""
     Return:
@@ -836,18 +832,6 @@ class TFBertForMaskedLM(TFBertPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import BertTokenizer, TFBertForMaskedLM
-
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = TFBertForMaskedLM.from_pretrained('bert-base-uncased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        prediction_scores = outputs[0]
-
         """
         outputs = self.bert(inputs, **kwargs)
 
@@ -930,6 +914,7 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassific
         )
 
     @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased")
     def call(
         self,
         inputs=None,
@@ -965,19 +950,6 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassific
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import BertTokenizer, TFBertForSequenceClassification
-
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
-        labels = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, logits = outputs[:2]
-
         """
         if isinstance(inputs, (tuple, list)):
             labels = inputs[8] if len(inputs) > 8 else labels
@@ -1037,6 +1009,7 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss):
         return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}
 
     @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased")
     def call(
         self,
         inputs,
@@ -1073,22 +1046,6 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import BertTokenizer, TFBertForMultipleChoice
-
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = TFBertForMultipleChoice.from_pretrained('bert-base-uncased')
-        choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
-
-        input_ids = tf.constant([tokenizer.encode(s, add_special_tokens=True) for s in choices])[None, :] # Batch size 1, 2 choices
-        labels = tf.reshape(tf.constant(1), (-1, 1))
-        outputs = model(input_ids, labels=labels)
-
-        loss, classification_scores = outputs[:2]
-
         """
         if isinstance(inputs, (tuple, list)):
             input_ids = inputs[0]
@@ -1177,6 +1134,7 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationL
         )
 
     @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased")
     def call(
         self,
         inputs=None,
@@ -1210,19 +1168,6 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationL
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import BertTokenizer, TFBertForTokenClassification
-
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = TFBertForTokenClassification.from_pretrained('bert-base-uncased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, scores = outputs[:2]
-
         """
         if isinstance(inputs, (tuple, list)):
             labels = inputs[8] if len(inputs) > 8 else labels
@@ -1273,6 +1218,7 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel, TFQuestionAnsweringLoss)
         )
 
     @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased")
     def call(
         self,
         inputs=None,
@@ -1314,22 +1260,6 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel, TFQuestionAnsweringLoss)
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import BertTokenizer, TFBertForQuestionAnswering
-
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = TFBertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
-        question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
-        input_dict = tokenizer.encode_plus(question, text, return_tensors='tf')
-        start_scores, end_scores = model(input_dict)
-
-        all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
-        answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
-        assert answer == "a nice puppet"
-
         """
         if isinstance(inputs, (tuple, list)):
             start_positions = inputs[8] if len(inputs) > 8 else start_positions
diff --git a/src/transformers/modeling_tf_ctrl.py b/src/transformers/modeling_tf_ctrl.py
index 5c93b89eb8..4bc7cf0910 100644
--- a/src/transformers/modeling_tf_ctrl.py
+++ b/src/transformers/modeling_tf_ctrl.py
@@ -22,7 +22,7 @@ import numpy as np
 import tensorflow as tf
 
 from .configuration_ctrl import CTRLConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_tf_utils import (
     TFPreTrainedModel,
     TFSharedEmbeddings,
@@ -35,6 +35,8 @@ from .tokenization_utils import BatchEncoding
 
 logger = logging.getLogger(__name__)
 
+_TOKENIZER_FOR_DOC = "CTRLTokenizer"
+
 TF_CTRL_PRETRAINED_MODEL_ARCHIVE_LIST = [
     "ctrl"
     # See all CTRL models at https://huggingface.co/models?filter=ctrl
@@ -489,6 +491,7 @@ class TFCTRLModel(TFCTRLPreTrainedModel):
         self.transformer = TFCTRLMainLayer(config, name="transformer")
 
     @add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="ctrl")
     def call(self, inputs, **kwargs):
         r"""
     Return:
@@ -510,18 +513,6 @@ class TFCTRLModel(TFCTRLPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import CTRLTokenizer, TFCTRLModel
-
-        tokenizer = CTRLTokenizer.from_pretrained('ctrl')
-        model = TFCTRLModel.from_pretrained('ctrl')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
         """
         outputs = self.transformer(inputs, **kwargs)
         return outputs
@@ -569,6 +560,7 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel):
         return {"inputs": inputs, "past": past, "use_cache": kwargs["use_cache"]}
 
     @add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="ctrl")
     def call(self, inputs, **kwargs):
         r"""
     Return:
@@ -590,19 +582,6 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import CTRLTokenizer, TFCTRLLMHeadModel
-
-        tokenizer = CTRLTokenizer.from_pretrained('ctrl')
-        model = TFCTRLLMHeadModel.from_pretrained('ctrl')
-
-        input_ids = tf.constant([tokenizer.encode("Links Hello, my dog is cute", add_special_tokens=True)])
-        outputs = model(input_ids)
-        loss, logits = outputs[:2]
-
         """
         transformer_outputs = self.transformer(inputs, **kwargs)
         hidden_states = transformer_outputs[0]
diff --git a/src/transformers/modeling_tf_distilbert.py b/src/transformers/modeling_tf_distilbert.py
index 2007a7519d..bf06335b20 100644
--- a/src/transformers/modeling_tf_distilbert.py
+++ b/src/transformers/modeling_tf_distilbert.py
@@ -23,7 +23,12 @@ import numpy as np
 import tensorflow as tf
 
 from .configuration_distilbert import DistilBertConfig
-from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import (
+    MULTIPLE_CHOICE_DUMMY_INPUTS,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_callable,
+)
 from .modeling_tf_utils import (
     TFMultipleChoiceLoss,
     TFPreTrainedModel,
@@ -41,6 +46,7 @@ from .tokenization_utils import BatchEncoding
 
 logger = logging.getLogger(__name__)
 
+_TOKENIZER_FOR_DOC = "DistilBertTokenizer"
 
 TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
     "distilbert-base-uncased",
@@ -575,6 +581,7 @@ class TFDistilBertModel(TFDistilBertPreTrainedModel):
         self.distilbert = TFDistilBertMainLayer(config, name="distilbert")  # Embeddings
 
     @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
     def call(self, inputs, **kwargs):
         r"""
     Returns:
@@ -592,17 +599,6 @@ class TFDistilBertModel(TFDistilBertPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import DistilBertTokenizer, TFDistilBertModel
-
-        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
-        model = TFDistilBertModel.from_pretrained('distilbert-base-cased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
         """
         outputs = self.distilbert(inputs, **kwargs)
         return outputs
@@ -647,6 +643,7 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
         return self.vocab_projector.input_embeddings
 
     @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
     def call(self, inputs, **kwargs):
         r"""
 
@@ -665,18 +662,6 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import DistilBertTokenizer, TFDistilBertForMaskedLM
-
-        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
-        model = TFDistilBertForMaskedLM.from_pretrained('distilbert-base-cased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        prediction_scores = outputs[0]
-
         """
         distilbert_output = self.distilbert(inputs, **kwargs)
 
@@ -713,6 +698,7 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel, TFSeque
         self.dropout = tf.keras.layers.Dropout(config.seq_classif_dropout)
 
     @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
     def call(
         self,
         inputs=None,
@@ -746,19 +732,6 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel, TFSeque
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
-
-        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
-        model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-cased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
-        labels = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, logits = outputs[:2]
-
         """
         if isinstance(inputs, (tuple, list)):
             labels = inputs[6] if len(inputs) > 6 else labels
@@ -809,6 +782,7 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel, TFTokenCla
         )
 
     @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
     def call(
         self,
         inputs=None,
@@ -840,19 +814,6 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel, TFTokenCla
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import DistilBertTokenizer, TFDistilBertForTokenClassification
-
-        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
-        model = TFDistilBertForTokenClassification.from_pretrained('distilbert-base-cased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, scores = outputs[:2]
-
         """
         if isinstance(inputs, (tuple, list)):
             labels = inputs[6] if len(inputs) > 6 else labels
@@ -916,6 +877,7 @@ class TFDistilBertForMultipleChoice(TFDistilBertPreTrainedModel, TFMultipleChoic
         return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}
 
     @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
     def call(
         self,
         inputs,
@@ -950,22 +912,6 @@ class TFDistilBertForMultipleChoice(TFDistilBertPreTrainedModel, TFMultipleChoic
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import DistilBertTokenizer, TFDistilBertForMultipleChoice
-
-        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
-        model = TFDistilBertForMultipleChoice.from_pretrained('distilbert-base-uncased')
-        choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
-
-        input_ids = tf.constant([tokenizer.encode(s, add_special_tokens=True) for s in choices])[None, :] # Batch size 1, 2 choices
-        labels = tf.reshape(tf.constant(1), (-1, 1))
-        outputs = model(input_ids, labels=labels)
-
-        loss, classification_scores = outputs[:2]
-
         """
         if isinstance(inputs, (tuple, list)):
             input_ids = inputs[0]
@@ -1046,6 +992,7 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel, TFQuestionAn
         self.dropout = tf.keras.layers.Dropout(config.qa_dropout)
 
     @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased")
     def call(
         self,
         inputs=None,
@@ -1085,21 +1032,6 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel, TFQuestionAn
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import DistilBertTokenizer, TFDistilBertForQuestionAnswering
-
-        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
-        model = TFDistilBertForQuestionAnswering.from_pretrained('distilbert-base-cased')
-        question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
-        input_dict = tokenizer.encode_plus(question, text, return_tensors='tf')
-        start_scores, end_scores = model(input_dict)
-
-        all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
-        answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
-
         """
         if isinstance(inputs, (tuple, list)):
             start_positions = inputs[6] if len(inputs) > 6 else start_positions
diff --git a/src/transformers/modeling_tf_electra.py b/src/transformers/modeling_tf_electra.py
index 833987c52d..1aae20b56f 100644
--- a/src/transformers/modeling_tf_electra.py
+++ b/src/transformers/modeling_tf_electra.py
@@ -4,7 +4,7 @@ import tensorflow as tf
 
 from transformers import ElectraConfig
 
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_tf_bert import ACT2FN, TFBertEncoder, TFBertPreTrainedModel
 from .modeling_tf_utils import (
     TFQuestionAnsweringLoss,
@@ -18,6 +18,7 @@ from .tokenization_utils import BatchEncoding
 
 logger = logging.getLogger(__name__)
 
+_TOKENIZER_FOR_DOC = "ElectraTokenizer"
 
 TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST = [
     "google/electra-small-generator",
@@ -383,6 +384,7 @@ class TFElectraModel(TFElectraPreTrainedModel):
         self.electra = TFElectraMainLayer(config, name="electra")
 
     @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator")
     def call(self, inputs, **kwargs):
         r"""
     Returns:
@@ -400,17 +402,6 @@ class TFElectraModel(TFElectraPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import ElectraTokenizer, TFElectraModel
-
-        tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
-        model = TFElectraModel.from_pretrained('google/electra-small-discriminator')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
         """
         outputs = self.electra(inputs, **kwargs)
         return outputs
@@ -532,6 +523,7 @@ class TFElectraForMaskedLM(TFElectraPreTrainedModel):
         return self.generator_lm_head
 
     @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-generator")
     def call(
         self,
         input_ids=None,
@@ -560,18 +552,6 @@ class TFElectraForMaskedLM(TFElectraPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import ElectraTokenizer, TFElectraForMaskedLM
-
-        tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-generator')
-        model = TFElectraForMaskedLM.from_pretrained('google/electra-small-generator')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        prediction_scores = outputs[0]
-
         """
 
         generator_hidden_states = self.electra(
@@ -611,6 +591,7 @@ class TFElectraForTokenClassification(TFElectraPreTrainedModel, TFTokenClassific
         )
 
     @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator")
     def call(
         self,
         inputs=None,
@@ -644,19 +625,6 @@ class TFElectraForTokenClassification(TFElectraPreTrainedModel, TFTokenClassific
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import ElectraTokenizer, TFElectraForTokenClassification
-
-        tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
-        model = TFElectraForTokenClassification.from_pretrained('google/electra-small-discriminator')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, scores = outputs[:2]
-
         """
         if isinstance(inputs, (tuple, list)):
             labels = inputs[8] if len(inputs) > 8 else labels
@@ -705,6 +673,7 @@ class TFElectraForQuestionAnswering(TFElectraPreTrainedModel, TFQuestionAnswerin
         )
 
     @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/electra-small-discriminator")
     def call(
         self,
         inputs=None,
@@ -746,22 +715,6 @@ class TFElectraForQuestionAnswering(TFElectraPreTrainedModel, TFQuestionAnswerin
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import ElectraTokenizer, TFElectraForQuestionAnswering
-
-        tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-generator')
-        model = TFElectraForQuestionAnswering.from_pretrained('google/electra-small-generator')
-
-        question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
-        input_dict = tokenizer.encode_plus(question, text, return_tensors='tf')
-        start_scores, end_scores = model(input_dict)
-
-        all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
-        answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
-
         """
         if isinstance(inputs, (tuple, list)):
             start_positions = inputs[8] if len(inputs) > 8 else start_positions
diff --git a/src/transformers/modeling_tf_gpt2.py b/src/transformers/modeling_tf_gpt2.py
index bf83f9d953..f2bc63392a 100644
--- a/src/transformers/modeling_tf_gpt2.py
+++ b/src/transformers/modeling_tf_gpt2.py
@@ -22,7 +22,7 @@ import numpy as np
 import tensorflow as tf
 
 from .configuration_gpt2 import GPT2Config
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_tf_utils import (
     TFConv1D,
     TFPreTrainedModel,
@@ -38,6 +38,8 @@ from .tokenization_utils import BatchEncoding
 
 logger = logging.getLogger(__name__)
 
+_TOKENIZER_FOR_DOC = "GPT2Tokenizer"
+
 TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [
     "gpt2",
     "gpt2-medium",
@@ -490,6 +492,7 @@ class TFGPT2Model(TFGPT2PreTrainedModel):
         self.transformer = TFGPT2MainLayer(config, name="transformer")
 
     @add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="gpt2")
     def call(self, inputs, **kwargs):
         r"""
     Return:
@@ -511,18 +514,6 @@ class TFGPT2Model(TFGPT2PreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import GPT2Tokenizer, TFGPT2Model
-
-        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-        model = TFGPT2Model.from_pretrained('gpt2')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
     """
         outputs = self.transformer(inputs, **kwargs)
         return outputs
@@ -549,6 +540,7 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):
         return {"inputs": inputs, "past": past, "use_cache": kwargs["use_cache"]}
 
     @add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="gpt2")
     def call(self, inputs, **kwargs):
         r"""
     Return:
@@ -570,19 +562,6 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import GPT2Tokenizer, TFGPT2LMHeadModel
-
-        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-        model = TFGPT2LMHeadModel.from_pretrained('gpt2')
-
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        logits = outputs[0]
-
         """
         transformer_outputs = self.transformer(inputs, **kwargs)
         hidden_states = transformer_outputs[0]
@@ -659,29 +638,26 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
 
     Examples::
 
-        # For example purposes. Not runnable.
-        import tensorflow as tf
-        from transformers import GPT2Tokenizer, TFGPT2DoubleHeadsModel
+        >>> import tensorflow as tf
+        >>> from transformers import GPT2Tokenizer, TFGPT2DoubleHeadsModel
 
-        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-        model = TFGPT2DoubleHeadsModel.from_pretrained('gpt2')
+        >>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+        >>> model = TFGPT2DoubleHeadsModel.from_pretrained('gpt2')
 
-        # Add a [CLS] to the vocabulary (we should train it also!)
-        # This option is currently not implemented in TF 2.0
-        raise NotImplementedError
-        tokenizer.add_special_tokens({'cls_token': '[CLS]'})
-        model.resize_token_embeddings(len(tokenizer))  # Update the model embeddings with the new vocabulary size
-        print(tokenizer.cls_token_id, len(tokenizer))  # The newly token the last token of the vocabulary
+        >>> # Add a [CLS] to the vocabulary (we should train it also!)
+        >>> num_added_tokens = tokenizer.add_special_tokens({'cls_token': '[CLS]'})
 
-        choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
-        encoded_choices = [tokenizer.encode(s) for s in choices]
-        cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]
+        >>> embedding_layer = model.resize_token_embeddings(len(tokenizer))  # Update the model embeddings with the new vocabulary size
 
-        input_ids = tf.constant(encoded_choices)[None, :]  # Batch size: 1, number of choices: 2
-        mc_token_ids = tf.constant([cls_token_location])  # Batch size: 1
+        >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
+        >>> encoded_choices = [tokenizer.encode(s) for s in choices]
+        >>> cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]
 
-        outputs = model(input_ids, mc_token_ids=mc_token_ids)
-        lm_prediction_scores, mc_prediction_scores = outputs[:2]
+        >>> input_ids = tf.constant(encoded_choices)[None, :]  # Batch size: 1, number of choices: 2
+        >>> mc_token_ids = tf.constant([cls_token_location])  # Batch size: 1
+
+        >>> outputs = model(input_ids, mc_token_ids=mc_token_ids)
+        >>> lm_prediction_scores, mc_prediction_scores = outputs[:2]
 
         """
         if isinstance(inputs, (tuple, list)):
diff --git a/src/transformers/modeling_tf_mobilebert.py b/src/transformers/modeling_tf_mobilebert.py
index 7a77f7cc1f..3178bccfaf 100644
--- a/src/transformers/modeling_tf_mobilebert.py
+++ b/src/transformers/modeling_tf_mobilebert.py
@@ -21,7 +21,12 @@ import logging
 import tensorflow as tf
 
 from . import MobileBertConfig
-from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import (
+    MULTIPLE_CHOICE_DUMMY_INPUTS,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_callable,
+)
 from .modeling_tf_bert import TFBertIntermediate, gelu, gelu_new, swish
 from .modeling_tf_utils import (
     TFMultipleChoiceLoss,
@@ -39,6 +44,7 @@ from .tokenization_utils import BatchEncoding
 
 logger = logging.getLogger(__name__)
 
+_TOKENIZER_FOR_DOC = "MobileBertTokenizer"
 
 TF_MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
     "mobilebert-uncased",
@@ -621,19 +627,6 @@ class TFMobileBertMLMHead(tf.keras.layers.Layer):
         return prediction_scores
 
 
-class TFMobileBertPreTrainingHeads(tf.keras.layers.Layer):
-    def __init__(self, config, **kwargs):
-        super().__init__(**kwargs)
-        self.predictions = TFMobileBertLMPredictionHead(config, name="predictions")
-        self.seq_relationship = tf.keras.layers.Dense(2, name="seq_relationship")
-
-    def call(self, inputs):
-        sequence_output, pooled_output = inputs
-        prediction_scores = self.predictions(sequence_output)
-        seq_relationship_score = self.seq_relationship(pooled_output)
-        return prediction_scores, seq_relationship_score
-
-
 @keras_serializable
 class TFMobileBertMainLayer(tf.keras.layers.Layer):
     config_class = MobileBertConfig
@@ -845,6 +838,7 @@ class TFMobileBertModel(TFMobileBertPreTrainedModel):
         self.mobilebert = TFMobileBertMainLayer(config, name="mobilebert")
 
     @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
     def call(self, inputs, **kwargs):
         r"""
     Returns:
@@ -869,18 +863,6 @@ class TFMobileBertModel(TFMobileBertPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import MobileBertTokenizer, TFMobileBertModel
-
-        tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
-        model = TFMobileBertModel.from_pretrained('mobilebert-uncased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
         """
         outputs = self.mobilebert(inputs, **kwargs)
         return outputs
@@ -895,7 +877,8 @@ class TFMobileBertForPreTraining(TFMobileBertPreTrainedModel):
     def __init__(self, config, *inputs, **kwargs):
         super().__init__(config, *inputs, **kwargs)
         self.mobilebert = TFMobileBertMainLayer(config, name="mobilebert")
-        self.cls = TFMobileBertPreTrainingHeads(config, name="cls")
+        self.predictions = TFMobileBertMLMHead(config, name="predictions___cls")
+        self.seq_relationship = TFMobileBertOnlyNSPHead(2, name="seq_relationship___cls")
 
     def get_output_embeddings(self):
         return self.mobilebert.embeddings
@@ -923,20 +906,21 @@ class TFMobileBertForPreTraining(TFMobileBertPreTrainedModel):
 
     Examples::
 
-        import tensorflow as tf
-        from transformers import MobileBertTokenizer, TFMobileBertForPreTraining
+        >>> import tensorflow as tf
+        >>> from transformers import MobileBertTokenizer, TFMobileBertForPreTraining
 
-        tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
-        model = TFMobileBertForPreTraining.from_pretrained('mobilebert-uncased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        prediction_scores, seq_relationship_scores = outputs[:2]
+        >>> tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
+        >>> model = TFMobileBertForPreTraining.from_pretrained('google/mobilebert-uncased')
+        >>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        >>> outputs = model(input_ids)
+        >>> prediction_scores, seq_relationship_scores = outputs[:2]
 
         """
         outputs = self.mobilebert(inputs, **kwargs)
 
         sequence_output, pooled_output = outputs[:2]
-        prediction_scores, seq_relationship_score = self.cls([sequence_output, pooled_output])
+        prediction_scores = self.predictions(sequence_output)
+        seq_relationship_score = self.seq_relationship(pooled_output)
         outputs = (prediction_scores, seq_relationship_score,) + outputs[
             2:
         ]  # add hidden states and attention if they are here
@@ -956,6 +940,7 @@ class TFMobileBertForMaskedLM(TFMobileBertPreTrainedModel):
         return self.mobilebert.embeddings
 
     @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
     def call(self, inputs, **kwargs):
         r"""
     Return:
@@ -973,18 +958,6 @@ class TFMobileBertForMaskedLM(TFMobileBertPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import MobileBertTokenizer, TFMobileBertForMaskedLM
-
-        tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
-        model = TFMobileBertForMaskedLM.from_pretrained('mobilebert-uncased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        prediction_scores = outputs[0]
-
         """
         outputs = self.mobilebert(inputs, **kwargs)
 
@@ -1015,7 +988,7 @@ class TFMobileBertForNextSentencePrediction(TFMobileBertPreTrainedModel):
         super().__init__(config, *inputs, **kwargs)
 
         self.mobilebert = TFMobileBertMainLayer(config, name="mobilebert")
-        self.cls = TFMobileBertOnlyNSPHead(config, name="cls")
+        self.cls = TFMobileBertOnlyNSPHead(config, name="seq_relationship___cls")
 
     @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
     def call(self, inputs, **kwargs):
@@ -1038,18 +1011,17 @@ class TFMobileBertForNextSentencePrediction(TFMobileBertPreTrainedModel):
 
     Examples::
 
-        import tensorflow as tf
-        from transformers import MobileBertTokenizer, TFMobileBertForNextSentencePrediction
+        >>> import tensorflow as tf
+        >>> from transformers import MobileBertTokenizer, TFMobileBertForNextSentencePrediction
 
-        tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
-        model = TFMobileBertForNextSentencePrediction.from_pretrained('mobilebert-uncased')
+        >>> tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
+        >>> model = TFMobileBertForNextSentencePrediction.from_pretrained('google/mobilebert-uncased')
 
-        prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
-        next_sentence = "The sky is blue due to the shorter wavelength of blue light."
-        encoding = tokenizer.encode_plus(prompt, next_sentence, return_tensors='tf')
+        >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
+        >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
+        >>> encoding = tokenizer(prompt, next_sentence, return_tensors='tf')
 
-        logits = model(encoding['input_ids'], token_type_ids=encoding['token_type_ids'])[0]
-        assert logits[0][0] < logits[0][1] # the next sentence was random
+        >>> logits = model(encoding['input_ids'], token_type_ids=encoding['token_type_ids'])[0]
         """
         outputs = self.mobilebert(inputs, **kwargs)
 
@@ -1078,6 +1050,7 @@ class TFMobileBertForSequenceClassification(TFMobileBertPreTrainedModel, TFSeque
         )
 
     @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
     def call(
         self,
         inputs=None,
@@ -1113,19 +1086,6 @@ class TFMobileBertForSequenceClassification(TFMobileBertPreTrainedModel, TFSeque
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import MobileBertTokenizer, TFBMobileBertForSequenceClassification
-
-        tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
-        model = TFMobileBertForSequenceClassification.from_pretrained('mobilebert-uncased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
-        labels = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, logits = outputs[:2]
-
         """
         if isinstance(inputs, (tuple, list)):
             labels = inputs[8] if len(inputs) > 8 else labels
@@ -1176,6 +1136,7 @@ class TFMobileBertForQuestionAnswering(TFMobileBertPreTrainedModel, TFQuestionAn
         )
 
     @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
     def call(
         self,
         inputs=None,
@@ -1217,22 +1178,6 @@ class TFMobileBertForQuestionAnswering(TFMobileBertPreTrainedModel, TFQuestionAn
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import MobileBertTokenizer, TFMobileBertForQuestionAnswering
-
-        tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
-        model = TFMobileBertForQuestionAnswering.from_pretrained('mobilebert-uncased')  # Not a fine-tuned model! Load a fine-tuned model to obtain coherent results.
-        question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
-        input_dict = tokenizer.encode_plus(question, text, return_tensors='tf')
-        start_scores, end_scores = model(input_dict)
-
-        all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
-        answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
-        assert answer == "a nice puppet"
-
         """
         if isinstance(inputs, (tuple, list)):
             start_positions = inputs[8] if len(inputs) > 8 else start_positions
@@ -1298,6 +1243,7 @@ class TFMobileBertForMultipleChoice(TFMobileBertPreTrainedModel, TFMultipleChoic
         return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}
 
     @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
     def call(
         self,
         inputs,
@@ -1334,22 +1280,6 @@ class TFMobileBertForMultipleChoice(TFMobileBertPreTrainedModel, TFMultipleChoic
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import MobileBertTokenizer, TFMobileBertForMultipleChoice
-
-        tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
-        model = TFMobileBertForMultipleChoice.from_pretrained('mobilebert-uncased')
-        choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
-
-        input_ids = tf.constant([tokenizer.encode(s, add_special_tokens=True) for s in choices])[None, :] # Batch size 1, 2 choices
-        labels = tf.reshape(tf.constant(1), (-1, 1))
-        outputs = model(input_ids, labels=labels)
-
-        loss, classification_scores = outputs[:2]
-
         """
         if isinstance(inputs, (tuple, list)):
             input_ids = inputs[0]
@@ -1438,6 +1368,7 @@ class TFMobileBertForTokenClassification(TFMobileBertPreTrainedModel, TFTokenCla
         )
 
     @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased")
     def call(
         self,
         inputs=None,
@@ -1471,19 +1402,6 @@ class TFMobileBertForTokenClassification(TFMobileBertPreTrainedModel, TFTokenCla
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import MobileBertTokenizer, TFMobileBertForTokenClassification
-
-        tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased')
-        model = TFMobileBertForTokenClassification.from_pretrained('mobilebert-uncased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, scores = outputs[:2]
-
         """
         if isinstance(inputs, (tuple, list)):
             labels = inputs[8] if len(inputs) > 8 else labels
diff --git a/src/transformers/modeling_tf_openai.py b/src/transformers/modeling_tf_openai.py
index 89ab8f2f1d..c254e32751 100644
--- a/src/transformers/modeling_tf_openai.py
+++ b/src/transformers/modeling_tf_openai.py
@@ -22,7 +22,7 @@ import numpy as np
 import tensorflow as tf
 
 from .configuration_openai import OpenAIGPTConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_tf_utils import (
     TFConv1D,
     TFPreTrainedModel,
@@ -38,6 +38,8 @@ from .tokenization_utils import BatchEncoding
 
 logger = logging.getLogger(__name__)
 
+_TOKENIZER_FOR_DOC = "OpenAIGPTTokenizer"
+
 TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST = [
     "openai-gpt",
     # See all OpenAI GPT models at https://huggingface.co/models?filter=openai-gpt
@@ -449,6 +451,7 @@ class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel):
         self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")
 
     @add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="openai-gpt")
     def call(self, inputs, **kwargs):
         r"""
     Return:
@@ -466,18 +469,6 @@ class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import OpenAIGPTTokenizer, TFOpenAIGPTModel
-
-        tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
-        model = TFOpenAIGPTModel.from_pretrained('openai-gpt')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
         """
         outputs = self.transformer(inputs, **kwargs)
         return outputs
@@ -497,6 +488,7 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel):
         return self.transformer.tokens_embed
 
     @add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="openai-gpt")
     def call(self, inputs, **kwargs):
         r"""
     Return:
@@ -514,18 +506,6 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import OpenAIGPTTokenizer, TFOpenAIGPTLMHeadModel
-
-        tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
-        model = TFOpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        logits = outputs[0]
-
         """
         transformer_outputs = self.transformer(inputs, **kwargs)
         hidden_states = transformer_outputs[0]
@@ -601,26 +581,23 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
 
     Examples::
 
-        # For example purposes. Not runnable.
-        import tensorflow as tf
-        from transformers import OpenAIGPTTokenizer, TFOpenAIGPTDoubleHeadsModel
+        >>> import tensorflow as tf
+        >>> from transformers import OpenAIGPTTokenizer, TFOpenAIGPTDoubleHeadsModel
 
-        tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
-        model = TFOpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')
+        >>> tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
+        >>> model = TFOpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')
 
-        # Add a [CLS] to the vocabulary (we should train it also!)
-        # This option is currently not implemented in TF 2.0
-        raise NotImplementedError
-        tokenizer.add_special_tokens({'cls_token': '[CLS]'})
-        model.resize_token_embeddings(len(tokenizer))  # Update the model embeddings with the new vocabulary size
-        print(tokenizer.cls_token_id, len(tokenizer))  # The newly token the last token of the vocabulary
-
-        choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
-        input_ids = tf.constant([tokenizer.encode(s) for s in choices])[None, :]  # Batch size 1, 2 choices
-        mc_token_ids = tf.constant([input_ids.size(-1), input_ids.size(-1)])[None, :]  # Batch size 1
-        outputs = model(input_ids, mc_token_ids=mc_token_ids)
-        lm_prediction_scores, mc_prediction_scores = outputs[:2]
+        >>> # Add a [CLS] to the vocabulary (we should train it also!)
+        >>> tokenizer.add_special_tokens({'cls_token': '[CLS]'})
+        >>> model.resize_token_embeddings(len(tokenizer))  # Update the model embeddings with the new vocabulary size
+        >>> print(tokenizer.cls_token_id, len(tokenizer))  # The newly added token is the last token of the vocabulary
 
+        >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
+        >>> encoding = tokenizer(choices, return_tensors="tf")
+        >>> inputs = {k: tf.expand_dims(v, 0) for k, v in encoding.items()}
+        >>> inputs["mc_token_ids"] = tf.constant([inputs["input_ids"].shape[-1] - 1, inputs["input_ids"].shape[-1] - 1])[None, :]  # Batch size 1
+        >>> outputs = model(inputs)
+        >>> lm_prediction_scores, mc_prediction_scores = outputs[:2]
         """
 
         if isinstance(inputs, (tuple, list)):
@@ -633,7 +610,7 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
             mc_token_ids = inputs[6] if len(inputs) > 6 else mc_token_ids
             output_attentions = inputs[7] if len(inputs) > 7 else output_attentions
             assert len(inputs) <= 8, "Too many inputs."
-        elif isinstance(inputs, dict):
+        elif isinstance(inputs, (dict, BatchEncoding)):
             input_ids = inputs.get("input_ids")
             attention_mask = inputs.get("attention_mask", attention_mask)
             token_type_ids = inputs.get("token_type_ids", token_type_ids)
diff --git a/src/transformers/modeling_tf_roberta.py b/src/transformers/modeling_tf_roberta.py
index 66c1f6b8e4..751ca17abc 100644
--- a/src/transformers/modeling_tf_roberta.py
+++ b/src/transformers/modeling_tf_roberta.py
@@ -21,7 +21,12 @@ import logging
 import tensorflow as tf
 
 from .configuration_roberta import RobertaConfig
-from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import (
+    MULTIPLE_CHOICE_DUMMY_INPUTS,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_callable,
+)
 from .modeling_tf_bert import TFBertEmbeddings, TFBertMainLayer, gelu
 from .modeling_tf_utils import (
     TFMultipleChoiceLoss,
@@ -38,6 +43,8 @@ from .tokenization_utils_base import BatchEncoding
 
 logger = logging.getLogger(__name__)
 
+_TOKENIZER_FOR_DOC = "RobertaTokenizer"
+
 TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [
     "roberta-base",
     "roberta-large",
@@ -195,6 +202,7 @@ class TFRobertaModel(TFRobertaPreTrainedModel):
         self.roberta = TFRobertaMainLayer(config, name="roberta")
 
     @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
     def call(self, inputs, **kwargs):
         r"""
     Returns:
@@ -219,18 +227,6 @@ class TFRobertaModel(TFRobertaPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import RobertaTokenizer, TFRobertaModel
-
-        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
-        model = TFRobertaModel.from_pretrained('roberta-base')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
         """
         outputs = self.roberta(inputs, **kwargs)
         return outputs
@@ -279,6 +275,7 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel):
         return self.lm_head.decoder
 
     @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
     def call(self, inputs, **kwargs):
         r"""
     Return:
@@ -296,18 +293,6 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import RobertaTokenizer, TFRobertaForMaskedLM
-
-        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
-        model = TFRobertaForMaskedLM.from_pretrained('roberta-base')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        prediction_scores = outputs[0]
-
         """
         outputs = self.roberta(inputs, **kwargs)
 
@@ -358,6 +343,7 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel, TFSequenceCla
         self.classifier = TFRobertaClassificationHead(config, name="classifier")
 
     @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
     def call(
         self,
         inputs=None,
@@ -387,19 +373,6 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel, TFSequenceCla
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import RobertaTokenizer, TFRobertaForSequenceClassification
-
-        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
-        model = TFRobertaForSequenceClassification.from_pretrained('roberta-base')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
-        labels = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, logits = outputs[:2]
-
         """
         if isinstance(inputs, (tuple, list)):
             labels = inputs[8] if len(inputs) > 8 else labels
@@ -441,7 +414,7 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss)
     def __init__(self, config, *inputs, **kwargs):
         super().__init__(config, *inputs, **kwargs)
 
-        self.roberta = TFBertMainLayer(config, name="roberta")
+        self.roberta = TFRobertaMainLayer(config, name="roberta")
         self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
         self.classifier = tf.keras.layers.Dense(
             1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
@@ -457,6 +430,7 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss)
         return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}
 
     @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
     def call(
         self,
         inputs,
@@ -493,22 +467,6 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss)
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import RobertaTokenizer, TFRobertaForMultipleChoice
-
-        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
-        model = TFRobertaForMultipleChoice.from_pretrained('roberta-base')
-        choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
-
-        input_ids = tf.constant([tokenizer.encode(s, add_special_tokens=True) for s in choices])[None, :] # Batch size 1, 2 choices
-        labels = tf.reshape(tf.constant(1), (-1, 1))
-        outputs = model(input_ids, labels=labels)
-
-        loss, classification_scores = outputs[:2]
-
         """
         if isinstance(inputs, (tuple, list)):
             input_ids = inputs[0]
@@ -592,6 +550,7 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel, TFTokenClassific
         )
 
     @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
     def call(
         self,
         inputs=None,
@@ -625,19 +584,6 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel, TFTokenClassific
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import RobertaTokenizer, TFRobertaForTokenClassification
-
-        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
-        model = TFRobertaForTokenClassification.from_pretrained('roberta-base')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, scores = outputs[:2]
-
         """
         if isinstance(inputs, (tuple, list)):
             labels = inputs[8] if len(inputs) > 8 else labels
@@ -687,6 +633,7 @@ class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel, TFQuestionAnswerin
         )
 
     @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base")
     def call(
         self,
         inputs=None,
@@ -728,24 +675,6 @@ class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel, TFQuestionAnswerin
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        # The checkpoint roberta-base is not fine-tuned for question answering. Please see the
-        # examples/question-answering/run_squad.py example to see how to fine-tune a model to a question answering task.
-
-        import tensorflow as tf
-        from transformers import RobertaTokenizer, TFRobertaForQuestionAnswering
-
-        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
-        model = TFRobertaForQuestionAnswering.from_pretrained('roberta-base')
-        question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
-        input_dict = tokenizer.encode_plus(question, text, return_tensors='tf')
-        start_scores, end_scores = model(input_dict)
-
-        all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
-        answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
-
         """
         if isinstance(inputs, (tuple, list)):
             start_positions = inputs[8] if len(inputs) > 8 else start_positions
diff --git a/src/transformers/modeling_tf_t5.py b/src/transformers/modeling_tf_t5.py
index 959ceadd8c..1898397b4c 100644
--- a/src/transformers/modeling_tf_t5.py
+++ b/src/transformers/modeling_tf_t5.py
@@ -37,6 +37,8 @@ from .tokenization_utils import BatchEncoding
 
 logger = logging.getLogger(__name__)
 
+_TOKENIZER_FOR_DOC = "T5Tokenizer"
+
 TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST = [
     "t5-small",
     "t5-base",
@@ -931,13 +933,13 @@ class TFT5Model(TFT5PreTrainedModel):
 
     Examples::
 
-        from transformers import T5Tokenizer, TFT5Model
+        >>> from transformers import T5Tokenizer, TFT5Model
 
-        tokenizer = T5Tokenizer.from_pretrained('t5-small')
-        model = TFT5Model.from_pretrained('t5-small')
-        inputs = tokenizer.encode("Hello, my dog is cute", return_tensors="tf")  # Batch size 1
-        outputs = model(inputs, decoder_input_ids=inputs)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+        >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
+        >>> model = TFT5Model.from_pretrained('t5-small')
+        >>> inputs = tokenizer.encode("Hello, my dog is cute", return_tensors="tf")  # Batch size 1
+        >>> outputs = model(inputs, decoder_input_ids=inputs)
+        >>> last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
         """
 
@@ -1074,18 +1076,18 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel):
 
     Examples::
 
-        from transformers import T5Tokenizer, TFT5ForConditionalGeneration
+        >>> from transformers import T5Tokenizer, TFT5ForConditionalGeneration
 
-        tokenizer = T5Tokenizer.from_pretrained('t5-small')
-        model = TFT5ForConditionalGeneration.from_pretrained('t5-small')
-        inputs = tokenizer.encode("Hello, my dog is cute", return_tensors="tf")  # Batch size 1
-        outputs = model(inputs, decoder_input_ids=inputs)
-        prediction_scores = outputs[0]
+        >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
+        >>> model = TFT5ForConditionalGeneration.from_pretrained('t5-small')
+        >>> inputs = tokenizer.encode("Hello, my dog is cute", return_tensors="tf")  # Batch size 1
+        >>> outputs = model(inputs, decoder_input_ids=inputs)
+        >>> prediction_scores = outputs[0]
 
-        tokenizer = T5Tokenizer.from_pretrained('t5-small')
-        model = TFT5ForConditionalGeneration.from_pretrained('t5-small')
-        inputs = tokenizer.encode("summarize: Hello, my dog is cute", return_tensors="tf")  # Batch size 1
-        model.generate(inputs)
+        >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
+        >>> model = TFT5ForConditionalGeneration.from_pretrained('t5-small')
+        >>> inputs = tokenizer.encode("summarize: Hello, my dog is cute", return_tensors="tf")  # Batch size 1
+        >>> result = model.generate(inputs)
 
         """
 
diff --git a/src/transformers/modeling_tf_transfo_xl.py b/src/transformers/modeling_tf_transfo_xl.py
index 63921c4368..ae9accb255 100644
--- a/src/transformers/modeling_tf_transfo_xl.py
+++ b/src/transformers/modeling_tf_transfo_xl.py
@@ -22,7 +22,7 @@ import logging
 import tensorflow as tf
 
 from .configuration_transfo_xl import TransfoXLConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_tf_transfo_xl_utilities import TFAdaptiveSoftmaxMask
 from .modeling_tf_utils import (
     TFPreTrainedModel,
@@ -36,6 +36,8 @@ from .tokenization_utils import BatchEncoding
 
 logger = logging.getLogger(__name__)
 
+_TOKENIZER_FOR_DOC = "TransfoXLTokenizer"
+
 TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST = [
     "transfo-xl-wt103",
     # See all Transformer XL models at https://huggingface.co/models?filter=transfo-xl
@@ -722,6 +724,7 @@ class TFTransfoXLModel(TFTransfoXLPreTrainedModel):
         self.transformer = TFTransfoXLMainLayer(config, name="transformer")
 
     @add_start_docstrings_to_callable(TRANSFO_XL_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="transfo-xl-wt103")
     def call(self, inputs, **kwargs):
         r"""
     Return:
@@ -743,18 +746,6 @@ class TFTransfoXLModel(TFTransfoXLPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import TransfoXLTokenizer, TFTransfoXLModel
-
-        tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
-        model = TFTransfoXLModel.from_pretrained('transfo-xl-wt103')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states, mems = outputs[:2]
-
         """
         outputs = self.transformer(inputs, **kwargs)
         return outputs
@@ -811,6 +802,7 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
         return self.transformer.init_mems(bsz)
 
     @add_start_docstrings_to_callable(TRANSFO_XL_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="transfo-xl-wt103")
     def call(
         self,
         inputs,
@@ -842,18 +834,6 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import TransfoXLTokenizer, TFTransfoXLLMHeadModel
-
-        tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
-        model = TFTransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        prediction_scores, mems = outputs[:2]
-
         """
         if isinstance(inputs, (tuple, list)):
             input_ids = inputs[0]
@@ -863,7 +843,7 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
             labels = inputs[4] if len(inputs) > 4 else labels
             output_attentions = inputs[5] if len(inputs) > 5 else output_attentions
             assert len(inputs) <= 6, "Too many inputs."
-        elif isinstance(inputs, dict):
+        elif isinstance(inputs, (BatchEncoding, dict)):
             input_ids = inputs.get("input_ids")
             mems = inputs.get("mems", mems)
             head_mask = inputs.get("head_mask", head_mask)
diff --git a/src/transformers/modeling_tf_xlm.py b/src/transformers/modeling_tf_xlm.py
index 54416b2e6d..007bb572e6 100644
--- a/src/transformers/modeling_tf_xlm.py
+++ b/src/transformers/modeling_tf_xlm.py
@@ -24,7 +24,12 @@ import numpy as np
 import tensorflow as tf
 
 from .configuration_xlm import XLMConfig
-from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import (
+    MULTIPLE_CHOICE_DUMMY_INPUTS,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_callable,
+)
 from .modeling_tf_utils import (
     TFMultipleChoiceLoss,
     TFPreTrainedModel,
@@ -43,6 +48,8 @@ from .tokenization_utils import BatchEncoding
 
 logger = logging.getLogger(__name__)
 
+_TOKENIZER_FOR_DOC = "XLMTokenizer"
+
 TF_XLM_PRETRAINED_MODEL_ARCHIVE_LIST = [
     "xlm-mlm-en-2048",
     "xlm-mlm-ende-1024",
@@ -608,6 +615,7 @@ class TFXLMModel(TFXLMPreTrainedModel):
         self.transformer = TFXLMMainLayer(config, name="transformer")
 
     @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
     def call(self, inputs, **kwargs):
         r"""
     Return:
@@ -625,18 +633,6 @@ class TFXLMModel(TFXLMPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import XLMTokenizer, TFXLMModel
-
-        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
-        model = TFXLMModel.from_pretrained('xlm-mlm-en-2048')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
         """
         outputs = self.transformer(inputs, **kwargs)
         return outputs
@@ -704,6 +700,7 @@ class TFXLMWithLMHeadModel(TFXLMPreTrainedModel):
         return {"inputs": inputs, "langs": langs}
 
     @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
     def call(self, inputs, **kwargs):
         r"""
     Return:
@@ -721,18 +718,6 @@ class TFXLMWithLMHeadModel(TFXLMPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import XLMTokenizer, TFXLMWithLMHeadModel
-
-        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
-        model = TFXLMWithLMHeadModel.from_pretrained('xlm-mlm-en-2048')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
         """
         transformer_outputs = self.transformer(inputs, **kwargs)
 
@@ -757,6 +742,7 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel, TFSequenceClassificat
         self.sequence_summary = TFSequenceSummary(config, initializer_range=config.init_std, name="sequence_summary")
 
     @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
     def call(
         self,
         inputs=None,
@@ -795,19 +781,6 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel, TFSequenceClassificat
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import XLMTokenizer, TFXLMForSequenceClassification
-
-        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
-        model = TFXLMForSequenceClassification.from_pretrained('xlm-mlm-en-2048')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
-        labels = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, logits = outputs[:2]
-
         """
         if isinstance(inputs, (tuple, list)):
             labels = inputs[11] if len(inputs) > 11 else labels
@@ -865,6 +838,7 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
         return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}
 
     @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
     def call(
         self,
         inputs,
@@ -876,9 +850,9 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
         cache=None,
         head_mask=None,
         inputs_embeds=None,
-        labels=None,
         output_attentions=None,
         output_hidden_states=None,
+        labels=None,
         training=False,
     ):
         r"""
@@ -904,22 +878,6 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import XLMTokenizer, TFXLMForMultipleChoice
-
-        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
-        model = TFXLMForMultipleChoice.from_pretrained('xlm-mlm-en-2048')
-        choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
-
-        input_ids = tf.constant([tokenizer.encode(s, add_special_tokens=True) for s in choices])[None, :] # Batch size 1, 2 choices
-        labels = tf.reshape(tf.constant(1), (-1, 1))
-        outputs = model(input_ids, labels=labels)
-
-        loss, classification_scores = outputs[:2]
-
         """
         if isinstance(inputs, (tuple, list)):
             input_ids = inputs[0]
@@ -932,7 +890,9 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
             head_mask = inputs[7] if len(inputs) > 7 else head_mask
             inputs_embeds = inputs[8] if len(inputs) > 8 else inputs_embeds
             output_attentions = inputs[9] if len(inputs) > 9 else output_attentions
-            assert len(inputs) <= 10, "Too many inputs."
+            output_hidden_states = inputs[10] if len(inputs) > 10 else output_hidden_states
+            labels = inputs[11] if len(inputs) > 11 else labels
+            assert len(inputs) <= 12, "Too many inputs."  # labels is read from index 11, so 12 entries are valid
         elif isinstance(inputs, (dict, BatchEncoding)):
             input_ids = inputs.get("input_ids")
             attention_mask = inputs.get("attention_mask", attention_mask)
@@ -944,7 +904,9 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
             head_mask = inputs.get("head_mask", head_mask)
             inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
             output_attentions = inputs.get("output_attentions", output_attentions)
-            assert len(inputs) <= 10, "Too many inputs."
+            output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
+            labels = inputs.get("labels", labels)
+            assert len(inputs) <= 12, "Too many inputs."
         else:
             input_ids = inputs
 
@@ -1001,13 +963,14 @@ class TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLos
         self.transformer = TFXLMMainLayer(config, name="transformer")
         self.dropout = tf.keras.layers.Dropout(config.dropout)
         self.classifier = tf.keras.layers.Dense(
-            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
+            config.num_labels, kernel_initializer=get_initializer(config.init_std), name="classifier"
         )
 
     @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
     def call(
         self,
-        input_ids=None,
+        inputs=None,
         attention_mask=None,
         langs=None,
         token_type_ids=None,
@@ -1016,9 +979,9 @@ class TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLos
         cache=None,
         head_mask=None,
         inputs_embeds=None,
-        labels=None,
         output_attentions=None,
         output_hidden_states=None,
+        labels=None,
         training=False,
     ):
         r"""
@@ -1041,25 +1004,22 @@ class TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLos
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import XLMTokenizer, TFXLMForTokenClassification
-
-        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
-        model = TFXLMForTokenClassification.from_pretrained('xlm-mlm-en-2048')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, scores = outputs[:2]
-
         """
+        if isinstance(inputs, (tuple, list)):
+            labels = inputs[11] if len(inputs) > 11 else labels
+            if len(inputs) > 11:
+                inputs = inputs[:11]
+        elif isinstance(inputs, (dict, BatchEncoding)):
+            labels = inputs.pop("labels", labels)
+
         transformer_outputs = self.transformer(
-            input_ids,
+            inputs,
             attention_mask=attention_mask,
+            langs=langs,
             token_type_ids=token_type_ids,
             position_ids=position_ids,
+            lengths=lengths,
+            cache=cache,
             head_mask=head_mask,
             inputs_embeds=inputs_embeds,
             output_attentions=output_attentions,
@@ -1072,7 +1032,7 @@ class TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLos
         sequence_output = self.dropout(sequence_output, training=training)
         logits = self.classifier(sequence_output)
 
-        outputs = (logits,) + transformer_outputs[2:]  # add hidden states and attention if they are here
+        outputs = (logits,) + transformer_outputs[1:]  # add hidden states and attention if they are here
 
         if labels is not None:
             loss = self.compute_loss(labels, logits)
@@ -1095,6 +1055,7 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel, TFQuestionAnsweringL
         )
 
     @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
     def call(
         self,
         inputs=None,
@@ -1139,21 +1100,6 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel, TFQuestionAnsweringL
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import XLMTokenizer, TFXLMForQuestionAnsweringSimple
-
-        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
-        model = TFXLMForQuestionAnsweringSimple.from_pretrained('xlm-mlm-en-2048')
-        question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
-        input_dict = tokenizer.encode_plus(question, text, return_tensors='tf')
-        start_scores, end_scores = model(input_dict)
-
-        all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
-        answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
-
         """
         if isinstance(inputs, (tuple, list)):
             start_positions = inputs[11] if len(inputs) > 11 else start_positions
diff --git a/src/transformers/modeling_tf_xlnet.py b/src/transformers/modeling_tf_xlnet.py
index db5b4e840d..80ee28fc78 100644
--- a/src/transformers/modeling_tf_xlnet.py
+++ b/src/transformers/modeling_tf_xlnet.py
@@ -23,7 +23,12 @@ import numpy as np
 import tensorflow as tf
 
 from .configuration_xlnet import XLNetConfig
-from .file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import (
+    MULTIPLE_CHOICE_DUMMY_INPUTS,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_callable,
+)
 from .modeling_tf_utils import (
     TFMultipleChoiceLoss,
     TFPreTrainedModel,
@@ -42,6 +47,8 @@ from .tokenization_utils import BatchEncoding
 
 logger = logging.getLogger(__name__)
 
+_TOKENIZER_FOR_DOC = "XLNetTokenizer"
+
 TF_XLNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
     "xlnet-base-cased",
     "xlnet-large-cased",
@@ -832,6 +839,7 @@ class TFXLNetModel(TFXLNetPreTrainedModel):
         self.transformer = TFXLNetMainLayer(config, name="transformer")
 
     @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
     def call(self, inputs, **kwargs):
         r"""
     Return:
@@ -853,18 +861,6 @@ class TFXLNetModel(TFXLNetPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import XLNetTokenizer, TFXLNetModel
-
-        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
-        model = TFXLNetModel.from_pretrained('xlnet-large-cased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
         """
         outputs = self.transformer(inputs, **kwargs)
         return outputs
@@ -949,10 +945,13 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel):
 
         # We show how to setup inputs to predict a next token using a bi-directional context.
         input_ids = tf.constant(tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=True))[None, :]  # We will predict the masked token
+
         perm_mask = np.zeros((1, input_ids.shape[1], input_ids.shape[1]))
         perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
+
         target_mapping = np.zeros((1, 1, input_ids.shape[1]))  # Shape [1, 1, seq_length] => let's predict one token
         target_mapping[0, 0, -1] = 1.0  # Our first (and only) prediction will be the last token of the sequence (the masked token)
+
         outputs = model(input_ids, perm_mask=tf.constant(perm_mask, dtype=tf.float32), target_mapping=tf.constant(target_mapping, dtype=tf.float32))
 
         next_token_logits = outputs[0]  # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
@@ -986,6 +985,7 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel, TFSequenceClassif
         )
 
     @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
     def call(
         self,
         inputs=None,
@@ -1029,19 +1029,6 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel, TFSequenceClassif
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import XLNetTokenizer, TFXLNetForSequenceClassification
-
-        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
-        model = TFXLNetForSequenceClassification.from_pretrained('xlnet-large-cased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
-        labels = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, logits = outputs[:2]
-
         """
         if isinstance(inputs, (tuple, list)):
             labels = inputs[12] if len(inputs) > 12 else labels
@@ -1105,6 +1092,7 @@ class TFXLNetForMultipleChoice(TFXLNetPreTrainedModel, TFMultipleChoiceLoss):
         return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)}
 
     @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
     def call(
         self,
         inputs=None,
@@ -1145,22 +1133,6 @@ class TFXLNetForMultipleChoice(TFXLNetPreTrainedModel, TFMultipleChoiceLoss):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import XLNetTokenizer, TFXLNetForMultipleChoice
-
-        tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
-        model = TFXLNetForMultipleChoice.from_pretrained('xlnet-base-cased')
-        choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
-
-        input_ids = tf.constant([tokenizer.encode(s, add_special_tokens=True) for s in choices])[None, :] # Batch size 1, 2 choices
-        labels = tf.reshape(tf.constant(1), (-1, 1))
-        outputs = model(input_ids, labels=labels)
-
-        loss, classification_scores = outputs[:2]
-
         """
         if isinstance(inputs, (tuple, list)):
             input_ids = inputs[0]
@@ -1257,6 +1229,8 @@ class TFXLNetForTokenClassification(TFXLNetPreTrainedModel, TFTokenClassificatio
             config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
         )
 
+    @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
     def call(
         self,
         inputs=None,
@@ -1298,19 +1272,6 @@ class TFXLNetForTokenClassification(TFXLNetPreTrainedModel, TFTokenClassificatio
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import XLNetTokenizer, TFXLNetForTokenClassification
-
-        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
-        model = TFXLNetForTokenClassification.from_pretrained('xlnet-large-cased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        labels = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, scores = outputs[:2]
-
         """
         if isinstance(inputs, (tuple, list)):
             labels = inputs[12] if len(inputs) > 12 else labels
@@ -1361,6 +1322,7 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel, TFQuestionAnswer
         )
 
     @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
     def call(
         self,
         inputs=None,
@@ -1412,21 +1374,6 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel, TFQuestionAnswer
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import XLNetTokenizer, TFXLNetForQuestionAnsweringSimple
-
-        tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
-        model = TFXLNetForQuestionAnsweringSimple.from_pretrained('xlnet-base-cased')
-        question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
-        input_dict = tokenizer.encode_plus(question, text, return_tensors='tf')
-        start_scores, end_scores = model(input_dict)
-
-        all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
-        answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
-
         """
         if isinstance(inputs, (tuple, list)):
             start_positions = inputs[12] if len(inputs) > 12 else start_positions
diff --git a/src/transformers/modeling_transfo_xl.py b/src/transformers/modeling_transfo_xl.py
index 6b2eb2c8e5..2e39ef025c 100644
--- a/src/transformers/modeling_transfo_xl.py
+++ b/src/transformers/modeling_transfo_xl.py
@@ -27,13 +27,15 @@ import torch.nn as nn
 import torch.nn.functional as F
 
 from .configuration_transfo_xl import TransfoXLConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_transfo_xl_utilities import ProjectedAdaptiveLogSoftmax
 from .modeling_utils import PreTrainedModel
 
 
 logger = logging.getLogger(__name__)
 
+_TOKENIZER_FOR_DOC = "TransfoXLTokenizer"
+
 TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST = [
     "transfo-xl-wt103",
     # See all Transformer XL models at https://huggingface.co/models?filter=transfo-xl
@@ -749,6 +751,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
         return new_mems
 
     @add_start_docstrings_to_callable(TRANSFO_XL_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="transfo-xl-wt103")
     def forward(
         self,
         input_ids=None,
@@ -778,18 +781,6 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        from transformers import TransfoXLTokenizer, TransfoXLModel
-        import torch
-
-        tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
-        model = TransfoXLModel.from_pretrained('transfo-xl-wt103')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states, mems = outputs[:2]
-
         """
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -945,6 +936,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
         return self.transformer.init_mems(bsz)
 
     @add_start_docstrings_to_callable(TRANSFO_XL_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="transfo-xl-wt103")
     def forward(
         self,
         input_ids=None,
@@ -984,18 +976,6 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        from transformers import TransfoXLTokenizer, TransfoXLLMHeadModel
-        import torch
-
-        tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
-        model = TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-        prediction_scores, mems = outputs[:2]
-
         """
         if input_ids is not None:
             bsz, tgt_len = input_ids.size(0), input_ids.size(1)
diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index befb577317..f1797df96a 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -978,13 +978,15 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
 
         Examples::
 
+            from transformers import AutoTokenizer, AutoModelForCausalLM
+
             tokenizer = AutoTokenizer.from_pretrained('distilgpt2')   # Initialize tokenizer
-            model = AutoModelWithLMHead.from_pretrained('distilgpt2')    # Download model and configuration from S3 and cache.
+            model = AutoModelForCausalLM.from_pretrained('distilgpt2')    # Download model and configuration from S3 and cache.
             outputs = model.generate(max_length=40)  # do greedy decoding
             print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True)))
 
             tokenizer = AutoTokenizer.from_pretrained('openai-gpt')   # Initialize tokenizer
-            model = AutoModelWithLMHead.from_pretrained('openai-gpt')    # Download model and configuration from S3 and cache.
+            model = AutoModelForCausalLM.from_pretrained('openai-gpt')    # Download model and configuration from S3 and cache.
             input_context = 'The dog'
             input_ids = tokenizer.encode(input_context, return_tensors='pt')  # encode input context
             outputs = model.generate(input_ids=input_ids, num_beams=5, num_return_sequences=3, temperature=1.5)  # generate 3 independent sequences using beam search decoding (5 beams) with sampling from initial context 'The dog'
@@ -992,22 +994,22 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
                 print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True)))
 
             tokenizer = AutoTokenizer.from_pretrained('distilgpt2')   # Initialize tokenizer
-            model = AutoModelWithLMHead.from_pretrained('distilgpt2')    # Download model and configuration from S3 and cache.
+            model = AutoModelForCausalLM.from_pretrained('distilgpt2')    # Download model and configuration from S3 and cache.
             input_context = 'The dog'
             input_ids = tokenizer.encode(input_context, return_tensors='pt')  # encode input context
-            outputs = model.generate(input_ids=input_ids, max_length=40, temperature=0.7, num_return_sequences=3)  # 3 generate sequences using by sampling
+            outputs = model.generate(input_ids=input_ids, max_length=40, temperature=0.7, num_return_sequences=3, do_sample=True)  # generate 3 sequences by sampling
             for i in range(3): #  3 output sequences were generated
                 print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True)))
 
             tokenizer = AutoTokenizer.from_pretrained('ctrl')   # Initialize tokenizer
-            model = AutoModelWithLMHead.from_pretrained('ctrl')    # Download model and configuration from S3 and cache.
+            model = AutoModelForCausalLM.from_pretrained('ctrl')    # Download model and configuration from S3 and cache.
             input_context = 'Legal My neighbor is'  # "Legal" is one of the control codes for ctrl
             input_ids = tokenizer.encode(input_context, return_tensors='pt')  # encode input context
             outputs = model.generate(input_ids=input_ids, max_length=50, temperature=0.7, repetition_penalty=1.2)  # generate sequences
             print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True)))
 
             tokenizer = AutoTokenizer.from_pretrained('gpt2')   # Initialize tokenizer
-            model = AutoModelWithLMHead.from_pretrained('gpt2')    # Download model and configuration from S3 and cache.
+            model = AutoModelForCausalLM.from_pretrained('gpt2')    # Download model and configuration from S3 and cache.
             input_context = 'My cute dog'  # "Legal" is one of the control codes for ctrl
             bad_words_ids = [tokenizer.encode(bad_word, add_prefix_space=True) for bad_word in ['idiot', 'stupid', 'shut up']]
             input_ids = tokenizer.encode(input_context, return_tensors='pt')  # encode input context
diff --git a/src/transformers/modeling_xlm.py b/src/transformers/modeling_xlm.py
index 714bca06cb..03a1ebe237 100644
--- a/src/transformers/modeling_xlm.py
+++ b/src/transformers/modeling_xlm.py
@@ -28,7 +28,7 @@ from torch.nn import functional as F
 
 from .activations import gelu
 from .configuration_xlm import XLMConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_utils import (
     PreTrainedModel,
     SequenceSummary,
@@ -40,6 +40,8 @@ from .modeling_utils import (
 
 logger = logging.getLogger(__name__)
 
+_TOKENIZER_FOR_DOC = "XLMTokenizer"
+
 XLM_PRETRAINED_MODEL_ARCHIVE_LIST = [
     "xlm-mlm-en-2048",
     "xlm-mlm-ende-1024",
@@ -395,6 +397,7 @@ class XLMModel(XLMPreTrainedModel):
             self.attentions[layer].prune_heads(heads)
 
     @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
     def forward(
         self,
         input_ids=None,
@@ -425,18 +428,6 @@ class XLMModel(XLMPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        from transformers import XLMTokenizer, XLMModel
-        import torch
-
-        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
-        model = XLMModel.from_pretrained('xlm-mlm-en-2048')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
         """
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -632,6 +623,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
         return {"input_ids": input_ids, "langs": langs}
 
     @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
     def forward(
         self,
         input_ids=None,
@@ -672,18 +664,6 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        from transformers import XLMTokenizer, XLMWithLMHeadModel
-        import torch
-
-        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
-        model = XLMWithLMHeadModel.from_pretrained('xlm-mlm-en-2048')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
         """
         transformer_outputs = self.transformer(
             input_ids,
@@ -722,6 +702,7 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
         self.init_weights()
 
     @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
     def forward(
         self,
         input_ids=None,
@@ -761,19 +742,6 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        from transformers import XLMTokenizer, XLMForSequenceClassification
-        import torch
-
-        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
-        model = XLMForSequenceClassification.from_pretrained('xlm-mlm-en-2048')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, logits = outputs[:2]
-
         """
         transformer_outputs = self.transformer(
             input_ids,
@@ -822,6 +790,7 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
         self.init_weights()
 
     @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
     def forward(
         self,
         input_ids=None,
@@ -867,20 +836,6 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        from transformers import XLMTokenizer, XLMForQuestionAnsweringSimple
-        import torch
-
-        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
-        model = XLMForQuestionAnsweringSimple.from_pretrained('xlm-mlm-en-2048')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        start_positions = torch.tensor([1])
-        end_positions = torch.tensor([3])
-        outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
-        loss = outputs[0]
-
         """
         transformer_outputs = self.transformer(
             input_ids,
@@ -1006,19 +961,20 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
 
-    Examples::
+    Example::
 
-        from transformers import XLMTokenizer, XLMForQuestionAnswering
-        import torch
+        >>> from transformers import XLMTokenizer, XLMForQuestionAnswering
+        >>> import torch
 
-        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
-        model = XLMForQuestionAnswering.from_pretrained('xlm-mlm-en-2048')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        start_positions = torch.tensor([1])
-        end_positions = torch.tensor([3])
-        outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
-        loss = outputs[0]
+        >>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
+        >>> model = XLMForQuestionAnswering.from_pretrained('xlm-mlm-en-2048')
 
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
+        >>> start_positions = torch.tensor([1])
+        >>> end_positions = torch.tensor([3])
+
+        >>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
+        >>> loss = outputs[0]
         """
         transformer_outputs = self.transformer(
             input_ids,
@@ -1067,6 +1023,7 @@ class XLMForTokenClassification(XLMPreTrainedModel):
         self.init_weights()
 
     @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048")
     def forward(
         self,
         input_ids=None,
@@ -1074,6 +1031,8 @@ class XLMForTokenClassification(XLMPreTrainedModel):
         langs=None,
         token_type_ids=None,
         position_ids=None,
+        lengths=None,
+        cache=None,
         head_mask=None,
         labels=None,
         output_attentions=None,
@@ -1101,19 +1060,6 @@ class XLMForTokenClassification(XLMPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        from transformers import XLMTokenizer, XLMForTokenClassification
-        import torch
-
-        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-100-1280')
-        model = XLMForTokenClassification.from_pretrained('xlm-mlm-100-1280')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, scores = outputs[:2]
-
         """
         outputs = self.transformer(
             input_ids,
@@ -1121,6 +1067,8 @@ class XLMForTokenClassification(XLMPreTrainedModel):
             langs=langs,
             token_type_ids=token_type_ids,
             position_ids=position_ids,
+            lengths=lengths,
+            cache=cache,
             head_mask=head_mask,
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
diff --git a/src/transformers/modeling_xlnet.py b/src/transformers/modeling_xlnet.py
index a70916b341..f6dcd679eb 100644
--- a/src/transformers/modeling_xlnet.py
+++ b/src/transformers/modeling_xlnet.py
@@ -26,12 +26,14 @@ from torch.nn import functional as F
 
 from .activations import gelu_new, swish
 from .configuration_xlnet import XLNetConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_utils import PoolerAnswerClass, PoolerEndLogits, PoolerStartLogits, PreTrainedModel, SequenceSummary
 
 
 logger = logging.getLogger(__name__)
 
+_TOKENIZER_FOR_DOC = "XLNetTokenizer"
+
 XLNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
     "xlnet-base-cased",
     "xlnet-large-cased",
@@ -749,6 +751,7 @@ class XLNetModel(XLNetPreTrainedModel):
         return pos_emb
 
     @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
     def forward(
         self,
         input_ids=None,
@@ -785,20 +788,6 @@ class XLNetModel(XLNetPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        from transformers import XLNetTokenizer, XLNetModel
-        import torch
-
-        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
-        model = XLNetModel.from_pretrained('xlnet-large-cased')
-
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=False)).unsqueeze(0)  # Batch size 1
-
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
         """
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -1164,6 +1153,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
         self.init_weights()
 
     @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
     def forward(
         self,
         input_ids=None,
@@ -1208,20 +1198,6 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        from transformers import XLNetTokenizer, XLNetForSequenceClassification
-        import torch
-
-        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
-        model = XLNetForSequenceClassification.from_pretrained('xlnet-large-cased')
-
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, logits = outputs[:2]
-
         """
         transformer_outputs = self.transformer(
             input_ids,
@@ -1273,6 +1249,7 @@ class XLNetForTokenClassification(XLNetPreTrainedModel):
         self.init_weights()
 
     @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
     def forward(
         self,
         input_ids=None,
@@ -1316,21 +1293,6 @@ class XLNetForTokenClassification(XLNetPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        from transformers import XLNetTokenizer, XLNetForTokenClassification
-        import torch
-
-        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
-        model = XLNetForTokenClassification.from_pretrained('xlnet-large-cased')
-
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
-
-        scores = outputs[0]
-
         """
 
         outputs = self.transformer(
@@ -1386,6 +1348,7 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
         self.init_weights()
 
     @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
     def forward(
         self,
         input_ids=None,
@@ -1431,22 +1394,6 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        from transformers import XLNetTokenizer, XLNetForMultipleChoice
-        import torch
-
-        tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
-        model = XLNetForMultipleChoice.from_pretrained('xlnet-base-cased')
-
-        choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
-        input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
-        labels = torch.tensor(1).unsqueeze(0)  # Batch size 1
-
-        outputs = model(input_ids, labels=labels)
-        loss, classification_scores = outputs[:2]
-
         """
         num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
 
@@ -1508,6 +1455,7 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
         self.init_weights()
 
     @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlnet-base-cased")
     def forward(
         self,
         input_ids=None,
@@ -1558,22 +1506,6 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
 
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
-
-    Examples::
-
-        from transformers import XLNetTokenizer, XLNetForQuestionAnsweringSimple
-        import torch
-
-        tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
-        model = XLNetForQuestionAnsweringSimple.from_pretrained('xlnet-base-cased')
-
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        start_positions = torch.tensor([1])
-        end_positions = torch.tensor([3])
-
-        outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
-        loss = outputs[0]
-
         """
 
         outputs = self.transformer(
@@ -1705,20 +1637,20 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
             heads.
 
-    Examples::
+        Example::
 
-        from transformers import XLNetTokenizer, XLNetForQuestionAnswering
-        import torch
+            >>> from transformers import XLNetTokenizer, XLNetForQuestionAnswering
+            >>> import torch
 
-        tokenizer =  XLNetTokenizer.from_pretrained('xlnet-base-cased')
-        model = XLNetForQuestionAnswering.from_pretrained('xlnet-base-cased')
+            >>> tokenizer =  XLNetTokenizer.from_pretrained('xlnet-base-cased')
+            >>> model = XLNetForQuestionAnswering.from_pretrained('xlnet-base-cased')
 
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
-        start_positions = torch.tensor([1])
-        end_positions = torch.tensor([3])
-        outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
-        loss = outputs[0]
+            >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
+            >>> start_positions = torch.tensor([1])
+            >>> end_positions = torch.tensor([3])
+            >>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
 
+            >>> loss = outputs[0]
         """
         transformer_outputs = self.transformer(
             input_ids,
diff --git a/src/transformers/tokenization_bart.py b/src/transformers/tokenization_bart.py
index 19538cef37..78d5a1474f 100644
--- a/src/transformers/tokenization_bart.py
+++ b/src/transformers/tokenization_bart.py
@@ -66,13 +66,15 @@ class MBartTokenizer(XLMRobertaTokenizer):
     The tokenization method is <tokens> <eos> <language code>. There is no BOS token.
 
     Examples::
-        from transformers import MBartTokenizer
-        tokenizer = MBartTokenizer.from_pretrained('mbart-large-en-ro')
-        example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
-        expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
-        batch: dict = tokenizer.prepare_translation_batch(
-            example_english_phrase, src_lang="en_XX", tgt_lang="ro_RO", tgt_texts=expected_translation_romanian
-        )
+
+        >>> from transformers import MBartTokenizer
+        >>> tokenizer = MBartTokenizer.from_pretrained('facebook/mbart-large-en-ro')
+        >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
+        >>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
+        >>> batch: dict = tokenizer.prepare_translation_batch(
+        ...     example_english_phrase, src_lang="en_XX", tgt_lang="ro_RO", tgt_texts=expected_translation_romanian
+        ... )
+
     """
 
     vocab_files_names = {"vocab_file": "sentencepiece.bpe.model"}
diff --git a/src/transformers/tokenization_marian.py b/src/transformers/tokenization_marian.py
index 4d307cf978..fb0d327a20 100644
--- a/src/transformers/tokenization_marian.py
+++ b/src/transformers/tokenization_marian.py
@@ -25,13 +25,13 @@ class MarianTokenizer(PreTrainedTokenizer):
 
     Examples::
 
-        from transformers import MarianTokenizer
-        tok = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-de')
-        src_texts = [ "I am a small frog.", "Tom asked his teacher for advice."]
-        tgt_texts = ["Ich bin ein kleiner Frosch.", "Tom bat seinen Lehrer um Rat."]  # optional
-        batch_enc: BatchEncoding = tok.prepare_translation_batch(src_texts, tgt_texts=tgt_texts)
-        # keys  [input_ids, attention_mask, decoder_input_ids,  decoder_attention_mask].
-        # model(**batch) should work
+        >>> from transformers import MarianTokenizer
+        >>> tok = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-de')
+        >>> src_texts = [ "I am a small frog.", "Tom asked his teacher for advice."]
+        >>> tgt_texts = ["Ich bin ein kleiner Frosch.", "Tom bat seinen Lehrer um Rat."]  # optional
+        >>> batch_enc: BatchEncoding = tok.prepare_translation_batch(src_texts, tgt_texts=tgt_texts)
+        >>> # keys  [input_ids, attention_mask, decoder_input_ids,  decoder_attention_mask].
+        >>> # model(**batch) should work
     """
 
     vocab_files_names = vocab_files_names
diff --git a/src/transformers/tokenization_reformer.py b/src/transformers/tokenization_reformer.py
index 4accdcc3cf..6d9a57cc09 100644
--- a/src/transformers/tokenization_reformer.py
+++ b/src/transformers/tokenization_reformer.py
@@ -81,6 +81,7 @@ class ReformerTokenizer(PreTrainedTokenizer):
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    model_input_names = ["attention_mask"]
 
     def __init__(
         self,
diff --git a/src/transformers/tokenization_t5.py b/src/transformers/tokenization_t5.py
index df25eab1dd..8007d39c6b 100644
--- a/src/transformers/tokenization_t5.py
+++ b/src/transformers/tokenization_t5.py
@@ -94,6 +94,7 @@ class T5Tokenizer(PreTrainedTokenizer):
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    model_input_names = ["attention_mask"]
 
     def __init__(
         self,
diff --git a/tests/test_doc_samples.py b/tests/test_doc_samples.py
index 9861b2b550..a88c2ca5fa 100644
--- a/tests/test_doc_samples.py
+++ b/tests/test_doc_samples.py
@@ -13,52 +13,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import doctest
+import logging
 import os
 import unittest
+from pathlib import Path
 from typing import List, Union
 
+import transformers
+
 from .utils import require_tf, require_torch, slow
 
 
-def get_examples_from_file(file):
-    examples = []
-    example = []
-    example_mode = False
-    example_indentation = None
-    for i, line in enumerate(file):
-        if example_mode:
-            current_indentation = len(line) - len(line.strip()) - 1
-
-            # Check if the indentation is 0 for the example, so that we don't exit as soon as there's a line return.
-            empty_line = example_indentation == 0 and len(line) == 1
-
-            # If we're back to the example indentation or if it's the end of the docstring.
-            if (current_indentation == example_indentation and not empty_line) or '"""' in line:
-                # Exit the example mode and add the example to the examples list
-                example_mode = False
-                example_indentation = None
-                examples.append(example)
-                example = []
-            else:
-                # If line is not empty, add it to the current example
-                if line != "\n":
-                    example.append(line[example_indentation + 4 : -1])
-
-        # Detect the example from '::' or 'example::'
-        if "example::" in line.lower():
-            example_mode = True
-            example_indentation = line.lower().find("example::")
-        elif "examples::" in line.lower():
-            example_mode = True
-            example_indentation = line.lower().find("examples::")
-        # elif "::" in line.lower() and len(line.strip()) == 2:
-        #     example_mode = True
-        #     example_indentation = line.lower().find("::")
-
-    examples = ["\n".join(example) for example in examples]
-    examples = [example for example in examples if "not runnable" not in example.lower()]
-
-    return examples
+logger = logging.getLogger(__name__)  # module-scoped logger instead of the root logger
 
 
 @require_torch
@@ -66,68 +33,81 @@ def get_examples_from_file(file):
 @slow
 class TestCodeExamples(unittest.TestCase):
     def analyze_directory(
-        self, directory: str, identifier: Union[str, None] = None, ignore_files: Union[List[str], None] = None
+        self,
+        directory: Union[str, Path],
+        identifier: Union[str, None] = None,
+        ignore_files: Union[List[str], None] = None,
+        n_identifier: Union[str, List[str], None] = None,
+        only_modules: bool = True,
     ):
+        """
+        Runs the doctests of the files in `directory`, after filtering them with `identifier`, `n_identifier`
+        and `ignore_files`. `__init__.py` is always skipped.
+
+        Args:
+            directory (:obj:`str` or :obj:`Path`): Directory containing the files
+            identifier (:obj:`str`): Only keep the files whose name contains this
+            ignore_files (:obj:`List[str]`): File names to skip; the list is left untouched
+            n_identifier (:obj:`str` or :obj:`List[str]`): Skip the files whose name contains this/any of these
+            only_modules (:obj:`bool`): If True, test each file's `transformers` module; else the raw file
+        """
         files = [file for file in os.listdir(directory) if os.path.isfile(os.path.join(directory, file))]
 
         if identifier is not None:
             files = [file for file in files if identifier in file]
 
-        if ignore_files is not None:
-            files = [file for file in files if file not in ignore_files]
+        if n_identifier is not None:
+            # A single substring and a list of substrings are handled the same way.
+            excluded = [n_identifier] if isinstance(n_identifier, str) else n_identifier
+            files = [file for file in files if not any(n_ in file for n_ in excluded)]
+
+        # Build a new collection so neither the caller's list nor a shared default is mutated.
+        skipped = set(ignore_files or ()) | {"__init__.py"}
+        files = [file for file in files if file not in skipped]
 
         for file in files:
             # Open all files
-            print("Testing", file, end=" ")
-            with open(os.path.join(directory, file)) as f:
-                # Retrieve examples
-                examples = get_examples_from_file(f)
-                joined_examples = []
+            print("Testing", file)
 
-                def execute_example(code_example):
-                    exec(code_example, {})
-
-                # Some examples are the continuation of others.
-                if len(examples) > 0:
-                    joined_examples.append(examples[0])
-                    joined_examples_index = 0
-                    for example in examples[1:]:
-                        # If they contain this line, then they're a continuation of the previous script
-                        if "# Continuation of the previous script" in example:
-                            joined_examples[joined_examples_index] += "\n" + example
-                        # If not, create a new example and increment the index
-                        else:
-                            joined_examples.append(example)
-                            joined_examples_index += 1
-
-                print(str(len(joined_examples)) + "/" + str(len(joined_examples)))
-
-                # Execute sub tests with every example.
-                for index, code_example in enumerate(joined_examples):
-                    with self.subTest(msg=file + " " + str(index) + "/" + str(len(joined_examples)) + code_example):
-                        execute_example(code_example)
-
-    def test_configuration_examples(self):
-        transformers_directory = "src/transformers"
-        configuration_files = "configuration"
-        ignore_files = ["configuration_auto.py", "configuration_utils.py"]
-        self.analyze_directory(transformers_directory, identifier=configuration_files, ignore_files=ignore_files)
-
-    def test_main_doc_examples(self):
-        doc_directory = "docs/source"
-        ignore_files = ["favicon.ico"]
-        self.analyze_directory(doc_directory, ignore_files=ignore_files)
+            if only_modules:
+                module_name = file.split(".")[0]
+                try:
+                    module = getattr(transformers, module_name)
+                except AttributeError:
+                    logger.info(f"{module_name} is not a module.")
+                    continue
+                suite = doctest.DocTestSuite(module)
+                result = unittest.TextTestRunner().run(suite)
+                self.assertEqual(len(result.failures), 0)
+            else:
+                result = doctest.testfile(str(Path("..") / directory / file), optionflags=doctest.ELLIPSIS)
+                self.assertEqual(result.failed, 0)
 
     def test_modeling_examples(self):
-        transformers_directory = "src/transformers"
-        modeling_files = "modeling"
+        transformers_directory = Path("src/transformers")
+        files = "modeling"
         ignore_files = [
-            "modeling_auto.py",
-            "modeling_t5.py",
-            "modeling_tf_auto.py",
-            "modeling_utils.py",
-            "modeling_tf_t5.py",
-            "modeling_bart.py",
-            "modeling_tf_utils.py",
+            "modeling_ctrl.py",
+            "modeling_tf_ctrl.py",
         ]
-        self.analyze_directory(transformers_directory, identifier=modeling_files, ignore_files=ignore_files)
+        self.analyze_directory(transformers_directory, identifier=files, ignore_files=ignore_files)
+
+    def test_tokenization_examples(self):
+        """Runs the doctests of the library files whose name contains `tokenization`."""
+        source_dir = Path("src/transformers")
+        self.analyze_directory(source_dir, identifier="tokenization")
+
+    def test_configuration_examples(self):
+        """Runs the doctests of the library files whose name contains `configuration`."""
+        source_dir = Path("src/transformers")
+        self.analyze_directory(source_dir, identifier="configuration")
+
+    def test_remaining_examples(self):
+        """Runs the doctests of the library files that match none of the other example tests' identifiers."""
+        covered_elsewhere = ["configuration", "modeling", "tokenization"]
+        self.analyze_directory(Path("src/transformers"), n_identifier=covered_elsewhere)
+
+    def test_doc_sources(self):
+        """Runs the files of `docs/source` as doctest text files instead of importing them as modules."""
+        docs_dir = Path("docs/source")
+        self.analyze_directory(docs_dir, ignore_files=["favicon.ico"], only_modules=False)
diff --git a/tests/test_modeling_tf_xlm.py b/tests/test_modeling_tf_xlm.py
index c608219aed..acaadaf344 100644
--- a/tests/test_modeling_tf_xlm.py
+++ b/tests/test_modeling_tf_xlm.py
@@ -31,6 +31,7 @@ if is_tf_available():
         TFXLMWithLMHeadModel,
         TFXLMForSequenceClassification,
         TFXLMForQuestionAnsweringSimple,
+        TFXLMForTokenClassification,
         TF_XLM_PRETRAINED_MODEL_ARCHIVE_LIST,
     )
 
@@ -219,6 +220,26 @@ class TFXLMModelTester:
 
         self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.type_sequence_label_size])
 
+    def create_and_check_xlm_for_token_classification(
+        self,
+        config,
+        input_ids,
+        token_type_ids,
+        input_lengths,
+        sequence_labels,
+        token_labels,
+        is_impossible_labels,
+        input_mask,
+    ):
+        """Checks that the token classification model outputs logits of shape (batch, sequence, labels)."""
+        config.num_labels = self.num_labels
+        model = TFXLMForTokenClassification(config=config)
+        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
+        # The output is unpacked as a 1-tuple: the logits are the only element expected here.
+        (logits,) = model(inputs)
+        logits_shape = list(logits.numpy().shape)
+        self.parent.assertListEqual(logits_shape, [self.batch_size, self.seq_length, self.num_labels])
+
     def prepare_config_and_inputs_for_common(self):
         config_and_inputs = self.prepare_config_and_inputs()
         (
@@ -244,7 +265,14 @@ class TFXLMModelTester:
 class TFXLMModelTest(TFModelTesterMixin, unittest.TestCase):
 
     all_model_classes = (
-        (TFXLMModel, TFXLMWithLMHeadModel, TFXLMForSequenceClassification, TFXLMForQuestionAnsweringSimple)
+        # TODO The multiple choice model is missing and should be added.
+        (
+            TFXLMModel,
+            TFXLMWithLMHeadModel,
+            TFXLMForSequenceClassification,
+            TFXLMForQuestionAnsweringSimple,
+            TFXLMForTokenClassification,
+        )
         if is_tf_available()
         else ()
     )
@@ -275,6 +303,10 @@ class TFXLMModelTest(TFModelTesterMixin, unittest.TestCase):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_xlm_sequence_classif(*config_and_inputs)
 
+    def test_for_token_classification(self):
+        tester = self.model_tester  # the model tester both prepares the inputs and runs the check
+        tester.create_and_check_xlm_for_token_classification(*tester.prepare_config_and_inputs())
+
     @slow
     def test_model_from_pretrained(self):
         for model_name in TF_XLM_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: