Release: v3.0.0

[seq2seq docs] Move evaluation down, fix typo (#5365 )
[Docs] Benchmark docs (#5360 )
2020-06-29 10:40:13 -04:00 · 2020-06-29 10:36:04 -04:00 · 2020-06-29 16:08:57 +02:00 · 2020-06-29 09:51:13 -04:00 · 2020-06-29 09:05:08 -04:00 · 2020-06-29 09:02:33 -04:00
683 changed files with 76148 additions and 22173 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -12,9 +12,11 @@ jobs:
            - checkout
            - run: sudo pip install .[sklearn,tf-cpu,torch,testing]
            - run: sudo pip install codecov pytest-cov
-            - run: python -m pytest -n 8 --dist=loadfile -s -v ./tests/ --cov
+            - run: python -m pytest -n 8 --dist=loadfile -s ./tests/ --cov  | tee output.txt
            - run: codecov
-
+            - store_artifacts:
+                  path: ~/transformers/output.txt
+                  destination: test_output.txt
    run_tests_torch:
        working_directory: ~/transformers
        docker:
@@ -26,9 +28,11 @@ jobs:
        steps:
            - checkout
            - run: sudo pip install .[sklearn,torch,testing]
-            - run: sudo pip install codecov pytest-cov
-            - run: python -m pytest -n 8 --dist=loadfile -s -v ./tests/ --cov
-            - run: codecov
+            - run: python -m pytest -n 8 --dist=loadfile -s ./tests/ | tee output.txt
+            - store_artifacts:
+                  path: ~/transformers/output.txt
+                  destination: test_output.txt
+
    run_tests_tf:
        working_directory: ~/transformers
        docker:
@@ -40,9 +44,10 @@ jobs:
        steps:
            - checkout
            - run: sudo pip install .[sklearn,tf-cpu,testing]
-            - run: sudo pip install codecov pytest-cov
-            - run: python -m pytest -n 8 --dist=loadfile -s -v ./tests/ --cov
-            - run: codecov
+            - run: python -m pytest -n 8 --dist=loadfile -s ./tests/ | tee output.txt
+            - store_artifacts:
+               path: ~/transformers/output.txt
+               destination: test_output.txt
    run_tests_custom_tokenizers:
        working_directory: ~/transformers
        docker:
@@ -65,7 +70,20 @@ jobs:
            - checkout
            - run: sudo pip install .[sklearn,torch,testing]
            - run: sudo pip install -r examples/requirements.txt
-            - run: python -m pytest -n 8 --dist=loadfile -s -v ./examples/
+            - run: python -m pytest -n 8 --dist=loadfile -s ./examples/ | tee output.txt
+            - store_artifacts:
+                  path: ~/transformers/output.txt
+                  destination: test_output.txt
+    build_doc:
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.6
+        steps:
+            - checkout
+            - run: sudo pip install .[tf,torch,docs]
+            - run: cd docs && make html SPHINXOPTS="-W"
+            - store_artifacts:
+                path: ./docs/_build
    deploy_doc:
        working_directory: ~/transformers
        docker:
@@ -117,4 +135,5 @@ workflows:
            - run_tests_torch_and_tf
            - run_tests_torch
            - run_tests_tf
+            - build_doc
            - deploy_doc: *workflow_filters
--- a/.circleci/deploy.sh
+++ b/.circleci/deploy.sh
@@ -5,19 +5,31 @@ function deploy_doc(){
 	git checkout $1
 	if [ ! -z "$2" ]
 	then
-		if [ -d "$dir/$2" ]; then
+		if [ "$2" == "master" ]; then
+		    echo "Pushing master"
+			make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html/* $doc:$dir/$2/
+			cp -r _build/html/_static .
+		elif ssh -oStrictHostKeyChecking=no $doc "[ -d $dir/$2 ]"; then
 			echo "Directory" $2 "already exists"
+			scp -r -oStrictHostKeyChecking=no _static/* $doc:$dir/$2/_static/
 		else
 			echo "Pushing version" $2
-			make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html $doc:$dir/$2
+			make clean && make html
+			rm -rf _build/html/_static
+			cp -r _static _build/html
+			scp -r -oStrictHostKeyChecking=no _build/html $doc:$dir/$2
 		fi
 	else
-		echo "Pushing master"
-		make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html/* $doc:$dir
+		echo "Pushing stable"
+		make clean && make html
+		rm -rf _build/html/_static
+		cp -r _static _build/html
+		scp -r -oStrictHostKeyChecking=no _build/html/* $doc:$dir
 	fi
 }

-deploy_doc "master"
+# You can find the commit for each tag on https://github.com/huggingface/transformers/tags
+deploy_doc "master" master
 deploy_doc "b33a385" v1.0.0
 deploy_doc "fe02e45" v1.1.0
 deploy_doc "89fd345" v1.2.0
@@ -27,3 +39,11 @@ deploy_doc "3616209" v2.2.0
 deploy_doc "d0f8b9a" v2.3.0
 deploy_doc "6664ea9" v2.4.0
 deploy_doc "fb560dc" v2.5.0
+deploy_doc "b90745c" v2.5.1
+deploy_doc "fbc5bf1" v2.6.0
+deploy_doc "6f5a12a" v2.7.0
+deploy_doc "11c3257" v2.8.0
+deploy_doc "e7cfc1a" v2.9.0
+deploy_doc "7cb203f" v2.9.1
+deploy_doc "10d7239" v2.10.0 
+deploy_doc "b42586e" #v2.11.0 Latest stable release
--- a/.github/ISSUE_TEMPLATE/--new-model-addition.md
+++ b/.github/ISSUE_TEMPLATE/--new-model-addition.md
@@ -2,7 +2,7 @@
 name: "\U0001F31F New model addition"
 about: Submit a proposal/request to implement a new Transformer-based model
 title: ''
-labels: ''
+labels: New model
 assignees: ''

 ---
--- a/.github/ISSUE_TEMPLATE/bug-report.md
+++ b/.github/ISSUE_TEMPLATE/bug-report.md
@@ -40,7 +40,7 @@ Steps to reproduce the behavior:
 <!-- A clear and concise description of what you would expect to happen. -->

 ## Environment info
-<!-- You can run the command `python transformers-cli env` and copy-and-paste its output below.
+<!-- You can run the command `transformers-cli env` and copy-and-paste its output below.
     Don't forget to fill out the missing fields in that output! -->
     
 - `transformers` version:
--- a/.github/ISSUE_TEMPLATE/migration.md
+++ b/.github/ISSUE_TEMPLATE/migration.md
@@ -1,8 +1,9 @@
 ---
 name: "\U0001F4DA Migration from pytorch-pretrained-bert or pytorch-transformers"
-about: Report a problem when migrating from pytorch-pretrained-bert or pytorch-transformers to transformers
+about: Report a problem when migrating from pytorch-pretrained-bert or pytorch-transformers
+  to transformers
 title: ''
-labels: ''
+labels: Migration
 assignees: ''

 ---
--- a/.github/ISSUE_TEMPLATE/question-help.md
+++ b/.github/ISSUE_TEMPLATE/question-help.md
@@ -26,4 +26,4 @@ assignees: ''

 <!-- You should first ask your question on SO, and only if
     you didn't get an answer ask it here on GitHub. -->
-**A link to original question on Stack Overflow**: 
+**A link to original question on Stack Overflow**:
--- a/.github/workflows/github-push.yml
+++ b/.github/workflows/github-push.yml
@@ -11,9 +11,9 @@ jobs:
      uses: actions/setup-python@v1
      with:
        python-version: 3.7
-    - name: Install dependencies
-      run: |
-        pip install .[tf,torch,quality]
+    # - name: Install dependencies
+    #   run: |
+    #     pip install .[tf,torch,quality]



--- a/.github/workflows/github-torch-hub.yml
+++ b/.github/workflows/github-torch-hub.yml
@@ -0,0 +1,32 @@
+name: Torch hub integration
+
+on: 
+  push:
+    branches:
+      - "*"
+
+jobs:
+  torch_hub_integration:
+    runs-on: ubuntu-latest
+    steps:
+    # no checkout necessary here.
+    - name: Extract branch name
+      run: echo "::set-env name=BRANCH::${GITHUB_REF#refs/heads/}"
+    - name: Check branch name
+      run: echo $BRANCH
+    - name: Set up Python
+      uses: actions/setup-python@v1
+      with:
+        python-version: 3.7
+    - name: Install dependencies
+      run: |
+        pip install torch
+        pip install numpy tokenizers filelock requests tqdm regex sentencepiece sacremoses packaging
+
+    - name: Torch hub list
+      run: |
+        python -c "import torch; print(torch.hub.list('huggingface/transformers:$BRANCH'))"
+
+    - name: Torch hub help
+      run: |
+        python -c "import torch; print(torch.hub.help('huggingface/transformers:$BRANCH', 'modelForSequenceClassification'))"
--- a/.github/workflows/self-push.yml
+++ b/.github/workflows/self-push.yml
@@ -1,9 +1,13 @@
 name: Self-hosted runner (push)

 on: 
-  # push:
-  #   branches:
-  #     - master
+  push:
+    branches:
+      - master
+    paths: 
+      - "src/**"
+      - "tests/**"
+      - ".github/**"
  # pull_request:
  repository_dispatch:

@@ -31,8 +35,8 @@ jobs:
    - name: Install dependencies
      run: |
        source .env/bin/activate
-        pip install .[sklearn,tf,torch,testing]
-        pip uninstall -y tensorflow
+        pip install torch
+        pip install .[sklearn,testing]

    - name: Are GPUs recognized by our DL frameworks
      run: |
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@@ -31,13 +31,12 @@ jobs:
    - name: Install dependencies
      run: |
        source .env/bin/activate
-        pip install .[sklearn,tf,torch,testing]
+        pip install .[sklearn,torch,testing]

    - name: Are GPUs recognized by our DL frameworks
      run: |
        source .env/bin/activate
        python -c "import torch; print(torch.cuda.is_available())"
-        python -c "import tensorflow as tf; print(tf.test.is_built_with_cuda(), tf.config.list_physical_devices('GPU'))"

    - name: Run all tests on GPU
      env:
--- a/.gitignore
+++ b/.gitignore
@@ -8,6 +8,10 @@ __pycache__/
 # C extensions
 *.so

+# tests and logs
+tests/fixtures
+logs/
+
 # Distribution / packaging
 .Python
 build/
@@ -116,6 +120,7 @@ dmypy.json
 .pyre/

 # vscode
+.vs
 .vscode

 # Pycharm
@@ -130,7 +135,10 @@ proc_data

 # examples
 runs
-examples/runs
+/runs_old
+/wandb
+/examples/runs
+/examples/**/*.args

 # data
 /data
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -44,9 +44,16 @@ Did not find it? :( So we can act quickly on it, please follow these steps:
 To get the OS and software versions automatically, you can run the following command:

 ```bash
-python transformers-cli env
+transformers-cli env
 ```

+or from the root of the repository the following command:
+
+```bash
+python src/transformers/commands/transformers_cli.py env
+```
+
+
 ### Do you want to implement a new model?

 Awesome! Please provide the following information:
@@ -58,7 +65,8 @@ Awesome! Please provide the following information:
 If you are willing to contribute the model yourself, let us know so we can best
 guide you.

-We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder.
+We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them 
+in the [`templates`](https://github.com/huggingface/transformers/tree/master/templates) folder.

 ### Do you want a new feature (that is not a model)?

@@ -79,7 +87,9 @@ A world-class feature request addresses the following points:
 If your issue is well written we're already 80% of the way there by the time you
 post it.

-We have added **templates** to guide you in the process of adding a new example script for training or testing the models in the library. You can find them in the [`templates`](./templates) folder.
+We have added **templates** to guide you in the process of adding a new example script for training or testing the 
+models in the library. You can find them in the [`templates`](https://github.com/huggingface/transformers/tree/master/templates) 
+folder.

 ## Start contributing! (Pull Requests)

@@ -130,7 +140,6 @@ Follow these steps to start contributing:
   ```bash
   $ pip install -U git+git://github.com/timothycrosley/isort.git@e63ae06ec7d70b06df9e528357650281a3d3ec22#egg=isort
   ```
-
 5. Develop the features on your branch.

   As you work on the features, you should make sure that the test suite
@@ -199,15 +208,22 @@ Follow these steps to start contributing:
   are useful to avoid duplicated work, and to differentiate it from PRs ready
   to be merged;
 4. Make sure existing tests pass;
-5. Add high-coverage tests. No quality test, no merge. 
- - If you are adding a new model, make sure that you use `ModelTester.all_model_classes = (MyModel, MyModelWithLMHead,...)`, which triggers the common tests.
- - If you are adding new `@slow` tests, make sure they pass using `RUN_SLOW=1 python -m pytest tests/test_my_new_model.py`. 
-CircleCI does not run them. 
-6. All public methods must have informative docstrings;
+5. Add high-coverage tests. No quality testing = no merge. 
+   - If you are adding a new model, make sure that you use 
+     `ModelTester.all_model_classes = (MyModel, MyModelWithLMHead,...)`, which triggers the common tests.
+   - If you are adding new `@slow` tests, make sure they pass using 
+     `RUN_SLOW=1 python -m pytest tests/test_my_new_model.py`. 
+   - If you are adding a new tokenizer, write tests, and make sure 
+     `RUN_SLOW=1 python -m pytest tests/test_tokenization_{your_model_name}.py` passes.
+   CircleCI does not run the slow tests. 
+6. All public methods must have informative docstrings that work nicely with sphinx. See `modeling_ctrl.py` for an 
+   example.

 ### Tests

-You can run 🤗 Transformers tests with `unittest` or `pytest`.
+An extensive test suite is included to test the library behavior and several examples. Library tests can be found in 
+the [tests folder](https://github.com/huggingface/transformers/tree/master/tests) and examples tests in the 
+[examples folder](https://github.com/huggingface/transformers/tree/master/examples).

 We like `pytest` and `pytest-xdist` because it's faster. From the root of the
 repository, here's how to run tests with `pytest` for the library:
@@ -254,7 +270,8 @@ $ python -m unittest discover -s examples -t examples -v

 ### Style guide

-For documentation strings, `transformers` follows the [google
-style](https://google.github.io/styleguide/pyguide.html).
+For documentation strings, `transformers` follows the [google style](https://google.github.io/styleguide/pyguide.html).
+Check our [documentation writing guide](https://github.com/huggingface/transformers/tree/master/docs#writing-documentation---specification)
+for more information.

 #### This guide was heavily inspired by the awesome [scikit-learn guide to contributing](https://github.com/scikit-learn/scikit-learn/blob/master/CONTRIBUTING.md)
--- a/README.md
+++ b/README.md
@@ -19,17 +19,15 @@
 </p>

 <h3 align="center">
-<p>State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch
+<p>State-of-the-art Natural Language Processing for PyTorch and TensorFlow 2.0
 </h3>

-🤗 Transformers (formerly known as `pytorch-transformers` and `pytorch-pretrained-bert`) provides state-of-the-art general-purpose architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet, CTRL...) for Natural Language Understanding (NLU) and Natural Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between TensorFlow 2.0 and PyTorch.
+🤗 Transformers (formerly known as `pytorch-transformers` and `pytorch-pretrained-bert`) provides state-of-the-art general-purpose architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet, T5, CTRL...) for Natural Language Understanding (NLU) and Natural Language Generation (NLG) with thousands of pretrained models in 100+ languages and deep interoperability between PyTorch & TensorFlow 2.0.

+### Recent contributors
 [![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/0)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/0)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/1)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/1)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/2)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/2)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/3)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/3)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/4)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/4)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/5)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/5)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/6)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/6)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/7)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/7)

 ### Features
-
- As easy to use as pytorch-transformers
- As powerful and concise as Keras
 - High performance on NLU and NLG tasks
 - Low barrier to entry for educators and practitioners

@@ -41,7 +39,7 @@ State-of-the-art NLP for everyone
 Lower compute costs, smaller carbon footprint
 - Researchers can share trained models instead of always retraining
 - Practitioners can reduce compute time and production costs
- 10 architectures with over 30 pretrained models, some in more than 100 languages
+- Dozens of architectures with over 1,000 pretrained models, some in more than 100 languages

 Choose the right framework for every part of a model's lifetime
 - Train state-of-the-art models in 3 lines of code
@@ -62,11 +60,11 @@ Choose the right framework for every part of a model's lifetime
 | [Quick tour: Share your models ](#Quick-tour-of-model-sharing) | Upload and share your fine-tuned models with the community |
 | [Migrating from pytorch-transformers to transformers](#Migrating-from-pytorch-transformers-to-transformers) | Migrating your code from pytorch-transformers to transformers |
 | [Migrating from pytorch-pretrained-bert to pytorch-transformers](#Migrating-from-pytorch-pretrained-bert-to-transformers) | Migrating your code from pytorch-pretrained-bert to transformers |
-| [Documentation][(v2.5.0)](https://huggingface.co/transformers/v2.5.0)[(v2.4.0/v2.4.1)](https://huggingface.co/transformers/v2.4.0)[(v2.3.0)](https://huggingface.co/transformers/v2.3.0)[(v2.2.0/v2.2.1/v2.2.2)](https://huggingface.co/transformers/v2.2.0) [(v2.1.1)](https://huggingface.co/transformers/v2.1.1) [(v2.0.0)](https://huggingface.co/transformers/v2.0.0) [(v1.2.0)](https://huggingface.co/transformers/v1.2.0) [(v1.1.0)](https://huggingface.co/transformers/v1.1.0) [(v1.0.0)](https://huggingface.co/transformers/v1.0.0) [(master)](https://huggingface.co/transformers) | Full API documentation and more |
+| [Documentation](https://huggingface.co/transformers/) | Full API documentation and more |

 ## Installation

-This repo is tested on Python 3.6+, PyTorch 1.0.0+ and TensorFlow 2.0.0-rc1
+This repo is tested on Python 3.6+, PyTorch 1.0.0+ (PyTorch 1.3.1+ for examples) and TensorFlow 2.0.

 You should install 🤗 Transformers in a [virtual environment](https://docs.python.org/3/library/venv.html). If you're unfamiliar with Python virtual environments, check out the [user guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/).

@@ -148,26 +146,31 @@ At some point in the future, you'll be able to seamlessly move from pre-training

 🤗 Transformers currently provides the following NLU/NLG architectures:

-1. **[BERT](https://github.com/google-research/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
-2. **[GPT](https://github.com/openai/finetune-transformer-lm)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
-3. **[GPT-2](https://blog.openai.com/better-language-models/)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
-4. **[Transformer-XL](https://github.com/kimiyoung/transformer-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
-5. **[XLNet](https://github.com/zihangdai/xlnet/)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
-6. **[XLM](https://github.com/facebookresearch/XLM/)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
-7. **[RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/roberta)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
-8. **[DistilBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/master/examples/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation) and a German version of DistilBERT.
-9. **[CTRL](https://github.com/salesforce/ctrl/)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
-10. **[CamemBERT](https://camembert-model.fr)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
-11. **[ALBERT](https://github.com/google-research/ALBERT)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
-12. **[T5](https://github.com/google-research/text-to-text-transfer-transformer)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
-13. **[XLM-RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/xlmr)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
+1. **[BERT](https://huggingface.co/transformers/model_doc/bert.html)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
+2. **[GPT](https://huggingface.co/transformers/model_doc/gpt.html)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
+3. **[GPT-2](https://huggingface.co/transformers/model_doc/gpt2.html)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
+4. **[Transformer-XL](https://huggingface.co/transformers/model_doc/transformerxl.html)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
+5. **[XLNet](https://huggingface.co/transformers/model_doc/xlnet.html)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
+6. **[XLM](https://huggingface.co/transformers/model_doc/xlm.html)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
+7. **[RoBERTa](https://huggingface.co/transformers/model_doc/roberta.html)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
+8. **[DistilBERT](https://huggingface.co/transformers/model_doc/distilbert.html)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/master/examples/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation) and a German version of DistilBERT.
+9. **[CTRL](https://huggingface.co/transformers/model_doc/ctrl.html)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
+10. **[CamemBERT](https://huggingface.co/transformers/model_doc/camembert.html)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
+11. **[ALBERT](https://huggingface.co/transformers/model_doc/albert.html)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
+12. **[T5](https://huggingface.co/transformers/model_doc/t5.html)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
+13. **[XLM-RoBERTa](https://huggingface.co/transformers/model_doc/xlmroberta.html)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
 14. **[MMBT](https://github.com/facebookresearch/mmbt/)** (from Facebook), released together with the paper a [Supervised Multimodal Bitransformers for Classifying Images and Text](https://arxiv.org/pdf/1909.02950.pdf) by Douwe Kiela, Suvrat Bhooshan, Hamed Firooz, Davide Testuggine.
-15. **[FlauBERT](https://github.com/getalp/Flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
-16. **[BART](https://github.com/pytorch/fairseq/tree/master/examples/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/pdf/1910.13461.pdf) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer.
-17. **[Other community models](https://huggingface.co/models)**, contributed by the [community](https://huggingface.co/users).
-18. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedbacks before starting your PR.
+15. **[FlauBERT](https://huggingface.co/transformers/model_doc/flaubert.html)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
+16. **[BART](https://huggingface.co/transformers/model_doc/bart.html)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/pdf/1910.13461.pdf) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer.
+17. **[ELECTRA](https://huggingface.co/transformers/model_doc/electra.html)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
+18. **[DialoGPT](https://huggingface.co/transformers/model_doc/dialogpt.html)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
+19. **[Reformer](https://huggingface.co/transformers/model_doc/reformer.html)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
+20. **[MarianMT](https://huggingface.co/transformers/model_doc/marian.html)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
+21. **[Longformer](https://huggingface.co/transformers/model_doc/longformer.html)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
+22. **[Other community models](https://huggingface.co/models)**, contributed by the [community](https://huggingface.co/users).
+23. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedback before starting your PR.

-These implementations have been tested on several datasets (see the example scripts) and should match the performances of the original implementations (e.g. ~93 F1 on SQuAD for BERT Whole-Word-Masking, ~88 F1 on RocStories for OpenAI GPT, ~18.3 perplexity on WikiText 103 for Transformer-XL, ~0.916 Peason R coefficient on STS-B for XLNet). You can find more details on the performances in the Examples section of the [documentation](https://huggingface.co/transformers/examples.html).
+These implementations have been tested on several datasets (see the example scripts) and should match the performances of the original implementations (e.g. ~93 F1 on SQuAD for BERT Whole-Word-Masking, ~88 F1 on RocStories for OpenAI GPT, ~18.3 perplexity on WikiText 103 for Transformer-XL, ~0.916 Pearson R coefficient on STS-B for XLNet). You can find more details on the performances in the Examples section of the [documentation](https://huggingface.co/transformers/examples.html).

 ## Online demo

@@ -285,8 +288,8 @@ pytorch_model = BertForSequenceClassification.from_pretrained('./save/', from_tf
 sentence_0 = "This research was consistent with his findings."
 sentence_1 = "His findings were compatible with this research."
 sentence_2 = "His findings were not compatible with this research."
-inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors='pt')
-inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors='pt')
+inputs_1 = tokenizer(sentence_0, sentence_1, add_special_tokens=True, return_tensors='pt')
+inputs_2 = tokenizer(sentence_0, sentence_2, add_special_tokens=True, return_tensors='pt')

 pred_1 = pytorch_model(inputs_1['input_ids'], token_type_ids=inputs_1['token_type_ids'])[0].argmax().item()
 pred_2 = pytorch_model(inputs_2['input_ids'], token_type_ids=inputs_2['token_type_ids'])[0].argmax().item()
@@ -304,8 +307,9 @@ setup your environment to run the examples.

 The library comprises several example scripts with SOTA performances for NLU and NLG tasks:

- `run_glue.py`: an example fine-tuning Bert, XLNet and XLM on nine different GLUE tasks (*sequence-level classification*)
- `run_squad.py`: an example fine-tuning Bert, XLNet and XLM on the question answering dataset SQuAD 2.0 (*token-level classification*)
+- `run_glue.py`: an example fine-tuning sequence classification models on nine different GLUE tasks (*sequence-level classification*)
+- `run_squad.py`: an example fine-tuning question answering models on the question answering dataset SQuAD 2.0 (*token-level classification*)
+- `run_ner.py`: an example fine-tuning token classification models on named entity recognition (*token-level classification*)
 - `run_generation.py`: an example using GPT, GPT-2, CTRL, Transformer-XL and XLNet for conditional language generation
 - other model-specific examples (see the documentation).

@@ -315,7 +319,7 @@ Here are three quick usage examples for these scripts:

 The [General Language Understanding Evaluation (GLUE) benchmark](https://gluebenchmark.com/) is a collection of nine sentence- or sentence-pair language understanding tasks for evaluating and analyzing natural language understanding systems.

-Before running anyone of these GLUE tasks you should download the
+Before running any of these GLUE tasks you should download the
 [GLUE data](https://gluebenchmark.com/tasks) by running
 [this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e)
 and unpack it to some directory `$GLUE_DIR`.
@@ -330,17 +334,15 @@ pip install -r ./examples/requirements.txt
 export GLUE_DIR=/path/to/glue
 export TASK_NAME=MRPC

-python ./examples/run_glue.py \
-    --model_type bert \
+python ./examples/text-classification/run_glue.py \
    --model_name_or_path bert-base-uncased \
    --task_name $TASK_NAME \
    --do_train \
    --do_eval \
-    --do_lower_case \
    --data_dir $GLUE_DIR/$TASK_NAME \
    --max_seq_length 128 \
-    --per_gpu_eval_batch_size=8   \
-    --per_gpu_train_batch_size=8   \
+    --per_device_eval_batch_size=8   \
+    --per_device_train_batch_size=8   \
    --learning_rate 2e-5 \
    --num_train_epochs 3.0 \
    --output_dir /tmp/$TASK_NAME/
@@ -358,8 +360,7 @@ Parallel training is a simple way to use several GPUs (but is slower and less fl
 ```shell
 export GLUE_DIR=/path/to/glue

-python ./examples/run_glue.py \
-    --model_type xlnet \
+python ./examples/text-classification/run_glue.py \
    --model_name_or_path xlnet-large-cased \
    --do_train  \
    --do_eval   \
@@ -367,8 +368,8 @@ python ./examples/run_glue.py \
    --data_dir=${GLUE_DIR}/STS-B  \
    --output_dir=./proc_data/sts-b-110   \
    --max_seq_length=128   \
-    --per_gpu_eval_batch_size=8   \
-    --per_gpu_train_batch_size=8   \
+    --per_device_eval_batch_size=8   \
+    --per_device_train_batch_size=8   \
    --gradient_accumulation_steps=1 \
    --max_steps=1200  \
    --model_name=xlnet-large-cased   \
@@ -384,17 +385,15 @@ On this machine we thus have a batch size of 32, please increase `gradient_accum
 This example code fine-tunes the Bert Whole Word Masking model on the Microsoft Research Paraphrase Corpus (MRPC) corpus using distributed training on 8 V100 GPUs to reach a F1 > 92.

 ```bash
-python -m torch.distributed.launch --nproc_per_node 8 ./examples/run_glue.py   \
-    --model_type bert \
+python -m torch.distributed.launch --nproc_per_node 8 ./examples/text-classification/run_glue.py   \
    --model_name_or_path bert-large-uncased-whole-word-masking \
    --task_name MRPC \
    --do_train   \
    --do_eval   \
-    --do_lower_case   \
    --data_dir $GLUE_DIR/MRPC/   \
    --max_seq_length 128   \
-    --per_gpu_eval_batch_size=8   \
-    --per_gpu_train_batch_size=8   \
+    --per_device_eval_batch_size=8   \
+    --per_device_train_batch_size=8   \
    --learning_rate 2e-5   \
    --num_train_epochs 3.0  \
    --output_dir /tmp/mrpc_output/ \
@@ -418,12 +417,11 @@ Training with these hyper-parameters gave us the following results:
 This example code fine-tunes BERT on the SQuAD dataset using distributed training on 8 V100 GPUs and Bert Whole Word Masking uncased model to reach a F1 > 93 on SQuAD:

 ```bash
-python -m torch.distributed.launch --nproc_per_node=8 ./examples/run_squad.py \
+python -m torch.distributed.launch --nproc_per_node=8 ./examples/question-answering/run_squad.py \
    --model_type bert \
    --model_name_or_path bert-large-uncased-whole-word-masking \
    --do_train \
    --do_eval \
-    --do_lower_case \
    --train_file $SQUAD_DIR/train-v1.1.json \
    --predict_file $SQUAD_DIR/dev-v1.1.json \
    --learning_rate 3e-5 \
@@ -431,8 +429,8 @@ python -m torch.distributed.launch --nproc_per_node=8 ./examples/run_squad.py \
    --max_seq_length 384 \
    --doc_stride 128 \
    --output_dir ../models/wwm_uncased_finetuned_squad/ \
-    --per_gpu_eval_batch_size=3   \
-    --per_gpu_train_batch_size=3   \
+    --per_device_eval_batch_size=3   \
+    --per_device_train_batch_size=3   \
 ```

 Training with these hyper-parameters gave us the following results:
@@ -452,7 +450,7 @@ The generation script includes the [tricks](https://github.com/rusiaaman/XLNet-g
 Here is how to run the script with the small version of OpenAI GPT-2 model:

 ```shell
-python ./examples/run_generation.py \
+python ./examples/text-generation/run_generation.py \
    --model_type=gpt2 \
    --length=20 \
    --model_name_or_path=gpt2 \
@@ -460,7 +458,7 @@ python ./examples/run_generation.py \

 and from the Salesforce CTRL model:
 ```shell
-python ./examples/run_generation.py \
+python ./examples/text-generation/run_generation.py \
    --model_type=ctrl \
    --length=20 \
    --model_name_or_path=ctrl \
@@ -537,22 +535,25 @@ You can create `Pipeline` objects for the following down-stream tasks:
 - `text-classification`: Initialize a `TextClassificationPipeline` directly, or see `sentiment-analysis` for an example.
 - `question-answering`: Provided some context and a question refering to the context, it will extract the answer to the question in the context.
 - `fill-mask`: Takes an input sequence containing a masked token (e.g. `<mask>`) and return list of most probable filled sequences, with their probabilities.
+ - `summarization`
+ - `translation_xx_to_yy`

 ```python
-from transformers import pipeline
+>>> from transformers import pipeline

 # Allocate a pipeline for sentiment-analysis
-nlp = pipeline('sentiment-analysis')
-nlp('We are very happy to include pipeline into the transformers repository.')
->>> {'label': 'POSITIVE', 'score': 0.99893874}
+>>> nlp = pipeline('sentiment-analysis')
+>>> nlp('We are very happy to include pipeline into the transformers repository.')
+[{'label': 'POSITIVE', 'score': 0.9978193640708923}]

 # Allocate a pipeline for question-answering
-nlp = pipeline('question-answering')
-nlp({
-    'question': 'What is the name of the repository ?',
-    'context': 'Pipeline have been included in the huggingface/transformers repository'
-})
->>> {'score': 0.28756016668193496, 'start': 35, 'end': 59, 'answer': 'huggingface/transformers'}
+>>> nlp = pipeline('question-answering')
+>>> nlp({
+...     'question': 'What is the name of the repository ?',
+...     'context': 'Pipeline have been included in the huggingface/transformers repository'
+... })
+{'score': 0.5135612454720828, 'start': 35, 'end': 59, 'answer': 'huggingface/transformers'}
+
 ```

 ## Migrating from pytorch-transformers to transformers
--- a/codecov.yml
+++ b/codecov.yml
@@ -0,0 +1,6 @@
+coverage:
+  status:
+    project:
+      default:
+        informational: true
+    patch: off
--- a/docs/README.md
+++ b/docs/README.md
@@ -7,6 +7,14 @@ you can install them with the following command, at the root of the code reposit
 pip install -e ".[docs]"
 ```

+---
+**NOTE**
+
+You only need to generate the documentation to inspect it locally (if you're planning changes and want to 
+check how they look before committing, for instance). You don't have to commit the built documentation.
+
+---
+
 ## Packages installed

 Here's an overview of all the packages installed. If you ran the previous command installing all packages from
@@ -34,19 +42,15 @@ pip install recommonmark

 ## Building the documentation

-Make sure that there is a symlink from the `example` file (in /examples) inside the source folder. Run the following
-command to generate it:
-
-```bash
-ln -s ../../examples/README.md examples.md
-```
-
 Once you have setup `sphinx`, you can build the documentation by running the following command in the `/docs` folder:

 ```bash
 make html
 ```

+A folder called ``_build/html`` should have been created. You can now open the file ``_build/html/index.html`` in your
+browser. 
+
 ---
 **NOTE**

@@ -65,3 +69,143 @@ It should build the static app that will be available under `/docs/_build/html`

 Accepted files are reStructuredText (.rst) and Markdown (.md). Create a file with its extension and put it
 in the source directory. You can then link it to the toc-tree by putting the filename without the extension.
+
+## Preview the documentation in a pull request
+
+Once you have made your pull request, you can check what the documentation will look like after it's merged by
+following these steps:
+
+- Look at the checks at the bottom of the conversation page of your PR (you may need to click on "show all checks" to
+  expand them).
+- Click on "details" next to the `ci/circleci: build_doc` check.
+- In the new window, click on the "Artifacts" tab.
+- Locate the file "docs/_build/html/index.html" (or any specific page you want to check) and click on it to get a 
+  preview.
+
+## Writing Documentation - Specification
+
+The `huggingface/transformers` documentation follows the
+[Google documentation](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) style. It is
+mostly written in ReStructuredText 
+([Sphinx simple documentation](https://www.sphinx-doc.org/en/master/usage/restructuredtext/index.html), 
+[Sourceforge complete documentation](https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html))
+
+### Adding a new section
+
+A section is a page held in the `Notes` toc-tree on the documentation. Adding a new section is done in two steps:
+
+- Add a new file under `./source`. This file can either be ReStructuredText (.rst) or Markdown (.md).
+- Link that file in `./source/index.rst` on the correct toc-tree.
+
+### Adding a new model
+
+When adding a new model:
+ 
+- Create a file `xxx.rst` under `./source/model_doc`. 
+- Link that file in `./source/index.rst` on the `model_doc` toc-tree.
+- Write a short overview of the model:
+    - Overview with paper & authors
+    - Paper abstract
+    - Tips and tricks and how to use it best
+- Add the classes that should be linked in the model. This generally includes the configuration, the tokenizer, and
+  every model of that class (the base model, alongside models with additional heads), both in PyTorch and TensorFlow.
+  The order is generally: 
+    - Configuration, 
+    - Tokenizer
+    - PyTorch base model
+    - PyTorch head models
+    - TensorFlow base model
+    - TensorFlow head models
+
+These classes should be added using the RST syntax. Usually as follows:
+```
+XXXConfig
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.XXXConfig
+    :members:
+```
+
+This will include every public method of the configuration. If for some reason you wish for a method not to be
+displayed in the documentation, you can do so by specifying which methods should be in the docs:
+
+```
+XXXTokenizer
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.XXXTokenizer
+    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences, save_vocabulary
+
+```
+
+### Writing source documentation
+
+Values that should be put in `code` should either be surrounded by double backticks: \`\`like so\`\` or be written as
+an object using the :obj: syntax: :obj:\`like so\`.
+
+When mentioning a class, it is recommended to use the :class: syntax as the mentioned class will be automatically
+linked by Sphinx: :class:\`transformers.XXXClass\`
+
+When mentioning a function, it is recommended to use the :func: syntax as the mentioned method will be automatically
+linked by Sphinx: :func:\`transformers.XXXClass.method\`
+
+Links should be done as so (note the double underscore at the end): \`text for the link <./local-link-or-global-link#loc>\`__
+
+#### Defining arguments in a method
+
+Arguments should be defined with the `Args:` prefix, followed by a line return and an indentation. 
+The argument should be followed by its type, with its shape if it is a tensor, and a line return.
+Another indentation is necessary before writing the description of the argument.
+
+Here's an example showcasing everything so far:
+
+```
+    Args:
+        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using :class:`transformers.AlbertTokenizer`.
+            See :func:`transformers.PreTrainedTokenizer.encode` and
+            :func:`transformers.PreTrainedTokenizer.__call__` for details.
+
+            `What are input IDs? <../glossary.html#input-ids>`__
+```
+
+#### Writing a multi-line code block 
+
+Multi-line code blocks can be useful for displaying examples. They are done like so:
+
+```
+Example::
+
+    # first line of code
+    # second line
+    # etc
+```
+
+The `Example` string at the beginning can be replaced by anything as long as there are two colons following it.
+
+#### Writing a return block
+
+Return blocks should be defined with the `Returns:` prefix, followed by a line return and an indentation.
+The first line should be the type of the return, followed by a line return. No need to indent further for the elements
+building the return.
+
+Here's an example for tuple return, comprising several objects:
+
+```
+    Returns:
+        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
+        loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss.
+        prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+```
+
+Here's an example for a single value return:
+
+```
+    Returns:
+        A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+```
--- a/docs/source/_static/css/code-snippets.css
+++ b/docs/source/_static/css/code-snippets.css
@@ -9,4 +9,8 @@

 .highlight .kn, .highlight .nv, .highlight .s2, .highlight .ow {
    color: #6670FF;
+}
+
+.highlight .gp {
+    color: #FB8D68;
 }
--- a/docs/source/_static/css/huggingface.css
+++ b/docs/source/_static/css/huggingface.css
@@ -1,9 +1,50 @@
 /* Our DOM objects */

+/* Version control */
+
+.version-button {
+    background-color: #6670FF;
+    color: white;
+    border: none;
+    padding: 5px;
+    font-size: 15px;
+    cursor: pointer;
+}
+
+.version-button:hover, .version-button:focus {
+    background-color: #A6B0FF;
+}
+ 
+.version-dropdown {
+    display: none;
+    background-color: #6670FF;
+    min-width: 160px;
+    overflow: auto;
+    font-size: 15px;
+}
+  
+.version-dropdown a {
+    color: white;
+    padding: 3px 4px;
+    text-decoration: none;
+    display: block;
+}
+  
+.version-dropdown a:hover {
+    background-color: #A6B0FF;
+}
+  
+.version-show {
+    display: block;
+}
+
+/* Framework selector */
+
 .framework-selector {
    display: flex;
    flex-direction: row;
    justify-content: flex-end;
+    margin-right: 30px;
 }

 .framework-selector > button {
@@ -20,6 +61,12 @@
    padding: 5px;
 }

+/* Copy button */
+
+a.copybtn {
+    margin: 3px;
+}
+
 /* The literal code blocks */
 .rst-content tt.literal, .rst-content tt.literal, .rst-content code.literal {
    color: #6670FF;
@@ -38,6 +85,7 @@

 /* The research field on top of the toc tree */
 .wy-side-nav-search{
+    padding-top: 0;
    background-color: #6670FF;
 }

--- a/docs/source/_static/js/custom.js
+++ b/docs/source/_static/js/custom.js
@@ -1,3 +1,26 @@
+// These two things need to be updated at each release for the version selector.
+// Last stable version
+const stableVersion = "v2.11.0"
+// Dictionary doc folder to label
+const versionMapping = {
+    "master": "master",
+    "": "v2.11.0 (stable)",
+    "v2.10.0": "v2.10.0",
+    "v2.9.1": "v2.9.0/v2.9.1",
+    "v2.8.0": "v2.8.0",
+    "v2.7.0": "v2.7.0",
+    "v2.6.0": "v2.6.0",
+    "v2.5.1": "v2.5.0/v2.5.1",
+    "v2.4.0": "v2.4.0/v2.4.1",
+    "v2.3.0": "v2.3.0",
+    "v2.2.0": "v2.2.0/v2.2.1/v2.2.2",
+    "v2.1.1": "v2.1.1",
+    "v2.0.0": "v2.0.0",
+    "v1.2.0": "v1.2.0",
+    "v1.1.0": "v1.1.0",
+    "v1.0.0": "v1.0.0"
+}
+
 function addIcon() {
    const huggingFaceLogo = "https://huggingface.co/landing/assets/transformers-docs/huggingface_logo.svg";
    const image = document.createElement("img");
@@ -58,6 +81,68 @@ function addGithubButton() {
    document.querySelector(".wy-side-nav-search .icon-home").insertAdjacentHTML('afterend', div);
 }

+function addVersionControl() {
+    // To grab the version currently in view, we parse the url
+    const parts = location.toString().split('/');
+    let versionIndex = parts.length - 2;
+    // Index page may not have a last part with filename.html so we need to go up
+    if (parts[parts.length - 1] != "" && ! parts[parts.length - 1].match(/\.html$/)) {
+        versionIndex = parts.length - 1;
+    }
+    // Main classes and models are nested so we need to go deeper
+    else if (parts[versionIndex] == "main_classes" || parts[versionIndex] == "model_doc") {
+        versionIndex = versionIndex - 1;
+    } 
+    const version = parts[versionIndex];
+
+    // Menu with all the links,
+    const versionMenu = document.createElement("div");
+
+    const htmlLines = [];
+    for (const [key, value] of Object.entries(versionMapping)) {
+        let baseUrlIndex = (version == "transformers") ? versionIndex + 1: versionIndex;
+        var urlParts = parts.slice(0, baseUrlIndex);
+        if (key != "") {
+            urlParts = urlParts.concat([key]);
+        }
+        urlParts = urlParts.concat(parts.slice(versionIndex+1));
+        htmlLines.push(`<a href="${urlParts.join('/')}">${value}</a>`);
+    }
+
+    versionMenu.classList.add("version-dropdown");
+    versionMenu.innerHTML = htmlLines.join('\n');
+    
+    // Button for version selection
+    const versionButton = document.createElement("div");
+    versionButton.classList.add("version-button");
+    let label = (version == "transformers") ? stableVersion : version
+    versionButton.innerText = label.concat(" ▼");
+
+    // Toggle the menu when we click on the button
+    versionButton.addEventListener("click", () => {
+        versionMenu.classList.toggle("version-show");
+    });
+
+    // Hide the menu when we click elsewhere
+    window.addEventListener("click", (event) => {
+        if (event.target != versionButton){
+            versionMenu.classList.remove('version-show');
+        }
+    });
+
+    // Container
+    const div = document.createElement("div");
+    div.appendChild(versionButton);
+    div.appendChild(versionMenu);
+    div.style.paddingTop = '25px';
+    div.style.backgroundColor = '#6670FF';
+    div.style.display = 'block';
+    div.style.textAlign = 'center';
+
+    const scrollDiv = document.querySelector(".wy-side-scroll");
+    scrollDiv.insertBefore(div, scrollDiv.children[1]);
+}
+
 function addHfMenu() {
    const div = `
    <div class="menu">
@@ -72,6 +157,8 @@ function platformToggle() {
    const codeBlocks = Array.from(document.getElementsByClassName("highlight"));
    const pytorchIdentifier = "## PYTORCH CODE";
    const tensorflowIdentifier = "## TENSORFLOW CODE";
+
+    const promptSpanIdentifier = `<span class="gp">&gt;&gt;&gt; </span>`
    const pytorchSpanIdentifier = `<span class="c1">${pytorchIdentifier}</span>`;
    const tensorflowSpanIdentifier = `<span class="c1">${tensorflowIdentifier}</span>`;

@@ -84,10 +171,22 @@ function platformToggle() {
        let tensorflowSpans;

        if(pytorchSpanPosition < tensorflowSpanPosition){
-            pytorchSpans = spans.slice(pytorchSpanPosition + pytorchSpanIdentifier.length + 1, tensorflowSpanPosition);
+            const isPrompt = spans.slice(
+                spans.indexOf(tensorflowSpanIdentifier) - promptSpanIdentifier.length,
+                spans.indexOf(tensorflowSpanIdentifier)
+            ) == promptSpanIdentifier;
+            const finalTensorflowSpanPosition = isPrompt ? tensorflowSpanPosition - promptSpanIdentifier.length : tensorflowSpanPosition;
+
+            pytorchSpans = spans.slice(pytorchSpanPosition + pytorchSpanIdentifier.length + 1, finalTensorflowSpanPosition);
            tensorflowSpans = spans.slice(tensorflowSpanPosition + tensorflowSpanIdentifier.length + 1, spans.length);
        }else{
-            tensorflowSpans = spans.slice(tensorflowSpanPosition + tensorflowSpanIdentifier.length + 1, pytorchSpanPosition);
+            const isPrompt = spans.slice(
+                spans.indexOf(pytorchSpanIdentifier) - promptSpanIdentifier.length,
+                spans.indexOf(pytorchSpanIdentifier)
+            ) == promptSpanIdentifier;
+            const finalPytorchSpanPosition = isPrompt ? pytorchSpanPosition - promptSpanIdentifier.length : pytorchSpanPosition;
+
+            tensorflowSpans = spans.slice(tensorflowSpanPosition + tensorflowSpanIdentifier.length + 1, finalPytorchSpanPosition);
            pytorchSpans = spans.slice(pytorchSpanPosition + pytorchSpanIdentifier.length + 1, spans.length);
        }

@@ -149,6 +248,7 @@ function parseGithubButtons (){"use strict";var e=window.document,t=e.location,o

 function onLoad() {
    addIcon();
+    addVersionControl();
    addCustomFooter();
    addGithubButton();
    parseGithubButtons();
--- a/docs/source/benchmarks.md
+++ b/docs/source/benchmarks.md
@@ -1,54 +0,0 @@
-# Benchmarks
-
-This section is dedicated to the Benchmarks done by the library, both by maintainers, contributors and users. These 
-benchmark will help keep track of the preformance improvements that are brought to our models across versions.
-
-## Benchmarking all models for inference
-
-As of version 2.1 we have benchmarked all models for inference, across many different settings: using PyTorch, with
-and without TorchScript, using TensorFlow, with and without XLA. All of those tests were done across CPUs (except for
-TensorFlow XLA) and GPUs.
-
-The approach is detailed in the [following blogpost](https://medium.com/huggingface/benchmarking-transformers-pytorch-and-tensorflow-e2917fb891c2)
-
-The results are available [here](https://docs.google.com/spreadsheets/d/1sryqufw2D0XlUH4sq3e9Wnxu5EAQkaohzrJbd5HdQ_w/edit?usp=sharing).
-
-## TF2 with mixed precision, XLA, Distribution (@tlkh)
-
-This work was done by [Timothy Liu](https://github.com/tlkh).
-
-There are very positive results to be gained from the various TensorFlow 2.0 features:
-
- Automatic Mixed Precision (AMP)
- XLA compiler
- Distribution strategies (multi-GPU)
-
-The benefits are listed here (tested on CoLA, MRPC, SST-2):
-
- AMP: Between 1.4x to 1.6x decrease in overall time without change in batch size
- AMP+XLA: Up to 2.5x decrease in overall time on SST-2 (larger dataset)
- Distribution: Between 1.4x to 3.4x decrease in overall time on 4xV100
- Combined: Up to 5.7x decrease in overall training time, or 9.1x training throughput
-
-The model quality (measured by the validation accuracy) fluctuates slightly. Taking an average of 4 training runs 
-on a single GPU gives the following results:
-
- CoLA: AMP results in slighter lower acc (0.820 vs 0.824)
- MRPC: AMP results in lower acc (0.823 vs 0.835)
- SST-2: AMP results in slighter lower acc (0.918 vs 0.922)
-
-However, in a distributed setting with 4xV100 (4x batch size), AMP can yield in better results:
-
-CoLA: AMP results in higher acc (0.828 vs 0.812)
-MRPC: AMP results in lower acc (0.817 vs 0.827)
-SST-2: AMP results in slightly lower acc (0.926 vs 0.929)
-
-The benchmark script is available [here](https://github.com/NVAITC/benchmarking/blob/master/tf2/bert_dist.py).
-
-Note: on some tasks (e.g. MRPC), the dataset is too small. The overhead due to the model compilation with XLA as well
-as the distribution strategy setup does not speed things up. The XLA compile time is also the reason why although throughput 
-can increase a lot (e.g. 2.7x for single GPU), overall (end-to-end) training speed-up is not as fast (as low as 1.4x)
-
-The benefits as seen on SST-2 (larger dataset) is much clear.
-
-All results can be seen on this [Google Sheet](https://docs.google.com/spreadsheets/d/1538MN224EzjbRL239sqSiUy6YY-rAjHyXhTzz_Zptls/edit#gid=960868445).
--- a/docs/source/benchmarks.rst
+++ b/docs/source/benchmarks.rst
@@ -0,0 +1,322 @@
+Benchmarks
+==========
+
+Let's take a look at how 🤗 Transformer models can be benchmarked, best practices, and already available benchmarks.
+
+A notebook explaining in more detail how to benchmark 🤗 Transformer models can be found `here <https://github.com/huggingface/transformers/blob/master/notebooks/05-benchmark.ipynb>`__.
+
+How to benchmark 🤗 Transformer models
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The classes :class:`~transformers.PyTorchBenchmark` and :class:`~transformers.TensorFlowBenchmark` allow us to flexibly benchmark 🤗 Transformer models.
+The benchmark classes allow us to measure the `peak memory usage` and `required time` for both 
+`inference` and `training`. 
+
+.. note::
+
+  Hereby, `inference` is defined by a single forward pass, and `training` is defined by a single forward pass and backward pass.
+
+The benchmark classes :class:`~transformers.PyTorchBenchmark` and :class:`~transformers.TensorFlowBenchmark` expect an object of type :class:`~transformers.PyTorchBenchmarkArguments` and :class:`~transformers.TensorFlowBenchmarkArguments`, respectively, for instantiation. :class:`~transformers.PyTorchBenchmarkArguments` and :class:`~transformers.TensorFlowBenchmarkArguments` are data classes and contain all relevant configurations for their corresponding benchmark class.
+In the following example, it is shown how a BERT model of type `bert-base-uncased` can be benchmarked.
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> from transformers import PyTorchBenchmark, PyTorchBenchmarkArguments
+
+    >>> args = PyTorchBenchmarkArguments(models=["bert-base-uncased"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512])
+    >>> benchmark = PyTorchBenchmark(args)
+
+    >>> ## TENSORFLOW CODE
+    >>> from transformers import TensorFlowBenchmark, TensorFlowBenchmarkArguments
+
+    >>> args = TensorFlowBenchmarkArguments(models=["bert-base-uncased"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512])
+    >>> benchmark = TensorFlowBenchmark(args)
+
+
+Here, three arguments are given to the benchmark argument data classes, namely ``models``, ``batch_sizes``, and ``sequence_lengths``. The argument ``models`` is required and expects a :obj:`list` of model identifiers from the `model hub <https://huggingface.co/models>`__.
+The :obj:`list` arguments ``batch_sizes`` and ``sequence_lengths`` define the size of the ``input_ids`` on which the model is benchmarked. 
+There are many more parameters that can be configured via the benchmark argument data classes. For more detail on these one can either directly consult the files 
+``src/transformers/benchmark/benchmark_args_utils.py``, ``src/transformers/benchmark/benchmark_args.py`` (for PyTorch) and ``src/transformers/benchmark/benchmark_args_tf.py`` (for Tensorflow). 
+Alternatively, running the following shell commands from root will print out a descriptive list of all configurable parameters for PyTorch and Tensorflow respectively.
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    python examples/benchmarking/run_benchmark.py --help
+
+    >>> ## TENSORFLOW CODE
+    python examples/benchmarking/run_benchmark_tf.py --help
+
+
+An instantiated benchmark object can then simply be run by calling ``benchmark.run()``.
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> results = benchmark.run()
+    >>> print(results)
+    ====================       INFERENCE - SPEED - RESULT       ====================
+    --------------------------------------------------------------------------------
+    Model Name             Batch Size     Seq Length     Time in s                  
+    --------------------------------------------------------------------------------
+    bert-base-uncased          8               8             0.006     
+    bert-base-uncased          8               32            0.006     
+    bert-base-uncased          8              128            0.018     
+    bert-base-uncased          8              512            0.088     
+    --------------------------------------------------------------------------------
+    
+    ====================      INFERENCE - MEMORY - RESULT       ====================
+    --------------------------------------------------------------------------------
+    Model Name             Batch Size     Seq Length    Memory in MB 
+    --------------------------------------------------------------------------------
+    bert-base-uncased          8               8             1227
+    bert-base-uncased          8               32            1281
+    bert-base-uncased          8              128            1307
+    bert-base-uncased          8              512            1539
+    --------------------------------------------------------------------------------
+    
+    ====================        ENVIRONMENT INFORMATION         ====================
+    - transformers_version: 2.11.0
+    - framework: PyTorch
+    - use_torchscript: False
+    - framework_version: 1.4.0
+    - python_version: 3.6.10
+    - system: Linux
+    - cpu: x86_64
+    - architecture: 64bit
+    - date: 2020-06-29
+    - time: 08:58:43.371351
+    - fp16: False
+    - use_multiprocessing: True
+    - only_pretrain_model: False
+    - cpu_ram_mb: 32088
+    - use_gpu: True
+    - num_gpus: 1
+    - gpu: TITAN RTX
+    - gpu_ram_mb: 24217
+    - gpu_power_watts: 280.0
+    - gpu_performance_state: 2
+    - use_tpu: False
+    
+    >>> ## TENSORFLOW CODE
+    >>> results = benchmark.run()
+    >>> print(results)
+    ====================       INFERENCE - SPEED - RESULT       ====================
+    --------------------------------------------------------------------------------
+    Model Name             Batch Size     Seq Length     Time in s                  
+    --------------------------------------------------------------------------------
+    bert-base-uncased          8               8             0.005
+    bert-base-uncased          8               32            0.008
+    bert-base-uncased          8              128            0.022
+    bert-base-uncased          8              512            0.105
+    --------------------------------------------------------------------------------
+    
+    ====================      INFERENCE - MEMORY - RESULT       ====================
+    --------------------------------------------------------------------------------
+    Model Name             Batch Size     Seq Length    Memory in MB 
+    --------------------------------------------------------------------------------
+    bert-base-uncased          8               8             1330
+    bert-base-uncased          8               32            1330
+    bert-base-uncased          8              128            1330
+    bert-base-uncased          8              512            1770
+    --------------------------------------------------------------------------------
+    
+    ====================        ENVIRONMENT INFORMATION         ====================
+    - transformers_version: 2.11.0
+    - framework: Tensorflow
+    - use_xla: False
+    - framework_version: 2.2.0
+    - python_version: 3.6.10
+    - system: Linux
+    - cpu: x86_64
+    - architecture: 64bit
+    - date: 2020-06-29
+    - time: 09:26:35.617317
+    - fp16: False
+    - use_multiprocessing: True
+    - only_pretrain_model: False
+    - cpu_ram_mb: 32088
+    - use_gpu: True
+    - num_gpus: 1
+    - gpu: TITAN RTX
+    - gpu_ram_mb: 24217
+    - gpu_power_watts: 280.0
+    - gpu_performance_state: 2
+    - use_tpu: False
+
+By default, the `time` and the `required memory` for `inference` are benchmarked. 
+In the example output above the first two sections show the result corresponding to `inference time` and `inference memory`. 
+In addition, all relevant information about the computing environment, `e.g.` the GPU type, the system, the library versions, etc., is printed out in the third section under `ENVIRONMENT INFORMATION`.
+This information can optionally be saved in a `.csv` file when adding the argument :obj:`save_to_csv=True` to :class:`~transformers.PyTorchBenchmarkArguments` and :class:`~transformers.TensorFlowBenchmarkArguments` respectively.
+In this case, every section is saved in a separate `.csv` file. The path to each `.csv` file can optionally be defined via the argument data classes.
+
+Instead of benchmarking pre-trained models via their model identifier, `e.g.` `bert-base-uncased`, the user can alternatively benchmark an arbitrary configuration of any available model class. 
+In this case, a :obj:`list` of configurations must be inserted with the benchmark args as follows.
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> from transformers import PyTorchBenchmark, PyTorchBenchmarkArguments, BertConfig
+
+    >>> args = PyTorchBenchmarkArguments(models=["bert-base", "bert-384-hid", "bert-6-lay"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512])
+    >>> config_base = BertConfig()
+    >>> config_384_hid = BertConfig(hidden_size=384)
+    >>> config_6_lay = BertConfig(num_hidden_layers=6)
+
+    >>> benchmark = PyTorchBenchmark(args, configs=[config_base, config_384_hid, config_6_lay])
+    >>> benchmark.run()
+    ====================       INFERENCE - SPEED - RESULT       ====================
+    --------------------------------------------------------------------------------
+    Model Name             Batch Size     Seq Length       Time in s                  
+    --------------------------------------------------------------------------------
+    bert-base                  8               8             0.006
+    bert-base                  8               32            0.006
+    bert-base                  8              128            0.018     
+    bert-base                  8              512            0.088     
+    bert-384-hid              8               8             0.006     
+    bert-384-hid              8               32            0.006     
+    bert-384-hid              8              128            0.011     
+    bert-384-hid              8              512            0.054     
+    bert-6-lay                 8               8             0.003     
+    bert-6-lay                 8               32            0.004     
+    bert-6-lay                 8              128            0.009     
+    bert-6-lay                 8              512            0.044
+    --------------------------------------------------------------------------------
+    
+    ====================      INFERENCE - MEMORY - RESULT       ====================
+    --------------------------------------------------------------------------------
+    Model Name             Batch Size     Seq Length      Memory in MB 
+    --------------------------------------------------------------------------------
+    bert-base                  8               8             1277
+    bert-base                  8               32            1281
+    bert-base                  8              128            1307     
+    bert-base                  8              512            1539     
+    bert-384-hid              8               8             1005     
+    bert-384-hid              8               32            1027     
+    bert-384-hid              8              128            1035     
+    bert-384-hid              8              512            1255     
+    bert-6-lay                 8               8             1097     
+    bert-6-lay                 8               32            1101     
+    bert-6-lay                 8              128            1127     
+    bert-6-lay                 8              512            1359
+    --------------------------------------------------------------------------------
+    
+    ====================        ENVIRONMENT INFORMATION         ====================
+    - transformers_version: 2.11.0
+    - framework: PyTorch
+    - use_torchscript: False
+    - framework_version: 1.4.0
+    - python_version: 3.6.10
+    - system: Linux
+    - cpu: x86_64
+    - architecture: 64bit
+    - date: 2020-06-29
+    - time: 09:35:25.143267
+    - fp16: False
+    - use_multiprocessing: True
+    - only_pretrain_model: False
+    - cpu_ram_mb: 32088
+    - use_gpu: True
+    - num_gpus: 1
+    - gpu: TITAN RTX
+    - gpu_ram_mb: 24217
+    - gpu_power_watts: 280.0
+    - gpu_performance_state: 2
+    - use_tpu: False
+
+    >>> ## TENSORFLOW CODE
+    >>> from transformers import TensorFlowBenchmark, TensorFlowBenchmarkArguments, BertConfig
+
+    >>> args = TensorFlowBenchmarkArguments(models=["bert-base", "bert-384-hid", "bert-6-lay"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512])
+    >>> config_base = BertConfig()
+    >>> config_384_hid = BertConfig(hidden_size=384)
+    >>> config_6_lay = BertConfig(num_hidden_layers=6)
+
+    >>> benchmark = TensorFlowBenchmark(args, configs=[config_base, config_384_hid, config_6_lay])
+    >>> benchmark.run()
+    ====================       INFERENCE - SPEED - RESULT       ====================
+    --------------------------------------------------------------------------------
+    Model Name             Batch Size     Seq Length       Time in s                  
+    --------------------------------------------------------------------------------
+    bert-base                  8               8             0.005
+    bert-base                  8               32            0.008
+    bert-base                  8              128            0.022
+    bert-base                  8              512            0.106
+    bert-384-hid              8               8             0.005
+    bert-384-hid              8               32            0.007
+    bert-384-hid              8              128            0.018
+    bert-384-hid              8              512            0.064
+    bert-6-lay                 8               8             0.002
+    bert-6-lay                 8               32            0.003
+    bert-6-lay                 8              128            0.011
+    bert-6-lay                 8              512            0.074
+    --------------------------------------------------------------------------------
+    
+    ====================      INFERENCE - MEMORY - RESULT       ====================
+    --------------------------------------------------------------------------------
+    Model Name             Batch Size     Seq Length      Memory in MB 
+    --------------------------------------------------------------------------------
+    bert-base                  8               8             1330
+    bert-base                  8               32            1330
+    bert-base                  8              128            1330
+    bert-base                  8              512            1770
+    bert-384-hid              8               8             1330
+    bert-384-hid              8               32            1330
+    bert-384-hid              8              128            1330
+    bert-384-hid              8              512            1540
+    bert-6-lay                 8               8             1330
+    bert-6-lay                 8               32            1330
+    bert-6-lay                 8              128            1330
+    bert-6-lay                 8              512            1540
+    --------------------------------------------------------------------------------
+    
+    ====================        ENVIRONMENT INFORMATION         ====================
+    - transformers_version: 2.11.0
+    - framework: Tensorflow
+    - use_xla: False
+    - framework_version: 2.2.0
+    - python_version: 3.6.10
+    - system: Linux
+    - cpu: x86_64
+    - architecture: 64bit
+    - date: 2020-06-29
+    - time: 09:38:15.487125
+    - fp16: False
+    - use_multiprocessing: True
+    - only_pretrain_model: False
+    - cpu_ram_mb: 32088
+    - use_gpu: True
+    - num_gpus: 1
+    - gpu: TITAN RTX
+    - gpu_ram_mb: 24217
+    - gpu_power_watts: 280.0
+    - gpu_performance_state: 2
+    - use_tpu: False
+
+
+Again, `inference time` and `required memory` for `inference` are measured, but this time for customized configurations of the :obj:`BertModel` class. This feature can especially be helpful when 
+deciding for which configuration the model should be trained.
+
+
+Benchmark best practices
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+This section lists a couple of best practices one should be aware of when benchmarking a model.
+
+- Currently, only single device benchmarking is supported. When benchmarking on GPU, it is recommended that the user 
+  specifies on which device the code should be run by setting the ``CUDA_VISIBLE_DEVICES`` environment variable in the shell, `e.g.` ``export CUDA_VISIBLE_DEVICES=0`` before running the code.
+- The option :obj:`no_multi_processing` should only be set to :obj:`True` for testing and debugging. To ensure accurate memory measurement it is recommended to run each memory benchmark in a separate process by making sure :obj:`no_multi_processing` is set to :obj:`False`.
+- One should always state the environment information when sharing the results of a model benchmark. Results can vary heavily between different GPU devices, library versions, etc., so that benchmark results on their own are not very useful for the community.
+
+
+Sharing your benchmark
+~~~~~~~~~~~~~~~~~~~~~~
+
+Previously all available core models (10 at the time) have been benchmarked for `inference time`, across many different settings: using PyTorch, with
+and without TorchScript, using TensorFlow, with and without XLA. All of those tests were done across CPUs (except for
+TensorFlow XLA) and GPUs.
+
+The approach is detailed in the `following blogpost <https://medium.com/huggingface/benchmarking-transformers-pytorch-and-tensorflow-e2917fb891c2>`__ and the results are available `here <https://docs.google.com/spreadsheets/d/1sryqufw2D0XlUH4sq3e9Wnxu5EAQkaohzrJbd5HdQ_w/edit?usp=sharing>`__.
+
+With the new `benchmark` tools, it is easier than ever to share your benchmark results with the community `here <https://github.com/huggingface/transformers/blob/master/examples/benchmarking/README.md>`__.
--- a/docs/source/bertology.rst
+++ b/docs/source/bertology.rst
@@ -8,11 +8,11 @@ There is a growing field of study concerned with investigating the inner working
 * Are Sixteen Heads Really Better than One? by Paul Michel, Omer Levy, Graham Neubig: https://arxiv.org/abs/1905.10650
 * What Does BERT Look At? An Analysis of BERT's Attention by Kevin Clark, Urvashi Khandelwal, Omer Levy, Christopher D. Manning: https://arxiv.org/abs/1906.04341

-In order to help this new field develop, we have included a few additional features in the BERT/GPT/GPT-2 models to help people access the inner representations, mainly adapted  from the great work of Paul Michel (https://arxiv.org/abs/1905.10650):
+In order to help this new field develop, we have included a few additional features in the BERT/GPT/GPT-2 models to help people access the inner representations, mainly adapted from the great work of Paul Michel (https://arxiv.org/abs/1905.10650):


 * accessing all the hidden-states of BERT/GPT/GPT-2,
 * accessing all the attention weights for each head of BERT/GPT/GPT-2,
 * retrieving heads output values and gradients to be able to compute head importance score and prune head as explained in https://arxiv.org/abs/1905.10650.

-To help you understand and use these features, we have added a specific example script: `bertology.py <https://github.com/huggingface/transformers/blob/master/examples/run_bertology.py>`_ while extract information and prune a model pre-trained on GLUE.
+To help you understand and use these features, we have added a specific example script: `bertology.py <https://github.com/huggingface/transformers/blob/master/examples/bertology/run_bertology.py>`_ which extracts information and prunes a model pre-trained on GLUE.
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -26,7 +26,7 @@ author = u'huggingface'
 # The short X.Y version
 version = u''
 # The full version, including alpha/beta/rc tags
-release = u'2.7.0'
+release = u'3.0.0'


 # -- General configuration ---------------------------------------------------
@@ -44,7 +44,8 @@ extensions = [
    'sphinx.ext.napoleon',
    'recommonmark',
    'sphinx.ext.viewcode',
-    'sphinx_markdown_tables'
+    'sphinx_markdown_tables',
+    'sphinx_copybutton'
 ]

 # Add any paths that contain templates here, relative to this directory.
@@ -74,6 +75,8 @@ exclude_patterns = [u'_build', 'Thumbs.db', '.DS_Store']
 # The name of the Pygments (syntax highlighting) style to use.
 pygments_style = None

+# Remove the prompt when copying examples
+copybutton_prompt_text = ">>> "

 # -- Options for HTML output -------------------------------------------------

@@ -187,8 +190,8 @@ epub_title = project
 epub_exclude_files = ['search.html']

 def setup(app):
-    app.add_stylesheet('css/huggingface.css')
-    app.add_stylesheet('css/code-snippets.css')
+    app.add_css_file('css/huggingface.css')
+    app.add_css_file('css/code-snippets.css')
    app.add_js_file('js/custom.js')

 # -- Extension configuration -------------------------------------------------
--- a/docs/source/contributing.md
+++ b/docs/source/contributing.md
@@ -0,0 +1 @@
+../../CONTRIBUTING.md
--- a/docs/source/converting_tensorflow_models.rst
+++ b/docs/source/converting_tensorflow_models.rst
@@ -12,7 +12,7 @@ A command-line interface is provided to convert original Bert/GPT/GPT-2/Transfor
 BERT
 ^^^^

-You can convert any TensorFlow checkpoint for BERT (in particular `the pre-trained models released by Google <https://github.com/google-research/bert#pre-trained-models>`_\ ) in a PyTorch save file by using the `convert_tf_checkpoint_to_pytorch.py <https://github.com/huggingface/transformers/blob/master/transformers/convert_tf_checkpoint_to_pytorch.py>`_ script.
+You can convert any TensorFlow checkpoint for BERT (in particular `the pre-trained models released by Google <https://github.com/google-research/bert#pre-trained-models>`_\ ) in a PyTorch save file by using the `convert_bert_original_tf_checkpoint_to_pytorch.py <https://github.com/huggingface/transformers/blob/master/src/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py>`_ script.

 This CLI takes as input a TensorFlow checkpoint (three files starting with ``bert_model.ckpt``\ ) and the associated configuration file (\ ``bert_config.json``\ ), and creates a PyTorch model for this configuration, loads the weights from the TensorFlow checkpoint in the PyTorch model and saves the resulting model in a standard PyTorch save file that can be imported using ``torch.load()`` (see examples in `run_bert_extract_features.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_extract_features.py>`_\ , `run_bert_classifier.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_classifier.py>`_ and `run_bert_squad.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_squad.py>`_\ ).

@@ -33,6 +33,26 @@ Here is an example of the conversion process for a pre-trained ``BERT-Base Uncas

 You can download Google's pre-trained models for the conversion `here <https://github.com/google-research/bert#pre-trained-models>`__.

+ALBERT
+^^^^^^
+
+Convert TensorFlow model checkpoints of ALBERT to PyTorch using the `convert_albert_original_tf_checkpoint_to_pytorch.py <https://github.com/huggingface/transformers/blob/master/src/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py>`_ script.
+
+The CLI takes as input a TensorFlow checkpoint (three files starting with ``model.ckpt-best``\ ) and the accompanying configuration file (\ ``albert_config.json``\ ), then creates and saves a PyTorch model. To run this conversion you will need to have TensorFlow and PyTorch installed.
+
+Here is an example of the conversion process for the pre-trained ``ALBERT Base`` model:
+
+.. code-block:: shell
+
+   export ALBERT_BASE_DIR=/path/to/albert/albert_base
+
+   transformers-cli convert --model_type albert \
+     --tf_checkpoint $ALBERT_BASE_DIR/model.ckpt-best \
+     --config $ALBERT_BASE_DIR/albert_config.json \
+     --pytorch_dump_output $ALBERT_BASE_DIR/pytorch_model.bin
+
+You can download Google's pre-trained models for the conversion `here <https://github.com/google-research/albert#pre-trained-models>`__.
+
 OpenAI GPT
 ^^^^^^^^^^

--- a/docs/source/glossary.rst
+++ b/docs/source/glossary.rst
@@ -1,11 +1,41 @@
 Glossary
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+^^^^^^^^
+
+General terms
+-------------
+
+- autoencoding models: see MLM
+- autoregressive models: see CLM
+- CLM: causal language modeling, a pretraining task where the model reads the texts in order and has to predict the
+  next word. It's usually done by reading the whole sentence but using a mask inside the model to hide the future 
+  tokens at a certain timestep.
+- MLM: masked language modeling, a pretraining task where the model sees a corrupted version of the texts, usually done
+  by masking some tokens randomly, and has to predict the original text.
+- multimodal: a task that combines texts with another kind of input (for instance images).
+- NLG: natural language generation, all tasks related to generating text (for instance talk with transformers,
+  translation)
+- NLP: natural language processing, a generic way to say "deal with texts".
+- NLU: natural language understanding, all tasks related to understanding what is in a text (for instance classifying
+  the whole text, individual words)
+- pretrained model: a model that has been pretrained on some data (for instance all of Wikipedia). Pretraining methods
+  involve a self-supervised objective, which can be reading the text and trying to predict the next word (see CLM) or 
+  masking some words and trying to predict them (see MLM).
+- RNN: recurrent neural network, a type of model that uses a loop over a layer to process texts.
+- seq2seq or sequence-to-sequence: models that generate a new sequence from an input, like translation models, or
+  summarization models (such as :doc:`Bart </model_doc/bart>` or :doc:`T5 </model_doc/t5>`).
+- token: a part of a sentence, usually a word, but can also be a subword (non-common words are often split in subwords)
+  or a punctuation symbol.
+
+Model inputs
+------------

 Every model is different yet bears similarities with the others. Therefore most models use the same inputs, which are
 detailed here alongside usage examples.

+.. _input-ids:
+
 Input IDs
--------------------------
+~~~~~~~~~

 The input ids are often the only required parameters to be passed to the model as input. *They are token indices,
 numerical representations of tokens building the sequences that will be used as input by the model*.
@@ -15,33 +45,62 @@ tokenizer, which is a `WordPiece <https://arxiv.org/pdf/1609.08144.pdf>`__ token

 ::

-    from transformers import BertTokenizer
-    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
+    >>> from transformers import BertTokenizer
+    >>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

-    sequence = "A Titan RTX has 24GB of VRAM"
+    >>> sequence = "A Titan RTX has 24GB of VRAM"

 The tokenizer takes care of splitting the sequence into tokens available in the tokenizer vocabulary.

 ::

-    # Continuation of the previous script
-    tokenized_sequence = tokenizer.tokenize(sequence)
-    assert tokenized_sequence == ['A', 'Titan', 'R', '##T', '##X', 'has', '24', '##GB', 'of', 'V', '##RA', '##M']
+    >>> tokenized_sequence = tokenizer.tokenize(sequence)

-These tokens can then be converted into IDs which are understandable by the model. Several methods are available for
-this, the recommended being `encode` or `encode_plus`, which leverage the Rust implementation of
+The tokens are either words or subwords. Here for instance, "VRAM" wasn't in the model vocabulary, so it's been split
+into "V", "RA" and "M". To indicate those tokens are not separate words but parts of the same word, a double-hash
+prefix (``##``) is added for "RA" and "M":
+
+::
+
+    >>> print(tokenized_sequence)
+    ['A', 'Titan', 'R', '##T', '##X', 'has', '24', '##GB', 'of', 'V', '##RA', '##M']
+
+These tokens can then be converted into IDs which are understandable by the model. This can be done by directly feeding
+the sentence to the tokenizer, which leverages the Rust implementation of
 `huggingface/tokenizers <https://github.com/huggingface/tokenizers>`__ for peak performance.

 ::

-    # Continuation of the previous script
-    encoded_sequence = tokenizer.encode(sequence)
-    assert encoded_sequence == [101, 138, 18696, 155, 1942, 3190, 1144, 1572, 13745, 1104, 159, 9664, 2107, 102]
+    >>> encoded_sequence = tokenizer(sequence)["input_ids"]

-The `encode` and `encode_plus` methods automatically add "special tokens" which are special IDs the model uses.
+The tokenizer returns a dictionary with all the arguments necessary for its corresponding model to work properly. The
+token indices are under the key "input_ids":
+
+::
+
+    >>> print(encoded_sequence)
+    [101, 138, 18696, 155, 1942, 3190, 1144, 1572, 13745, 1104, 159, 9664, 2107, 102]
+
+Note that the tokenizer automatically adds "special tokens" (if the associated model relies on them) which are special
+IDs the model sometimes uses. If we decode the previous sequence of ids,
+
+::
+
+    >>> decoded_sequence = tokenizer.decode(encoded_sequence)
+
+we will see 
+
+::
+
+    >>> print(decoded_sequence)
+    [CLS] A Titan RTX has 24GB of VRAM [SEP]
+
+because this is the way a :class:`~transformers.BertModel` is going to expect its inputs.
+
+.. _attention-mask:

 Attention mask
--------------------------
+~~~~~~~~~~~~~~

 The attention mask is an optional argument used when batching sequences together. This argument indicates to the
 model which tokens should be attended to, and which should not.
@@ -50,50 +109,53 @@ For example, consider these two sequences:

 ::

-    from transformers import BertTokenizer
-    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
+    >>> from transformers import BertTokenizer
+    >>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

-    sequence_a = "This is a short sequence."
-    sequence_b = "This is a rather long sequence. It is at least longer than the sequence A."
+    >>> sequence_a = "This is a short sequence."
+    >>> sequence_b = "This is a rather long sequence. It is at least longer than the sequence A."

-    encoded_sequence_a = tokenizer.encode(sequence_a)
-    assert len(encoded_sequence_a) == 8
+    >>> encoded_sequence_a = tokenizer(sequence_a)["input_ids"]
+    >>> encoded_sequence_b = tokenizer(sequence_b)["input_ids"]

-    encoded_sequence_b = tokenizer.encode(sequence_b)
-    assert len(encoded_sequence_b) == 19
-
-These two sequences have different lengths and therefore can't be put together in a same tensor as-is. The first
-sequence needs to be padded up to the length of the second one, or the second one needs to be truncated down to
-the length of the first one.
-
-In the first case, the list of IDs will be extended by the padding indices:
+The encoded versions have different lengths:

 ::

-    # Continuation of the previous script
-    padded_sequence_a = tokenizer.encode(sequence_a, max_length=19, pad_to_max_length=True)
+    >>> len(encoded_sequence_a), len(encoded_sequence_b)
+    (8, 19)

-    assert padded_sequence_a == [101, 1188, 1110, 170, 1603, 4954,  119, 102,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,   0]
-    assert encoded_sequence_b == [101, 1188, 1110, 170, 1897, 1263, 4954, 119, 1135, 1110, 1120, 1655, 2039, 1190, 1103, 4954, 138, 119, 102]
+Therefore, we can't put them together in the same tensor as-is. The first sequence needs to be padded up to the length
+of the second one, or the second one needs to be truncated down to the length of the first one.

-These can then be converted into a tensor in PyTorch or TensorFlow. The attention mask is a binary tensor indicating
+In the first case, the list of IDs will be extended by the padding indices. We can pass a list to the tokenizer and ask
+it to pad like this:
+
+::
+
+    >>> padded_sequences = tokenizer([sequence_a, sequence_b], padding=True)
+
+We can see that 0s have been added on the right of the first sentence to make it the same length as the second one:
+
+::
+
+    >>> padded_sequences["input_ids"]
+    [[101, 1188, 1110, 170, 1603, 4954, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 1188, 1110, 170, 1897, 1263, 4954, 119, 1135, 1110, 1120, 1655, 2039, 1190, 1103, 4954, 138, 119, 102]]
+
+This can then be converted into a tensor in PyTorch or TensorFlow. The attention mask is a binary tensor indicating
 the position of the padded indices so that the model does not attend to them. For the
 :class:`~transformers.BertTokenizer`, :obj:`1` indicate a value that should be attended to while :obj:`0` indicate
-a padded value.
-
-The method :func:`~transformers.PreTrainedTokenizer.encode_plus` may be used to obtain the attention mask directly:
+a padded value. This attention mask is in the dictionary returned by the tokenizer under the key "attention_mask":

 ::

-    # Continuation of the previous script
-    sequence_a_dict = tokenizer.encode_plus(sequence_a, max_length=19, pad_to_max_length=True)
-
-    assert sequence_a_dict['input_ids'] == [101, 1188, 1110, 170, 1603, 4954, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
-    assert sequence_a_dict['attention_mask'] == [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+    >>> padded_sequences["attention_mask"]
+    [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]

+.. _token-type-ids:

 Token Type IDs
--------------------------
+~~~~~~~~~~~~~~

 Some models' purpose is to do sequence classification or question answering. These require two different sequences to
 be encoded in the same input IDs. They are usually separated by special tokens, such as the classifier and separator
@@ -101,38 +163,47 @@ tokens. For example, the BERT model builds its two sequence input as such:

 ::

-    from transformers import BertTokenizer
-    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
+   >>> # [CLS] SEQUENCE_A [SEP] SEQUENCE_B [SEP]

-    # [CLS] SEQ_A [SEP] SEQ_B [SEP]
-
-    sequence_a = "HuggingFace is based in NYC"
-    sequence_b = "Where is HuggingFace based?"
-
-    encoded_sequence = tokenizer.encode(sequence_a, sequence_b)
-    assert tokenizer.decode(encoded_sequence) == "[CLS] HuggingFace is based in NYC [SEP] Where is HuggingFace based? [SEP]"
-
-This is enough for some models to understand where one sequence ends and where another begins. However, other models
-such as BERT have an additional mechanism, which are the segment IDs. The Token Type IDs are a binary mask identifying
-the different sequences in the model.
-
-We can leverage :func:`~transformers.PreTrainedTokenizer.encode_plus` to output the Token Type IDs for us:
+We can use our tokenizer to automatically generate such a sentence by passing the two sequences as two arguments (and
+not a list like before) like this:

 ::

-    # Continuation of the previous script
-    encoded_dict = tokenizer.encode_plus(sequence_a, sequence_b)
+    >>> from transformers import BertTokenizer
+    >>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
+    >>> sequence_a = "HuggingFace is based in NYC"
+    >>> sequence_b = "Where is HuggingFace based?"

-    assert encoded_dict['input_ids'] == [101, 20164, 10932, 2271, 7954, 1110, 1359, 1107, 17520, 102, 2777, 1110, 20164, 10932, 2271, 7954, 1359, 136, 102]
-    assert encoded_dict['token_type_ids'] == [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]
+    >>> encoded_dict = tokenizer(sequence_a, sequence_b)
+    >>> decoded = tokenizer.decode(encoded_dict["input_ids"])
+
+which will return:
+
+::
+
+    >>> print(decoded)
+    [CLS] HuggingFace is based in NYC [SEP] Where is HuggingFace based? [SEP]
+
+This is enough for some models to understand where one sequence ends and where another begins. However, other models
+such as BERT have an additional mechanism, which are the token type IDs (also called segment IDs). They are a binary
+mask identifying the different sequences in the model.
+
+The tokenizer returns this mask in the dictionary under the key "token_type_ids":
+
+::
+
+    >>> encoded_dict['token_type_ids']
+    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]

 The first sequence, the "context" used for the question, has all its tokens represented by :obj:`0`, whereas the
 question has all its tokens represented by :obj:`1`. Some models, like :class:`~transformers.XLNetModel` use an
 additional token represented by a :obj:`2`.

+.. _position-ids:

 Position IDs
--------------------------
+~~~~~~~~~~~~

 The position IDs are used by the model to identify which token is at which position. Contrary to RNNs that have the
 position of each token embedded within them, transformers are unaware of the position of each token. The position
@@ -143,3 +214,25 @@ positional embeddings.

 Absolute positional embeddings are selected in the range ``[0, config.max_position_embeddings - 1]``. Some models
 use other types of positional embeddings, such as sinusoidal position embeddings or relative position embeddings.
+
+.. _feed-forward-chunking:
+
+Feed Forward Chunking
+~~~~~~~~~~~~~~~~~~~~~
+
+In transformers, two feed forward layers usually follow the self attention layer in each residual attention block.
+The intermediate embedding size of the feed forward layers is often bigger than the hidden size of the model (e.g.,
+for ``bert-base-uncased``). 
+
+For an input of size ``[batch_size, sequence_length]``, the memory required to store the intermediate feed forward
+embeddings ``[batch_size, sequence_length, config.intermediate_size]`` can account for a large fraction of the memory
+use. The authors of `Reformer: The Efficient Transformer <https://arxiv.org/abs/2001.04451>`_ noticed that since the
+computation is independent of the ``sequence_length`` dimension, it is mathematically equivalent to compute the output
+embeddings of both feed forward layers ``[batch_size, config.hidden_size]_0, ..., [batch_size, config.hidden_size]_n`` 
+individually and concat them afterward to ``[batch_size, sequence_length, config.hidden_size]`` with
+``n = sequence_length``, which trades increased computation time against reduced memory use, but yields a
+mathematically **equivalent** result.
+
+For models employing the function :func:`~.transformers.apply_chunking_to_forward`, the ``chunk_size`` defines the
+number of output embeddings that are computed in parallel and thus defines the trade-off between memory and time
+complexity.  If ``chunk_size`` is set to 0, no feed forward chunking is done.
--- a/docs/source/imgs/local_attention_mask.png
+++ b/docs/source/imgs/local_attention_mask.png
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -1,17 +1,18 @@
 Transformers
 ================================================================================================================================================

-🤗 Transformers (formerly known as `pytorch-transformers` and `pytorch-pretrained-bert`) provides general-purpose architectures
-(BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet...) for Natural Language Understanding (NLU) and Natural Language Generation
-(NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between TensorFlow 2.0 and PyTorch.
+State-of-the-art Natural Language Processing for PyTorch and TensorFlow 2.0.

-This is the documentation of our repository `transformers <https://github.com/huggingface/transformers>`__.
+🤗 Transformers (formerly known as `pytorch-transformers` and `pytorch-pretrained-bert`) provides general-purpose
+architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet...) for Natural Language Understanding (NLU) and Natural
+Language Generation (NLG) with 32+ pretrained models in 100+ languages and deep interoperability between
+TensorFlow 2.0 and PyTorch.
+
+This is the documentation of our repository `transformers <https://github.com/huggingface/transformers>`_.

 Features
 ---------------------------------------------------

- As easy to use as pytorch-transformers
- As powerful and concise as Keras
 - High performance on NLU and NLG tasks
 - Low barrier to entry for educators and practitioners

@@ -37,45 +38,134 @@ Choose the right framework for every part of a model's lifetime:
 Contents
 ---------------------------------

-The library currently contains PyTorch and Tensorflow implementations, pre-trained model weights, usage scripts and conversion utilities for the following models:
+The documentation is organized in five parts:

-1. `BERT <https://github.com/google-research/bert>`_ (from Google) released with the paper `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding <https://arxiv.org/abs/1810.04805>`_ by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
-2. `GPT <https://github.com/openai/finetune-transformer-lm>`_ (from OpenAI) released with the paper `Improving Language Understanding by Generative Pre-Training <https://blog.openai.com/language-unsupervised>`_ by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
-3. `GPT-2 <https://blog.openai.com/better-language-models>`_ (from OpenAI) released with the paper `Language Models are Unsupervised Multitask Learners <https://blog.openai.com/better-language-models>`_ by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
-4. `Transformer-XL <https://github.com/kimiyoung/transformer-xl>`_ (from Google/CMU) released with the paper `Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context <https://arxiv.org/abs/1901.02860>`_ by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
-5. `XLNet <https://github.com/zihangdai/xlnet>`_ (from Google/CMU) released with the paper `XLNet: Generalized Autoregressive Pretraining for Language Understanding <https://arxiv.org/abs/1906.08237>`_ by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
-6. `XLM <https://github.com/facebookresearch/XLM>`_ (from Facebook) released together with the paper `Cross-lingual Language Model Pretraining <https://arxiv.org/abs/1901.07291>`_ by Guillaume Lample and Alexis Conneau.
-7. `RoBERTa <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`_ (from Facebook), released together with the paper a `Robustly Optimized BERT Pretraining Approach <https://arxiv.org/abs/1907.11692>`_ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
-8. `DistilBERT <https://huggingface.co/transformers/model_doc/distilbert.html>`_ (from HuggingFace) released together with the paper `DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter <https://arxiv.org/abs/1910.01108>`_ by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into `DistilGPT2 <https://github.com/huggingface/transformers/tree/master/examples/distillation>`_.
-9. `CTRL <https://github.com/pytorch/fairseq/tree/master/examples/ctrl>`_ (from Salesforce), released together with the paper `CTRL: A Conditional Transformer Language Model for Controllable Generation <https://www.github.com/salesforce/ctrl>`_ by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
-10. `CamemBERT <https://huggingface.co/transformers/model_doc/camembert.html>`_ (from FAIR, Inria, Sorbonne Université) released together with the paper `CamemBERT: a Tasty French Language Model <https://arxiv.org/abs/1911.03894>`_ by Louis Martin, Benjamin Muller, Pedro Javier Ortiz Suarez, Yoann Dupont, Laurent Romary, Eric Villemonte de la Clergerie, Djame Seddah, and Benoît Sagot.
-11. `ALBERT <https://github.com/google-research/ALBERT>`_ (from Google Research), released together with the paper a `ALBERT: A Lite BERT for Self-supervised Learning of Language Representations <https://arxiv.org/abs/1909.11942>`_ by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
-12. `XLM-RoBERTa <https://github.com/pytorch/fairseq/tree/master/examples/xlmr>`_ (from Facebook AI), released together with the paper `Unsupervised Cross-lingual Representation Learning at Scale <https://arxiv.org/abs/1911.02116>`_ by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
-13. `FlauBERT <https://github.com/getalp/Flaubert>`_ (from CNRS) released with the paper `FlauBERT: Unsupervised Language Model Pre-training for French <https://arxiv.org/abs/1912.05372>`_ by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
+- **GET STARTED** contains a quick tour, the installation instructions and some useful information about our philosophy
+  and a glossary.
+- **USING 🤗 TRANSFORMERS** contains general tutorials on how to use the library.
+- **ADVANCED GUIDES** contains more advanced guides that are more specific to a given script or part of the library.
+- **RESEARCH** focuses on tutorials that have less to do with how to use the library but more about general research in
+  transformer models.
+- **PACKAGE REFERENCE** contains the documentation of each public class and function.
+
+The library currently contains PyTorch and Tensorflow implementations, pre-trained model weights, usage scripts and
+conversion utilities for the following models:
+
+1. `BERT <https://github.com/google-research/bert>`_ (from Google) released with the paper `BERT: Pre-training of Deep
+   Bidirectional Transformers for Language Understanding <https://arxiv.org/abs/1810.04805>`_ by Jacob Devlin, Ming-Wei
+   Chang, Kenton Lee, and Kristina Toutanova.
+2. `GPT <https://github.com/openai/finetune-transformer-lm>`_ (from OpenAI) released with the paper `Improving Language
+   Understanding by Generative Pre-Training <https://blog.openai.com/language-unsupervised>`_ by Alec Radford, Karthik
+   Narasimhan, Tim Salimans, and Ilya Sutskever.
+3. `GPT-2 <https://blog.openai.com/better-language-models>`_ (from OpenAI) released with the paper `Language Models are
+   Unsupervised Multitask Learners <https://blog.openai.com/better-language-models>`_ by Alec Radford, Jeffrey Wu,
+   Rewon Child, David Luan, Dario Amodei, and Ilya Sutskever.
+4. `Transformer-XL <https://github.com/kimiyoung/transformer-xl>`_ (from Google/CMU) released with the paper
+   `Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context <https://arxiv.org/abs/1901.02860>`_ by
+   Zihang Dai, Zhilin Yang, Yiming Yang, Jaime Carbonell, Quoc V. Le, and Ruslan Salakhutdinov.
+5. `XLNet <https://github.com/zihangdai/xlnet>`_ (from Google/CMU) released with the paper `XLNet: Generalized
+   Autoregressive Pretraining for Language Understanding <https://arxiv.org/abs/1906.08237>`_ by Zhilin Yang, Zihang
+   Dai, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, and Quoc V. Le.
+6. `XLM <https://github.com/facebookresearch/XLM>`_ (from Facebook) released together with the paper `Cross-lingual
+   Language Model Pretraining <https://arxiv.org/abs/1901.07291>`_ by Guillaume Lample and Alexis Conneau.
+7. `RoBERTa <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`_ (from Facebook), released together with
+   the paper `RoBERTa: A Robustly Optimized BERT Pretraining Approach <https://arxiv.org/abs/1907.11692>`_ by Yinhan
+   Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and
+   Veselin Stoyanov.
+8. `DistilBERT <https://huggingface.co/transformers/model_doc/distilbert.html>`_ (from HuggingFace) released together
+   with the paper `DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter
+   <https://arxiv.org/abs/1910.01108>`_ by Victor Sanh, Lysandre Debut, and Thomas Wolf. The same method has been
+   applied to compress GPT2 into
+   `DistilGPT2 <https://github.com/huggingface/transformers/tree/master/examples/distillation>`_.
+9. `CTRL <https://github.com/pytorch/fairseq/tree/master/examples/ctrl>`_ (from Salesforce), released together with the
+   paper `CTRL: A Conditional Transformer Language Model for Controllable Generation
+   <https://www.github.com/salesforce/ctrl>`_ by Nitish Shirish Keskar, Bryan McCann, Lav R. Varshney, Caiming Xiong,
+   and Richard Socher.
+10. `CamemBERT <https://huggingface.co/transformers/model_doc/camembert.html>`_ (from FAIR, Inria, Sorbonne Université)
+    released together with the paper `CamemBERT: a Tasty French Language Model <https://arxiv.org/abs/1911.03894>`_ by
+    Louis Martin, Benjamin Muller, Pedro Javier Ortiz Suarez, Yoann Dupont, Laurent Romary, Eric Villemonte de la
+    Clergerie, Djame Seddah, and Benoît Sagot.
+11. `ALBERT <https://github.com/google-research/ALBERT>`_ (from Google Research), released together with the paper
+    `ALBERT: A Lite BERT for Self-supervised Learning of Language Representations <https://arxiv.org/abs/1909.11942>`_
+    by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, and Radu Soricut.
+12. `T5 <https://github.com/google-research/text-to-text-transfer-transformer>`_ (from Google) released with the paper
+    `Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer
+    <https://arxiv.org/abs/1910.10683>`_ by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang,
+    Michael Matena, Yanqi Zhou, Wei Li, and Peter J. Liu.
+13. `XLM-RoBERTa <https://github.com/pytorch/fairseq/tree/master/examples/xlmr>`_ (from Facebook AI), released together
+    with the paper `Unsupervised Cross-lingual Representation Learning at Scale <https://arxiv.org/abs/1911.02116>`_ by
+    Alexis Conneau, Kartikay Khandelwal, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard
+    Grave, Myle Ott, Luke Zettlemoyer, and Veselin Stoyanov.
+14. `MMBT <https://github.com/facebookresearch/mmbt/>`_ (from Facebook), released together with the paper `Supervised
+    Multimodal Bitransformers for Classifying Images and Text <https://arxiv.org/pdf/1909.02950.pdf>`_ by Douwe Kiela,
+    Suvrat Bhooshan, Hamed Firooz, and Davide Testuggine.
+15. `FlauBERT <https://github.com/getalp/Flaubert>`_ (from CNRS) released with the paper `FlauBERT: Unsupervised
+    Language Model Pre-training for French <https://arxiv.org/abs/1912.05372>`_ by Hang Le, Loïc Vial, Jibril Frej,
+    Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, and
+    Didier Schwab.
+16. `BART <https://github.com/pytorch/fairseq/tree/master/examples/bart>`_ (from Facebook) released with the paper
+    `BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension
+    <https://arxiv.org/pdf/1910.13461.pdf>`_ by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman
+    Mohamed, Omer Levy, Ves Stoyanov, and Luke Zettlemoyer.
+17. `ELECTRA <https://github.com/google-research/electra>`_ (from Google Research/Stanford University) released with
+    the paper `ELECTRA: Pre-training text encoders as discriminators rather than generators
+    <https://arxiv.org/abs/2003.10555>`_ by Kevin Clark, Minh-Thang Luong, Quoc V. Le, and Christopher D. Manning.
+18. `DialoGPT <https://github.com/microsoft/DialoGPT>`_ (from Microsoft Research) released with the paper `DialoGPT:
+    Large-Scale Generative Pre-training for Conversational Response Generation <https://arxiv.org/abs/1911.00536>`_ by
+    Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu,
+    and Bill Dolan.
+19. `Reformer <https://github.com/google/trax/tree/master/trax/models/reformer>`_ (from Google Research) released with
+    the paper `Reformer: The Efficient Transformer <https://arxiv.org/abs/2001.04451>`_ by Nikita Kitaev, Łukasz
+    Kaiser, and Anselm Levskaya.
+20. `MarianMT <https://marian-nmt.github.io/>`_ (developed by the Microsoft Translator Team) machine translation models
+    trained using `OPUS <http://opus.nlpl.eu/>`_ data by Jörg Tiedemann.
+21. `Longformer <https://github.com/allenai/longformer>`_ (from AllenAI) released with the paper `Longformer: The
+    Long-Document Transformer <https://arxiv.org/abs/2004.05150>`_ by Iz Beltagy, Matthew E. Peters, and Arman Cohan.
+22. `Other community models <https://huggingface.co/models>`_, contributed by the `community
+    <https://huggingface.co/users>`_.

 .. toctree::
    :maxdepth: 2
-    :caption: Notes
+    :caption: Get started

+    quicktour
    installation
-    quickstart
+    philosophy
    glossary
-    pretrained_models
-    usage
+
+.. toctree::
+    :maxdepth: 2
+    :caption: Using 🤗 Transformers
+
+    task_summary
+    model_summary
+    preprocessing
+    training
    model_sharing
+    multilingual
+
+.. toctree::
+    :maxdepth: 2
+    :caption: Advanced guides
+
+    pretrained_models
    examples
    notebooks
-    serialization
    converting_tensorflow_models
    migration
-    bertology
    torchscript
-    multilingual
+    contributing
+
+.. toctree::
+    :maxdepth: 2
+    :caption: Research
+
+    bertology
    benchmarks

 .. toctree::
    :maxdepth: 2
-    :caption: Main classes
+    :caption: Package Reference

    main_classes/configuration
    main_classes/model
@@ -83,12 +173,8 @@ The library currently contains PyTorch and Tensorflow implementations, pre-train
    main_classes/pipelines
    main_classes/optimizer_schedules
    main_classes/processors
-
-.. toctree::
-    :maxdepth: 2
-    :caption: Package Reference
-
    model_doc/auto
+    model_doc/encoderdecoder
    model_doc/bert
    model_doc/gpt
    model_doc/transformerxl
@@ -104,3 +190,10 @@ The library currently contains PyTorch and Tensorflow implementations, pre-train
    model_doc/flaubert
    model_doc/bart
    model_doc/t5
+    model_doc/electra
+    model_doc/dialogpt
+    model_doc/reformer
+    model_doc/marian
+    model_doc/longformer
+    model_doc/retribert
+    model_doc/mobilebert
--- a/docs/source/installation.md
+++ b/docs/source/installation.md
@@ -1,51 +1,102 @@
 # Installation

-Transformers is tested on Python 3.6+ and PyTorch 1.1.0
+🤗 Transformers is tested on Python 3.6+, and PyTorch 1.1.0+ or TensorFlow 2.0+.

-## With pip
+You should install 🤗 Transformers in a [virtual environment](https://docs.python.org/3/library/venv.html). If you're
+unfamiliar with Python virtual environments, check out the [user guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). Create a virtual environment with the version of Python you're going 
+to use and activate it.

-PyTorch Transformers can be installed using pip as follows:
+Now, if you want to use 🤗 Transformers, you can install it with pip. If you'd like to play with the examples, you
+must install it from source.

-``` bash
+## Installation with pip
+
+First you need to install one of, or both, TensorFlow 2.0 and PyTorch.
+Please refer to [TensorFlow installation page](https://www.tensorflow.org/install/pip#tensorflow-2.0-rc-is-available) 
+and/or [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) regarding the specific 
+install command for your platform.
+
+When TensorFlow 2.0 and/or PyTorch has been installed, 🤗 Transformers can be installed using pip as follows:
+
+```bash
 pip install transformers
 ```

-## From source
+Alternatively, for CPU-support only, you can install 🤗 Transformers and PyTorch in one line with

-To install from source, clone the repository and install with:
+```bash
+pip install transformers[torch]
+```
+
+or 🤗 Transformers and TensorFlow 2.0 in one line with
+
+```bash
+pip install transformers[tf-cpu]
+```
+
+To check 🤗 Transformers is properly installed, run the following command:
+
+```bash
+python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('I hate you'))"
+```
+
+It should download a pretrained model then print something like
+
+```bash
+[{'label': 'NEGATIVE', 'score': 0.9991129040718079}]
+```
+
+(Note that TensorFlow will print additional stuff before that last statement.)
+
+## Installing from source
+
+To install from source, clone the repository and install with the following commands:

 ``` bash
 git clone https://github.com/huggingface/transformers.git
 cd transformers
-pip install .
+pip install -e .
 ```

-## Tests
+Again, you can run 

-An extensive test suite is included to test the library behavior and several examples. Library tests can be found in the [tests folder](https://github.com/huggingface/transformers/tree/master/tests) and examples tests in the [examples folder](https://github.com/huggingface/transformers/tree/master/examples).
-
-Refer to the [contributing guide](https://github.com/huggingface/transformers/blob/master/CONTRIBUTING.md#tests) for details about running tests.
-
-## OpenAI GPT original tokenization workflow
-
-If you want to reproduce the original tokenization process of the `OpenAI GPT` paper, you will need to install `ftfy` and `SpaCy`:
-
-``` bash
-pip install spacy ftfy==4.4.3
-python -m spacy download en
+```bash
+python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('I hate you'))"
 ```

-If you don't install `ftfy` and `SpaCy`, the `OpenAI GPT` tokenizer will default to tokenize using BERT's `BasicTokenizer` followed by Byte-Pair Encoding (which should be fine for most usage, don't worry).
+to check 🤗 Transformers is properly installed.

-## Note on model downloads (Continuous Integration or large-scale deployments)
+## Caching models

-If you expect to be downloading large volumes of models (more than 1,000) from our hosted bucket (for instance through your CI setup, or a large-scale production deployment), please cache the model files on your end. It will be way faster, and cheaper. Feel free to contact us privately if you need any help.
+This library provides pretrained models that will be downloaded and cached locally. Unless you specify a location with
+`cache_dir=...` when you use methods like `from_pretrained`, these models will automatically be downloaded in the
+folder given by the shell environment variable ``TRANSFORMERS_CACHE``. The default value for it will be the PyTorch
+cache home followed by ``/transformers/`` (even if you don't have PyTorch installed). This is (by order of priority):
+
+  * shell environment variable ``TORCH_HOME``
+  * shell environment variable ``XDG_CACHE_HOME`` + ``/torch/``
+  * default: ``~/.cache/torch/``
+
+So if you don't have any specific environment variable set, the cache directory will be at
+``~/.cache/torch/transformers/``.
+
+**Note:** If you have set a shell environment variable for one of the predecessors of this library
+(``PYTORCH_TRANSFORMERS_CACHE`` or ``PYTORCH_PRETRAINED_BERT_CACHE``), those will be used if there is no shell
+environment variable for ``TRANSFORMERS_CACHE``.
+
+### Note on model downloads (Continuous Integration or large-scale deployments)
+
+If you expect to be downloading large volumes of models (more than 1,000) from our hosted bucket (for instance through
+your CI setup, or a large-scale production deployment), please cache the model files on your end. It will be way
+faster, and cheaper. Feel free to contact us privately if you need any help.

 ## Do you want to run a Transformer model on a mobile device?

 You should check out our [swift-coreml-transformers](https://github.com/huggingface/swift-coreml-transformers) repo.

-It contains a set of tools to convert PyTorch or TensorFlow 2.0 trained Transformer models (currently contains `GPT-2`, `DistilGPT-2`, `BERT`, and `DistilBERT`) to CoreML models that run on iOS devices.
+It contains a set of tools to convert PyTorch or TensorFlow 2.0 trained Transformer models (currently contains `GPT-2`, 
+`DistilGPT-2`, `BERT`, and `DistilBERT`) to CoreML models that run on iOS devices.

-At some point in the future, you'll be able to seamlessly move from pre-training or fine-tuning models in PyTorch to productizing them in CoreML,
-or prototype a model or an app in CoreML then research its hyperparameters or architecture from PyTorch. Super exciting!
+At some point in the future, you'll be able to seamlessly move from pre-training or fine-tuning models in PyTorch or
+TensorFlow 2.0 to productizing them in CoreML, or prototype a model or an app in CoreML then research its
+hyperparameters or architecture from PyTorch or TensorFlow 2.0. Super exciting!
--- a/docs/source/main_classes/model.rst
+++ b/docs/source/main_classes/model.rst
@@ -14,6 +14,12 @@ The base class ``PreTrainedModel`` implements the common methods for loading/sav
 .. autoclass:: transformers.PreTrainedModel
    :members:

+``Helper Functions``
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autofunction:: transformers.apply_chunking_to_forward
+
+
 ``TFPreTrainedModel``
 ~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/main_classes/optimizer_schedules.rst
+++ b/docs/source/main_classes/optimizer_schedules.rst
@@ -17,7 +17,6 @@ The ``.optimization`` module provides:
 ~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.AdamWeightDecay
-    :members:

 .. autofunction:: transformers.create_optimizer

--- a/docs/source/main_classes/pipelines.rst
+++ b/docs/source/main_classes/pipelines.rst
@@ -7,8 +7,8 @@ Recognition, Masked Language Modeling, Sentiment Analysis, Feature Extraction an

 There are two categories of pipeline abstractions to be aware about:

- The :class:`~transformers.pipeline` which is the most powerful object encapsulating all other pipelines
- The other task-specific pipelines, such as :class:`~transformers.NerPipeline`
+- The :func:`~transformers.pipeline` which is the most powerful object encapsulating all other pipelines
+- The other task-specific pipelines, such as :class:`~transformers.TokenClassificationPipeline`
  or :class:`~transformers.QuestionAnsweringPipeline`

 The pipeline abstraction
@@ -17,8 +17,7 @@ The pipeline abstraction
 The `pipeline` abstraction is a wrapper around all the other available pipelines. It is instantiated as any
 other pipeline but requires an additional argument which is the `task`.

-.. autoclass:: transformers.pipeline
-    :members:
+.. autofunction:: transformers.pipeline


 The task specific pipelines
@@ -30,15 +29,15 @@ Parent class: Pipeline
 .. autoclass:: transformers.Pipeline
    :members: predict, transform, save_pretrained

-NerPipeline
-==========================================
-
-.. autoclass:: transformers.NerPipeline
-
 TokenClassificationPipeline
 ==========================================

-This class is an alias of the :class:`~transformers.NerPipeline` defined above. Please refer to that pipeline for
+.. autoclass:: transformers.TokenClassificationPipeline
+
+NerPipeline
+==========================================
+
+This class is an alias of the :class:`~transformers.TokenClassificationPipeline` defined above. Please refer to that pipeline for
 documentation and usage examples.

 FillMaskPipeline
@@ -66,3 +65,9 @@ SummarizationPipeline
 ==========================================

 .. autoclass:: transformers.SummarizationPipeline
+
+
+TextGenerationPipeline
+==========================================
+
+.. autoclass:: transformers.TextGenerationPipeline
--- a/docs/source/main_classes/processors.rst
+++ b/docs/source/main_classes/processors.rst
@@ -54,7 +54,7 @@ Additionally, the following method  can be used to load values from a data file
 Example usage
 ^^^^^^^^^^^^^^^^^^^^^^^^^

-An example using these processors is given in the `run_glue.py <https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_glue.py>`__ script.
+An example using these processors is given in the `run_glue.py <https://github.com/huggingface/pytorch-transformers/blob/master/examples/text-classification/run_glue.py>`__ script.


 XNLI
@@ -74,7 +74,7 @@ This library hosts the processor to load the XNLI data:
 Please note that since the gold labels are available on the test set, evaluation is performed on the test set.

 An example using these processors is given in the
-`run_xnli.py <https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_xnli.py>`__ script.
+`run_xnli.py <https://github.com/huggingface/pytorch-transformers/blob/master/examples/text-classification/run_xnli.py>`__ script.


 SQuAD
@@ -150,4 +150,4 @@ Example::


 Another example using these processors is given in the
-`run_squad.py <https://github.com/huggingface/transformers/blob/master/examples/run_squad.py>`__ script.
+`run_squad.py <https://github.com/huggingface/transformers/blob/master/examples/question-answering/run_squad.py>`__ script.
--- a/docs/source/main_classes/tokenizer.rst
+++ b/docs/source/main_classes/tokenizer.rst
@@ -1,16 +1,40 @@
 Tokenizer
 ----------------------------------------------------

-The base class ``PreTrainedTokenizer`` implements the common methods for loading/saving a tokenizer either from a local file or directory, or from a pretrained tokenizer provided by the library (downloaded from HuggingFace's AWS S3 repository).
+A tokenizer is in charge of preparing the inputs for a model. The library comprises tokenizers for all the models. Most of the tokenizers are available in two flavors: a full python implementation and a "Fast" implementation based on the Rust library `tokenizers`. The "Fast" implementations allow (1) a significant speed-up in particular when doing batched tokenization and (2) additional methods to map between the original string (character and words) and the token space (e.g. getting the index of the token comprising a given character or the span of characters corresponding to a given token). Currently no "Fast" implementation is available for the SentencePiece-based tokenizers (for T5, ALBERT, CamemBERT, XLMRoBERTa and XLNet models).

-``PreTrainedTokenizer`` is the main entry point into tokenizers as it also implements the main methods for using all the tokenizers:
+The base classes ``PreTrainedTokenizer`` and ``PreTrainedTokenizerFast`` implement the common methods for encoding string inputs into model inputs (see below) and instantiating/saving python and "Fast" tokenizers either from a local file or directory or from a pretrained tokenizer provided by the library (downloaded from HuggingFace's AWS S3 repository).

- tokenizing, converting tokens to ids and back and encoding/decoding,
+``PreTrainedTokenizer`` and ``PreTrainedTokenizerFast`` thus implement the main methods for using all the tokenizers:
+
+- tokenizing (splitting strings into sub-word token strings), converting token strings to ids and back, and encoding/decoding (i.e. tokenizing + converting to integers),
 - adding new tokens to the vocabulary in a way that is independant of the underlying structure (BPE, SentencePiece...),
- managing special tokens (adding them, assigning them to roles, making sure they are not split during tokenization)
+- managing special tokens such as mask, beginning-of-sentence, etc. (adding them, assigning them to attributes in the tokenizer for easy access and making sure they are not split during tokenization)
+
+``BatchEncoding`` holds the output of the tokenizer's encoding methods (``__call__``, ``encode_plus`` and ``batch_encode_plus``) and is derived from a Python dictionary. When the tokenizer is a pure python tokenizer, this class behaves just like a standard python dictionary and holds the various model inputs computed by these methods (``input_ids``, ``attention_mask``...). When the tokenizer is a "Fast" tokenizer (i.e. backed by HuggingFace tokenizers library), this class provides in addition several advanced alignment methods which can be used to map between the original string (character and words) and the token space (e.g. getting the index of the token comprising a given character or the span of characters corresponding to a given token).

 ``PreTrainedTokenizer``
 ~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.PreTrainedTokenizer
+    :special-members: __call__
+    :members:
+
+``PreTrainedTokenizerFast``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.PreTrainedTokenizerFast
+    :special-members: __call__
+    :members:
+
+``BatchEncoding``
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BatchEncoding
+    :members:
+
+``SpecialTokensMixin``
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.SpecialTokensMixin
    :members:
--- a/docs/source/migration.md
+++ b/docs/source/migration.md
@@ -1,17 +1,30 @@
-# Migrating from pytorch-pretrained-bert
+# Migrating from previous packages

+## Migrating from pytorch-transformers to 🤗 Transformers

-Here is a quick summary of what you should take care of when migrating from `pytorch-pretrained-bert` to `transformers`
+Here is a quick summary of what you should take care of when migrating from `pytorch-transformers` to 🤗 Transformers.
+
+### Positional order of some models' keywords inputs (`attention_mask`, `token_type_ids`...) changed
+
+To be able to use Torchscript (see #1010, #1204 and #1195) the specific order of some models' **keyword inputs** (`attention_mask`, `token_type_ids`...) has been changed.
+
+If you used to call the models with keyword names for keyword arguments, e.g. `model(inputs_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)`, this should not cause any change.
+
+If you used to call the models with positional inputs for keyword arguments, e.g. `model(inputs_ids, attention_mask, token_type_ids)`, you may have to double check the exact order of input arguments.
+
+## Migrating from pytorch-pretrained-bert
+
+Here is a quick summary of what you should take care of when migrating from `pytorch-pretrained-bert` to 🤗 Transformers

 ### Models always output `tuples`

-The main breaking change when migrating from `pytorch-pretrained-bert` to `transformers` is that the models forward method always outputs a `tuple` with various elements depending on the model and the configuration parameters.
+The main breaking change when migrating from `pytorch-pretrained-bert` to 🤗 Transformers is that the models forward method always outputs a `tuple` with various elements depending on the model and the configuration parameters.

 The exact content of the tuples for each model are detailled in the models' docstrings and the [documentation](https://huggingface.co/transformers/).

 In pretty much every case, you will be fine by taking the first element of the output as the output you previously used in `pytorch-pretrained-bert`.

-Here is a `pytorch-pretrained-bert` to `transformers` conversion example for a `BertForSequenceClassification` classification model:
+Here is a `pytorch-pretrained-bert` to 🤗 Transformers conversion example for a `BertForSequenceClassification` classification model:

 ```python
 # Let's load our model
@@ -20,14 +33,14 @@ model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
 # If you used to have this line in pytorch-pretrained-bert:
 loss = model(input_ids, labels=labels)

-# Now just use this line in transformers to extract the loss from the output tuple:
+# Now just use this line in 🤗 Transformers to extract the loss from the output tuple:
 outputs = model(input_ids, labels=labels)
 loss = outputs[0]

-# In transformers you can also have access to the logits:
+# In 🤗 Transformers you can also have access to the logits:
 loss, logits = outputs[:2]

-# And even the attention weigths if you configure the model to output them (and other outputs too, see the docstrings and documentation)
+# And even the attention weights if you configure the model to output them (and other outputs too, see the docstrings and documentation)
 model = BertForSequenceClassification.from_pretrained('bert-base-uncased', output_attentions=True)
 outputs = model(input_ids, labels=labels)
 loss, logits, attentions = outputs
@@ -96,7 +109,7 @@ for batch in train_data:
    loss.backward()
    optimizer.step()

-### In Transformers, optimizer and schedules are splitted and instantiated like this:
+### In 🤗 Transformers, optimizer and schedules are split and instantiated like this:
 optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False)  # To reproduce BertAdam specific behavior set correct_bias=False
 scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)  # PyTorch scheduler
 ### and used like this:
--- a/docs/source/model_doc/albert.rst
+++ b/docs/source/model_doc/albert.rst
@@ -6,7 +6,7 @@ Overview

 The ALBERT model was proposed in `ALBERT: A Lite BERT for Self-supervised Learning of Language Representations <https://arxiv.org/abs/1909.11942>`_
 by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. It presents
-two parameter-reduction techniques to lower memory consumption and increase the trainig speed of BERT:
+two parameter-reduction techniques to lower memory consumption and increase the training speed of BERT:

 - Splitting the embedding matrix into two smaller matrices
 - Using repeating layers split among groups
@@ -30,6 +30,8 @@ Tips:
  similar to a BERT-like architecture with the same number of hidden layers as it has to iterate through the same
  number of (repeating) layers.

+The original code can be found `here <https://github.com/google-research/ALBERT>`_.
+
 AlbertConfig
 ~~~~~~~~~~~~~~~~~~~~~

@@ -66,6 +68,20 @@ AlbertForSequenceClassification
    :members:


+AlbertForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.AlbertForMultipleChoice
+    :members:
+
+
+AlbertForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.AlbertForTokenClassification
+    :members:
+
+
 AlbertForQuestionAnswering
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -92,3 +108,24 @@ TFAlbertForSequenceClassification

 .. autoclass:: transformers.TFAlbertForSequenceClassification
    :members:
+
+
+TFAlbertForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFAlbertForMultipleChoice
+    :members:
+
+
+TFAlbertForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFAlbertForTokenClassification
+    :members:
+
+
+TFAlbertForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFAlbertForQuestionAnswering
+    :members:
--- a/docs/source/model_doc/auto.rst
+++ b/docs/source/model_doc/auto.rst
@@ -1,11 +1,15 @@
 AutoModels
 -----------

-In many cases, the architecture you want to use can be guessed from the name or the path of the pretrained model you are supplying to the ``from_pretrained`` method.
+In many cases, the architecture you want to use can be guessed from the name or the path of the pretrained model you
+are supplying to the ``from_pretrained`` method.

-AutoClasses are here to do this job for you so that you automatically retrieve the relevant model given the name/path to the pretrained weights/config/vocabulary:
+AutoClasses are here to do this job for you so that you automatically retrieve the relevant model given the name/path
+to the pretrained weights/config/vocabulary:

-Instantiating one of ``AutoModel``, ``AutoConfig`` and ``AutoTokenizer`` will directly create a class of the relevant architecture (ex: ``model = AutoModel.from_pretrained('bert-base-cased')`` will create a instance of ``BertModel``).
+Instantiating one of ``AutoModel``, ``AutoConfig`` and ``AutoTokenizer`` will directly create a class of the relevant
+architecture (ex: ``model = AutoModel.from_pretrained('bert-base-cased')`` will create an instance of
+:class:`~transformers.BertModel`).


 ``AutoConfig``
@@ -30,36 +34,76 @@ Instantiating one of ``AutoModel``, ``AutoConfig`` and ``AutoTokenizer`` will di


 ``AutoModelForPreTraining``
-~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.AutoModelForPreTraining
    :members:


 ``AutoModelWithLMHead``
-~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.AutoModelWithLMHead
    :members:


 ``AutoModelForSequenceClassification``
-~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.AutoModelForSequenceClassification
    :members:


 ``AutoModelForQuestionAnswering``
-~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.AutoModelForQuestionAnswering
    :members:


 ``AutoModelForTokenClassification``
-~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.AutoModelForTokenClassification
    :members:

+``TFAutoModel``
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFAutoModel
+    :members:
+
+
+``TFAutoModelForPreTraining``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFAutoModelForPreTraining
+    :members:
+
+
+``TFAutoModelWithLMHead``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFAutoModelWithLMHead
+    :members:
+
+
+``TFAutoModelForSequenceClassification``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFAutoModelForSequenceClassification
+    :members:
+
+
+``TFAutoModelForQuestionAnswering``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFAutoModelForQuestionAnswering
+    :members:
+
+
+``TFAutoModelForTokenClassification``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFAutoModelForTokenClassification
+    :members:
--- a/docs/source/model_doc/bart.rst
+++ b/docs/source/model_doc/bart.rst
@@ -1,11 +1,12 @@
 Bart
 ----------------------------------------------------
-**DISCLAIMER:** This model is still a work in progress, if you see something strange,
+**DISCLAIMER:** If you see something strange,
 file a `Github Issue <https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title>`__ and assign
@sshleifer

-Paper
-~~~~~
+Overview
+~~~~~~~~~~~~~~~~~~~~~
+
 The Bart model was `proposed <https://arxiv.org/abs/1910.13461>`_ by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer on 29 Oct, 2019.
 According to the abstract,

@@ -16,14 +17,26 @@ According to the abstract,
 The Authors' code can be found `here <https://github.com/pytorch/fairseq/tree/master/examples/bart>`_


-Implementation Notes
-~~~~~~~~~~~~~~~~~~~~
+Implementation Notes:
+
 - Bart doesn't use :obj:`token_type_ids` for sequence classification. Use BartTokenizer.encode to get the proper splitting.
 - The forward pass of ``BartModel`` will create decoder inputs (using the helper function ``transformers.modeling_bart._prepare_bart_decoder_inputs``)  if they are not passed. This is different than some other modeling APIs.
 - Model predictions are intended to be identical to the original implementation. This only works, however, if the string you pass to ``fairseq.encode`` starts with a space.
 - ``BartForConditionalGeneration.generate`` should be used for conditional generation tasks like summarization, see the example in that docstrings
- Models that load the ``"bart-large-cnn"`` weights will not have a ``mask_token_id``, or be able to perform mask filling tasks.
+- Models that load the ``"facebook/bart-large-cnn"`` weights will not have a ``mask_token_id``, or be able to perform mask filling tasks.

+BartConfig
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BartConfig
+    :members:
+
+
+BartTokenizer
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BartTokenizer
+    :members:


 BartModel
@@ -35,6 +48,20 @@ BartModel
 .. autofunction:: transformers.modeling_bart._prepare_bart_decoder_inputs


+BartForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BartForSequenceClassification
+    :members: forward
+
+
+BartForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BartForQuestionAnswering
+    :members: forward
+
+
 BartForConditionalGeneration
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -42,15 +69,3 @@ BartForConditionalGeneration
    :members: generate, forward


-BartForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.BartForSequenceClassification
-    :members: forward
-
-BartConfig
-~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.BartConfig
-    :members:
-
--- a/docs/source/model_doc/bert.rst
+++ b/docs/source/model_doc/bert.rst
@@ -35,6 +35,8 @@ Tips:
  prediction rather than a token prediction. However, averaging over the sequence may yield better results than using
  the [CLS] token.

+The original code can be found `here <https://github.com/google-research/bert>`_.
+
 BertConfig
 ~~~~~~~~~~~~~~~~~~~~~

@@ -50,6 +52,13 @@ BertTokenizer
        create_token_type_ids_from_sequences, save_vocabulary


+BertTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BertTokenizerFast
+    :members:
+
+
 BertModel
 ~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_doc/camembert.rst
+++ b/docs/source/model_doc/camembert.rst
@@ -1,6 +1,9 @@
 CamemBERT
 ----------------------------------------------------

+Overview
+~~~~~~~~~~~~~~~~~~~~~
+
 The CamemBERT model was proposed in `CamemBERT: a Tasty French Language Model <https://arxiv.org/abs/1911.03894>`__
 by Louis Martin, Benjamin Muller, Pedro Javier Ortiz Suárez, Yoann Dupont, Laurent Romary, Éric Villemonte de la
 Clergerie, Djamé Seddah, and Benoît Sagot. It is based on Facebook's RoBERTa model released in 2019. It is a model
@@ -22,6 +25,8 @@ Tips:
 - This implementation is the same as RoBERTa. Refer to the `documentation of RoBERTa <./roberta.html>`__ for usage
  examples as well as the information relative to the inputs and outputs.

+The original code can be found `here <https://camembert-model.fr/>`_.
+
 CamembertConfig
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -72,6 +77,13 @@ CamembertForTokenClassification
    :members:


+CamembertForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.CamembertForQuestionAnswering
+    :members:
+
+
 TFCamembertModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -98,3 +110,10 @@ TFCamembertForTokenClassification

 .. autoclass:: transformers.TFCamembertForTokenClassification
    :members:
+
+
+TFCamembertForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFCamembertForQuestionAnswering
+    :members:
--- a/docs/source/model_doc/ctrl.rst
+++ b/docs/source/model_doc/ctrl.rst
@@ -1,6 +1,9 @@
 CTRL
 ----------------------------------------------------

+Overview
+~~~~~~~~~~~~~~~~~~~~~
+
 CTRL model was proposed in `CTRL: A Conditional Transformer Language Model for Controllable Generation <https://arxiv.org/abs/1909.05858>`_
 by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
 It's a causal (unidirectional) transformer pre-trained using language modeling on a very large
@@ -31,6 +34,8 @@ Tips:
  See `reusing the past in generative models <../quickstart.html#using-the-past>`_ for more information on the usage
  of this argument.

+The original code can be found `here <https://github.com/salesforce/ctrl>`_.
+

 CTRLConfig
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--- a/docs/source/model_doc/dialogpt.rst
+++ b/docs/source/model_doc/dialogpt.rst
@@ -0,0 +1,39 @@
+DialoGPT
+----------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~
+
+DialoGPT was proposed in
+`DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation <https://arxiv.org/abs/1911.00536>`_
+by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
+It's a GPT2 Model trained on 147M conversation-like exchanges extracted from Reddit.
+
+The abstract from the paper is the following:
+
+*We present a large, tunable neural conversational response generation model, DialoGPT (dialogue generative pre-trained transformer). 
+Trained on 147M conversation-like exchanges extracted from Reddit comment chains over a period spanning from 2005 through 2017, DialoGPT extends the Hugging Face PyTorch transformer to attain a performance close to human both in terms of automatic and human evaluation in single-turn dialogue settings.
+We show that conversational systems that leverage DialoGPT generate more relevant, contentful and context-consistent responses than strong baseline systems.
+The pre-trained model and training pipeline are publicly released to facilitate research into neural response generation and the development of more intelligent open-domain dialogue systems.*
+
+Tips:
+
+- DialoGPT is a model with absolute position embeddings so it's usually advised to pad the inputs on
+  the right rather than the left.
+- DialoGPT was trained with a causal language modeling (CLM) objective on conversational data and is therefore powerful at response generation in open-domain dialogue systems.
+- DialoGPT enables the user to create a chat bot in just 10 lines of code as shown on `DialoGPT's model card <https://huggingface.co/microsoft/DialoGPT-medium>`_.
+
+Training:
+
+In order to train or fine-tune DialoGPT, one can use causal language modeling training. 
+To cite the official paper: 
+*We follow the OpenAI GPT-2 to model a multiturn dialogue session 
+as a long text and frame the generation task as language modeling. We first
+concatenate all dialog turns within a dialogue session into a long text 
+x_1,..., x_N (N is the sequence length), ended by the end-of-text token.* 
+For more information please refer to the original paper.
+    
+
+DialoGPT's architecture is based on the GPT2 model, so one can refer to GPT2's `docstring <https://huggingface.co/transformers/model_doc/gpt2.html>`_.
+
+The original code can be found `here <https://github.com/microsoft/DialoGPT>`_.
--- a/docs/source/model_doc/distilbert.rst
+++ b/docs/source/model_doc/distilbert.rst
@@ -1,6 +1,9 @@
 DistilBERT
 ----------------------------------------------------

+Overview
+~~~~~~~~~~~~~~~~~~~~~
+
 The DistilBERT model was proposed in the blog post
 `Smaller, faster, cheaper, lighter: Introducing DistilBERT, a distilled version of BERT <https://medium.com/huggingface/distilbert-8cf3380435b5>`__,
 and the paper `DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter <https://arxiv.org/abs/1910.01108>`__.
@@ -27,6 +30,8 @@ Tips:
 - DistilBert doesn't have `token_type_ids`, you don't need to indicate which token belongs to which segment. Just separate your segments with the separation token `tokenizer.sep_token` (or `[SEP]`)
 - DistilBert doesn't have options to select the input positions (`position_ids` input). This could be added if necessary though, just let's us know if you need this option.

+The original code can be found `here <https://github.com/huggingface/transformers/tree/master/examples/distillation>`_.
+

 DistilBertConfig
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -42,6 +47,13 @@ DistilBertTokenizer
    :members:


+DistilBertTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.DistilBertTokenizerFast
+    :members:
+
+
 DistilBertModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -63,6 +75,20 @@ DistilBertForSequenceClassification
    :members:


+DistilBertForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.DistilBertForMultipleChoice
+    :members:
+
+
+DistilBertForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.DistilBertForTokenClassification
+    :members:
+
+
 DistilBertForQuestionAnswering
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -90,6 +116,22 @@ TFDistilBertForSequenceClassification
    :members:


+
+TFDistilBertForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFDistilBertForMultipleChoice
+    :members:
+
+
+
+TFDistilBertForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFDistilBertForTokenClassification
+    :members:
+
+
 TFDistilBertForQuestionAnswering
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_doc/electra.rst
+++ b/docs/source/model_doc/electra.rst
@@ -0,0 +1,148 @@
+ELECTRA
+----------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~
+
+The ELECTRA model was proposed in the paper
+`ELECTRA: Pre-training Text Encoders as Discriminators Rather Than Generators <https://openreview.net/pdf?id=r1xMH1BtvB>`__.
+ELECTRA is a new pre-training approach which trains two transformer models: the generator and the discriminator. The
+generator's role is to replace tokens in a sequence, and is therefore trained as a masked language model. The discriminator,
+which is the model we're interested in, tries to identify which tokens were replaced by the generator in the sequence.
+
+The abstract from the paper is the following:
+
+*Masked language modeling (MLM) pre-training methods such as BERT corrupt
+the input by replacing some tokens with [MASK] and then train a model to
+reconstruct the original tokens. While they produce good results when transferred
+to downstream NLP tasks, they generally require large amounts of compute to be
+effective. As an alternative, we propose a more sample-efficient pre-training task
+called replaced token detection. Instead of masking the input, our approach
+corrupts it by replacing some tokens with plausible alternatives sampled from a small
+generator network. Then, instead of training a model that predicts the original
+identities of the corrupted tokens, we train a discriminative model that predicts
+whether each token in the corrupted input was replaced by a generator sample
+or not. Thorough experiments demonstrate this new pre-training task is more
+efficient than MLM because the task is defined over all input tokens rather than
+just the small subset that was masked out. As a result, the contextual representations
+learned by our approach substantially outperform the ones learned by BERT
+given the same model size, data, and compute. The gains are particularly strong
+for small models; for example, we train a model on one GPU for 4 days that
+outperforms GPT (trained using 30x more compute) on the GLUE natural language
+understanding benchmark. Our approach also works well at scale, where it
+performs comparably to RoBERTa and XLNet while using less than 1/4 of their
+compute and outperforms them when using the same amount of compute.*
+
+Tips:
+
+- ELECTRA is the pre-training approach, therefore there are nearly no changes made to the underlying model: BERT. The
+  only change is the separation of the embedding size and the hidden size -> The embedding size is generally smaller,
+  while the hidden size is larger. An additional projection layer (linear) is used to project the embeddings from
+  their embedding size to the hidden size. In the case where the embedding size is the same as the hidden size, no
+  projection layer is used.
+- The ELECTRA checkpoints saved using `Google Research's implementation <https://github.com/google-research/electra>`__
+  contain both the generator and discriminator. The conversion script requires the user to name which model to export
+  into the correct architecture. Once converted to the HuggingFace format, these checkpoints may be loaded into all
+  available ELECTRA models, however. This means that the discriminator may be loaded in the `ElectraForMaskedLM` model,
+  and the generator may be loaded in the `ElectraForPreTraining` model (the classification head will be randomly
+  initialized as it doesn't exist in the generator).
+
+The original code can be found `here <https://github.com/google-research/electra>`_.
+
+
+ElectraConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ElectraConfig
+    :members:
+
+
+ElectraTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ElectraTokenizer
+    :members:
+
+
+ElectraTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ElectraTokenizerFast
+    :members:
+
+
+ElectraModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ElectraModel
+    :members:
+
+
+ElectraForPreTraining
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ElectraForPreTraining
+    :members:
+
+
+ElectraForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ElectraForMaskedLM
+    :members:
+
+
+ElectraForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ElectraForSequenceClassification
+    :members:
+
+
+ElectraForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ElectraForTokenClassification
+    :members:
+
+
+ElectraForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ElectraForQuestionAnswering
+    :members:
+
+
+TFElectraModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFElectraModel
+    :members:
+
+
+TFElectraForPreTraining
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFElectraForPreTraining
+    :members:
+
+
+TFElectraForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFElectraForMaskedLM
+    :members:
+
+
+TFElectraForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFElectraForTokenClassification
+    :members:
+
+
+TFElectraForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFElectraForQuestionAnswering
+    :members:
--- a/docs/source/model_doc/encoderdecoder.rst
+++ b/docs/source/model_doc/encoderdecoder.rst
@@ -0,0 +1,23 @@
+Encoder Decoder Models
+------------------------
+
+This class can wrap an encoder model, such as ``BertModel``, and a decoder model with a language modeling head, such as ``BertForMaskedLM``, into an encoder-decoder model.
+
+The ``EncoderDecoderModel`` class allows instantiating an encoder-decoder model using the ``from_encoder_decoder_pretrained`` class method, which takes a pretrained encoder and a pretrained decoder model as input.
+The ``EncoderDecoderModel`` is saved using the standard ``save_pretrained()`` method and can also again be loaded using the standard ``from_pretrained()`` method. 
+
+An application of this architecture could be *summarization* using two pretrained Bert models as is shown in the paper: `Text Summarization with Pretrained Encoders <https://arxiv.org/abs/1908.08345>`_ by Yang Liu and Mirella Lapata.
+
+
+``EncoderDecoderConfig``
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.EncoderDecoderConfig
+    :members:
+
+
+``EncoderDecoderModel``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.EncoderDecoderModel
+    :members:
--- a/docs/source/model_doc/flaubert.rst
+++ b/docs/source/model_doc/flaubert.rst
@@ -1,6 +1,9 @@
 FlauBERT
 ----------------------------------------------------

+Overview
+~~~~~~~~~~~~~~~~~~~~~
+
 The FlauBERT model was proposed in the paper
 `FlauBERT: Unsupervised Language Model Pre-training for French <https://arxiv.org/abs/1912.05372>`__ by Hang Le et al.
 It's a transformer pre-trained using a masked language modeling (MLM) objective (BERT-like).
@@ -20,6 +23,8 @@ of the time they outperform other pre-training approaches. Different versions of
 evaluation protocol for the downstream tasks, called FLUE (French Language Understanding Evaluation), are shared
 to the research community for further reproducible experiments in French NLP.*

+The original code can be found `here <https://github.com/getalp/Flaubert>`_.
+

 FlaubertConfig
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -70,3 +75,43 @@ FlaubertForQuestionAnswering
    :members:


+TFFlaubertModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFFlaubertModel
+    :members:
+
+
+TFFlaubertWithLMHeadModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFFlaubertWithLMHeadModel
+    :members:
+
+
+TFFlaubertForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFFlaubertForSequenceClassification
+    :members:
+
+
+TFFlaubertForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFFlaubertForMultipleChoice
+    :members:
+
+
+TFFlaubertForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFFlaubertForTokenClassification
+    :members:
+
+
+TFFlaubertForQuestionAnsweringSimple
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFFlaubertForQuestionAnsweringSimple
+    :members:
--- a/docs/source/model_doc/gpt.rst
+++ b/docs/source/model_doc/gpt.rst
@@ -36,6 +36,20 @@ Tips:
 `Write With Transformer <https://transformer.huggingface.co/doc/gpt>`__ is a webapp created and hosted by
 Hugging Face showcasing the generative capabilities of several models. GPT is one of them.

+The original code can be found `here <https://github.com/openai/finetune-transformer-lm>`_.
+
+Note:
+
+If you want to reproduce the original tokenization process of the `OpenAI GPT` paper, you will need to install 
+``ftfy`` and ``SpaCy``::
+
+    pip install spacy ftfy==4.4.3
+    python -m spacy download en
+
+If you don't install ``ftfy`` and ``SpaCy``, the :class:`transformers.OpenAIGPTTokenizer` will default to tokenize using 
+BERT's :obj:`BasicTokenizer` followed by Byte-Pair Encoding (which should be fine for most usage, don't 
+worry).
+
 OpenAIGPTConfig
 ~~~~~~~~~~~~~~~~~~~~~

@@ -50,6 +64,13 @@ OpenAIGPTTokenizer
    :members: save_vocabulary


+OpenAIGPTTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.OpenAIGPTTokenizerFast
+    :members:
+
+
 OpenAIGPTModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_doc/gpt2.rst
+++ b/docs/source/model_doc/gpt2.rst
@@ -34,6 +34,8 @@ Tips:
 Hugging Face showcasing the generative capabilities of several models. GPT-2 is one of them and is available in five
 different sizes: small, medium, large, xl and a distilled version of the small checkpoint: distilgpt-2.

+The original code can be found `here <https://openai.com/blog/better-language-models/>`_.
+

 GPT2Config
 ~~~~~~~~~~~~~~~~~~~~~
@@ -49,6 +51,13 @@ GPT2Tokenizer
    :members: save_vocabulary


+GPT2TokenizerFast
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.GPT2TokenizerFast
+    :members:
+
+
 GPT2Model
 ~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_doc/longformer.rst
+++ b/docs/source/model_doc/longformer.rst
@@ -0,0 +1,104 @@
+Longformer
+----------------------------------------------------
+**DISCLAIMER:** This model is still a work in progress, if you see something strange,
+file a `Github Issue <https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title>`_
+
+Overview
+~~~~~~~~~
+The Longformer model was presented in `Longformer: The Long-Document Transformer <https://arxiv.org/pdf/2004.05150.pdf>`_ by Iz Beltagy, Matthew E. Peters, Arman Cohan.
+The abstract from the paper is the following:
+
+*Transformer-based models are unable to process long sequences due to their self-attention operation, which scales quadratically with the sequence length. To address this limitation, we introduce the Longformer with an attention mechanism that scales linearly with sequence length, making it easy to process documents of thousands of tokens or longer. Longformer's attention mechanism is a drop-in replacement for the standard self-attention and combines a local windowed attention with a task motivated global attention. Following prior work on long-sequence transformers, we evaluate Longformer on character-level language modeling and achieve state-of-the-art results on text8 and enwik8. In contrast to most prior work, we also pretrain Longformer and finetune it on a variety of downstream tasks. Our pretrained Longformer consistently outperforms RoBERTa on long document tasks and sets new state-of-the-art results on WikiHop and TriviaQA.*
+
+The Authors' code can be found `here <https://github.com/allenai/longformer>`_ .
+
+Longformer Self Attention
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+Longformer self attention employs self attention on both a "local" context and a "global" context.
+Most tokens only attend "locally" to each other meaning that each token attends to its :math:`\frac{1}{2} w` previous tokens and :math:`\frac{1}{2} w` succeeding tokens with :math:`w` being the window length as defined in `config.attention_window`. Note that `config.attention_window` can be of type ``list`` to define a different :math:`w` for each layer.
+A selected few tokens attend "globally" to all other tokens, as it is conventionally done for all tokens in *e.g.* `BertSelfAttention`.
+
+Note that "locally" and "globally" attending tokens are projected by different query, key and value matrices.
+Also note that every "locally" attending token not only attends to tokens within its window :math:`w`, but also to all "globally" attending tokens so that global attention is *symmetric*.
+
+The user can define which tokens attend "locally" and which tokens attend "globally" by setting the tensor `global_attention_mask` at run-time appropriately. `Longformer` employs the following logic for `global_attention_mask`: `0` - the token attends "locally", `1` - token attends "globally". For more information please also refer to :func:`~transformers.LongformerModel.forward` method.
+
+Using Longformer self attention, the memory and time complexity of the query-key matmul operation, which usually represents the memory and time bottleneck, can be reduced from :math:`\mathcal{O}(n_s \times n_s)` to :math:`\mathcal{O}(n_s \times w)`, with :math:`n_s` being the sequence length and :math:`w` being the average window size. It is assumed that the number of "globally" attending tokens is insignificant as compared to the number of "locally" attending tokens.
+
+For more information, please refer to the official `paper <https://arxiv.org/pdf/2004.05150.pdf>`_ .
+
+
+Training
+~~~~~~~~~~~~~~~~~~~~
+``LongformerForMaskedLM`` is trained in exactly the same way as ``RobertaForMaskedLM`` and
+should be used as follows:
+
+::
+
+  input_ids = tokenizer.encode('This is a sentence from [MASK] training data', return_tensors='pt')
+  mlm_labels = tokenizer.encode('This is a sentence from the training data', return_tensors='pt')
+
+  loss = model(input_ids, labels=input_ids, masked_lm_labels=mlm_labels)[0]
+
+
+LongformerConfig
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LongformerConfig
+    :members:
+
+
+LongformerTokenizer
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LongformerTokenizer
+    :members: 
+
+
+LongformerTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LongformerTokenizerFast
+    :members: 
+
+
+LongformerModel
+~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LongformerModel
+    :members:
+
+
+LongformerForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LongformerForMaskedLM
+    :members:
+
+
+LongformerForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LongformerForSequenceClassification
+    :members:
+
+
+LongformerForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LongformerForMultipleChoice
+    :members:
+
+
+LongformerForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LongformerForTokenClassification
+    :members:
+
+
+LongformerForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LongformerForQuestionAnswering
+    :members:
--- a/docs/source/model_doc/marian.rst
+++ b/docs/source/model_doc/marian.rst
@@ -0,0 +1,111 @@
+MarianMT
+----------------------------------------------------
+**DISCLAIMER:** If you see something strange,
+file a `Github Issue <https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title>`__ and assign
+@sshleifer. Translations should be similar, but not identical to, output in the test set linked to in each model card.
+
+Implementation Notes
+~~~~~~~~~~~~~~~~~~~~
+- Each model is about 298 MB on disk, there are 1,000+ models.
+- The list of supported language pairs can be found `here <https://huggingface.co/Helsinki-NLP>`__.
+- The 1,000+ models were originally trained by `Jörg Tiedemann <https://researchportal.helsinki.fi/en/persons/j%C3%B6rg-tiedemann>`__ using the `Marian <https://marian-nmt.github.io/>`_ C++ library, which supports fast training and translation.
+- All models are transformer encoder-decoders with 6 layers in each component. Each model's performance is documented in a model card.
+- The 80 opus models that require BPE preprocessing are not supported.
+- The modeling code is the same as ``BartForConditionalGeneration`` with a few minor modifications:
+    - static (sinusoid) positional embeddings (``MarianConfig.static_position_embeddings=True``)
+    - a new final_logits_bias (``MarianConfig.add_bias_logits=True``)
+    - no layernorm_embedding (``MarianConfig.normalize_embedding=False``)
+    - the model starts generating with pad_token_id (which has 0 token_embedding) as the prefix. (Bart uses <s/>)
+- Code to bulk convert models can be found in ``convert_marian_to_pytorch.py``
+
+Naming
+~~~~~~
+- All  model names use the following format: ``Helsinki-NLP/opus-mt-{src}-{tgt}``
+- The language codes used to name models are inconsistent. Two-letter codes can usually be found `here <https://developers.google.com/admin-sdk/directory/v1/languages>`_, three-letter codes require googling "language code {code}".
+- Codes formatted like ``es_AR`` are usually ``code_{region}``. For example, ``es_AR`` denotes Spanish documents from Argentina.
+
+
+Multilingual Models
+~~~~~~~~~~~~~~~~~~~~
+
+All  model names use the following format: ``Helsinki-NLP/opus-mt-{src}-{tgt}``:
+    - if ``src`` is in all caps, the model supports multiple input languages, you can figure out which ones by looking at the model card, or the Group Members `mapping <https://gist.github.com/sshleifer/6d20e7761931b08e73c3219027b97b8a>`_ .
+    - if ``tgt`` is in all caps, the model can output multiple languages, and you should specify a language code by prepending the desired output language to the src_text
+    - You can see a tokenizer's supported language codes in ``tokenizer.supported_language_codes``
+
+Example of translating English to many Romance languages, using language codes:
+
+.. code-block:: python
+
+    from transformers import MarianMTModel, MarianTokenizer
+    src_text = [
+        '>>fr<< this is a sentence in english that we want to translate to french',
+        '>>pt<< This should go to portuguese',
+        '>>es<< And this to Spanish'
+    ]
+
+    model_name = 'Helsinki-NLP/opus-mt-en-ROMANCE'
+    tokenizer = MarianTokenizer.from_pretrained(model_name)
+    print(tokenizer.supported_language_codes)
+    model = MarianMTModel.from_pretrained(model_name)
+    translated = model.generate(**tokenizer.prepare_translation_batch(src_text))
+    tgt_text = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
+    # ["c'est une phrase en anglais que nous voulons traduire en français",
+    # 'Isto deve ir para o português.',
+    # 'Y esto al español']
+
+Sometimes, models were trained on collections of languages that do not resolve to a group. In this case, _ is used as a separator for src or tgt, as in ``'Helsinki-NLP/opus-mt-en_el_es_fi-en_el_es_fi'``. These still require language codes.
+There are many supported regional language codes, like ``>>es_ES<<`` (Spain) and ``>>es_AR<<`` (Argentina), that do not seem to change translations. I have not found these to provide different results than just using ``>>es<<``.
+
+For Example:
+    - ``Helsinki-NLP/opus-mt-NORTH_EU-NORTH_EU``: translates from all NORTH_EU languages (see `mapping <https://gist.github.com/sshleifer/6d20e7761931b08e73c3219027b97b8a>`_) to all NORTH_EU languages. Use a special language code like ``>>de<<`` to specify output language.
+    - ``Helsinki-NLP/opus-mt-ROMANCE-en``: translates from many romance languages to english, no codes needed since there is only 1 tgt language.
+
+
+
+.. code-block:: python
+
+    GROUP_MEMBERS = {
+     'ZH': ['cmn', 'cn', 'yue', 'ze_zh', 'zh_cn', 'zh_CN', 'zh_HK', 'zh_tw', 'zh_TW', 'zh_yue', 'zhs', 'zht', 'zh'],
+     'ROMANCE': ['fr', 'fr_BE', 'fr_CA', 'fr_FR', 'wa', 'frp', 'oc', 'ca', 'rm', 'lld', 'fur', 'lij', 'lmo', 'es', 'es_AR', 'es_CL', 'es_CO', 'es_CR', 'es_DO', 'es_EC', 'es_ES', 'es_GT', 'es_HN', 'es_MX', 'es_NI', 'es_PA', 'es_PE', 'es_PR', 'es_SV', 'es_UY', 'es_VE', 'pt', 'pt_br', 'pt_BR', 'pt_PT', 'gl', 'lad', 'an', 'mwl', 'it', 'it_IT', 'co', 'nap', 'scn', 'vec', 'sc', 'ro', 'la'],
+     'NORTH_EU': ['de', 'nl', 'fy', 'af', 'da', 'fo', 'is', 'no', 'nb', 'nn', 'sv'],
+     'SCANDINAVIA': ['da', 'fo', 'is', 'no', 'nb', 'nn', 'sv'],
+     'SAMI': ['se', 'sma', 'smj', 'smn', 'sms'],
+     'NORWAY': ['nb_NO', 'nb', 'nn_NO', 'nn', 'nog', 'no_nb', 'no'],
+     'CELTIC': ['ga', 'cy', 'br', 'gd', 'kw', 'gv']
+    }
+
+Code to see available pretrained models:
+
+.. code-block:: python
+
+    from transformers.hf_api import HfApi
+    model_list = HfApi().model_list()
+    org = "Helsinki-NLP"
+    model_ids = [x.modelId for x in model_list if x.modelId.startswith(org)]
+    suffix = [x.split('/')[1] for x in model_ids]
+    multi_models = [f'{org}/{s}' for s in suffix if s != s.lower()]
+
+MarianConfig
+~~~~~~~~~~~~~~~~~~~
+.. autoclass:: transformers.MarianConfig
+    :members:
+
+
+MarianTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MarianTokenizer
+    :members: prepare_translation_batch
+
+
+MarianMTModel
+~~~~~~~~~~~~~
+
+Pytorch version of marian-nmt's transformer.h (c++). Designed for the OPUS-NMT translation checkpoints.
+Model API is identical to BartForConditionalGeneration.
+Available models are listed at `Model List <https://huggingface.co/models?search=Helsinki-NLP>`__
+This class inherits all functionality from ``BartForConditionalGeneration``, see that page for method signatures.
+
+.. autoclass:: transformers.MarianMTModel
+    :members:
--- a/docs/source/model_doc/mobilebert.rst
+++ b/docs/source/model_doc/mobilebert.rst
@@ -0,0 +1,169 @@
+MobileBERT
+----------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~
+
+The MobileBERT model was proposed in `MobileBERT: a Compact Task-Agnostic BERT
+for Resource-Limited Devices <https://arxiv.org/abs/2004.02984>`__
+by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou. It's a bidirectional transformer
+based on the BERT model, which is compressed and accelerated using several approaches.
+
+The abstract from the paper is the following:
+
+*Natural Language Processing (NLP) has recently achieved great success by using huge pre-trained models with hundreds
+of millions of parameters. However, these models suffer from heavy model sizes and high latency such that they cannot
+be deployed to resource-limited mobile devices. In this paper, we propose MobileBERT for compressing and accelerating
+the popular BERT model. Like the original BERT, MobileBERT is task-agnostic, that is, it can be generically applied
+to various downstream NLP tasks via simple fine-tuning. Basically, MobileBERT is a thin version of BERT_LARGE, while
+equipped with bottleneck structures and a carefully designed balance between self-attentions and feed-forward
+networks. To train MobileBERT, we first train a specially designed teacher model, an inverted-bottleneck incorporated
+BERT_LARGE model. Then, we conduct knowledge transfer from this teacher to MobileBERT. Empirical studies show that
+MobileBERT is 4.3x smaller and 5.5x faster than BERT_BASE while achieving competitive results on well-known
+benchmarks. On the natural language inference tasks of GLUE, MobileBERT achieves a GLUE score of 77.7
+(0.6 lower than BERT_BASE), and 62 ms latency on a Pixel 4 phone. On the SQuAD v1.1/v2.0 question answering task,
+MobileBERT achieves a dev F1 score of 90.0/79.2 (1.5/2.1 higher than BERT_BASE).*
+
+Tips:
+
+- MobileBERT is a model with absolute position embeddings so it's usually advised to pad the inputs on
+  the right rather than the left.
+- MobileBERT is similar to BERT and therefore relies on the masked language modeling (MLM) objective.
+  It is therefore efficient at predicting masked tokens and at NLU in general, but is not optimal for
+  text generation. Models trained with a causal language modeling (CLM) objective are better in that regard.
+
+The original code can be found `here <https://github.com/google-research/mobilebert>`_.
+
+MobileBertConfig
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MobileBertConfig
+    :members:
+
+
+MobileBertTokenizer
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MobileBertTokenizer
+    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences, save_vocabulary
+
+
+MobileBertTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MobileBertTokenizerFast
+    :members:
+
+
+MobileBertModel
+~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MobileBertModel
+    :members:
+
+
+MobileBertForPreTraining
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MobileBertForPreTraining
+    :members:
+
+
+MobileBertForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MobileBertForMaskedLM
+    :members:
+
+
+MobileBertForNextSentencePrediction
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MobileBertForNextSentencePrediction
+    :members:
+
+
+MobileBertForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MobileBertForSequenceClassification
+    :members:
+
+
+MobileBertForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MobileBertForMultipleChoice
+    :members:
+
+
+MobileBertForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MobileBertForTokenClassification
+    :members:
+
+
+MobileBertForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MobileBertForQuestionAnswering
+    :members:
+
+
+TFMobileBertModel
+~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFMobileBertModel
+    :members:
+
+
+TFMobileBertForPreTraining
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFMobileBertForPreTraining
+    :members:
+
+
+TFMobileBertForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFMobileBertForMaskedLM
+    :members:
+
+
+TFMobileBertForNextSentencePrediction
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFMobileBertForNextSentencePrediction
+    :members:
+
+
+TFMobileBertForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFMobileBertForSequenceClassification
+    :members:
+
+
+TFMobileBertForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFMobileBertForMultipleChoice
+    :members:
+
+
+TFMobileBertForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFMobileBertForTokenClassification
+    :members:
+
+
+TFMobileBertForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFMobileBertForQuestionAnswering
+    :members:
+
--- a/docs/source/model_doc/reformer.rst
+++ b/docs/source/model_doc/reformer.rst
@@ -0,0 +1,114 @@
+Reformer
+----------------------------------------------------
+**DISCLAIMER:** This model is still a work in progress, if you see something strange,
+file a `Github Issue <https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title>`_
+
+Overview
+~~~~~~~~~~
+The Reformer model was presented in `Reformer: The Efficient Transformer <https://arxiv.org/abs/2001.04451>`_ by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
+The abstract from the paper is the following:
+
+*Large Transformer models routinely achieve state-of-the-art results on a number of tasks but training these models can be prohibitively costly, especially on long sequences. We introduce two techniques to improve the efficiency of Transformers. For one, we replace dot-product attention by one that uses locality-sensitive hashing, changing its complexity from O(L^2) to O(Llog(L)), where L is the length of the sequence. Furthermore, we use reversible residual layers instead of the standard residuals, which allows storing activations only once in the training process instead of N times, where N is the number of layers. The resulting model, the Reformer, performs on par with Transformer models while being much more memory-efficient and much faster on long sequences.*
+
+The Authors' code can be found `here <https://github.com/google/trax/tree/master/trax/models/reformer>`_ .
+
+Axial Positional Encodings
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Axial Positional Encodings were first implemented in Google's `trax library <https://github.com/google/trax/blob/4d99ad4965bab1deba227539758d59f0df0fef48/trax/layers/research/position_encodings.py#L29>`_ and developed by the authors of this model's paper. In models that are treating very long input sequences, the conventional position id encodings store an embedding vector of size :math:`d` being the ``config.hidden_size`` for every position :math:`1, \ldots, n_s`, with :math:`n_s` being ``config.max_embedding_size``. *E.g.*, having a sequence length of :math:`n_s = 2^{19} \approx 0.5M` and a ``config.hidden_size`` of :math:`d = 2^{10} \approx 1000` would result in a position encoding matrix:
+
+.. math::
+    X_{i,j}, \text{ with } i \in \left[1,\ldots, d\right] \text{ and } j \in \left[1,\ldots, n_s\right] 
+
+which alone has over 500M parameters to store. Axial positional encodings factorize :math:`X_{i,j}` into two matrices: 
+
+.. math::
+    X^{1}_{i,j}, \text{ with } i \in \left[1,\ldots, d^1\right] \text{ and } j \in \left[1,\ldots, n_s^1\right] 
+
+and 
+
+.. math::
+    X^{2}_{i,j}, \text{ with } i \in \left[1,\ldots, d^2\right] \text{ and } j \in \left[1,\ldots, n_s^2\right] 
+
+with:
+
+.. math::
+    d = d^1 + d^2 \text{ and } n_s = n_s^1 \times n_s^2 .
+
+Therefore the following holds:
+
+.. math::
+    X_{i,j} = \begin{cases}
+                X^{1}_{i, k}, & \text{if }\ i < d^1 \text{ with } k = j \mod n_s^1 \\
+                X^{2}_{i - d^1, l}, & \text{if } i \ge d^1 \text{ with } l = \lfloor\frac{j}{n_s^1}\rfloor
+              \end{cases}
+
+Intuitively, this means that a position embedding vector :math:`x_j \in \mathbb{R}^{d}` is now the composition of two factorized embedding vectors: :math:`x^1_{k, l} + x^2_{l, k}`, whereas the ``config.max_embedding_size`` dimension :math:`j` is factorized into :math:`k \text{ and } l`.
+This design ensures that each position embedding vector :math:`x_j` is unique.
+
+Using the above example again, axial position encoding with :math:`d^1 = 2^9, d^2 = 2^9, n_s^1 = 2^9, n_s^2 = 2^{10}` can drastically reduce the number of parameters to :math:`2^{18} + 2^{19} \approx 786000` parameters.
+
+In practice, the parameter ``config.axial_pos_embds_dim`` is set to ``list``:math:`(d^1, d^2)` whose sum has to be equal to ``config.hidden_size`` and ``config.axial_pos_shape`` is set to ``list``:math:`(n_s^1, n_s^2)` whose product has to be equal to ``config.max_embedding_size`` which during training has to be equal to the ``sequence length`` of the ``input_ids``.
+
+
+
+LSH Self Attention
+~~~~~~~~~~~~~~~~~~~~
+In Locality sensitive hashing (LSH) self attention the key and query projection weights are tied. Therefore, the key query embedding vectors are also tied.
+LSH self attention uses the locality sensitive 
+hashing mechanism proposed in `Practical and Optimal LSH for Angular Distance <https://arxiv.org/abs/1509.02897>`_ to assign each of the tied key query embedding vectors to one of ``config.num_buckets`` possible buckets. The premise is that the more "similar" key query embedding vectors (in terms of *cosine similarity*) are to each other, the more likely they are assigned to the same bucket. 
+The accuracy of the LSH mechanism can be improved by increasing ``config.num_hashes`` or directly the argument ``num_hashes`` of the forward function so that the output of the LSH self attention better approximates the output of the "normal" full self attention.
+The buckets are then sorted and chunked into query key embedding vector chunks each of length ``config.lsh_chunk_length``. For each chunk, the query embedding vectors attend to its key vectors (which are tied to themselves) and to the key embedding vectors of ``config.lsh_num_chunks_before`` previous neighboring chunks and ``config.lsh_num_chunks_after`` following neighboring chunks.
+For more information, see the `original Paper <https://arxiv.org/abs/2001.04451>`_ or this great `blog post <https://www.pragmatic.ml/reformer-deep-dive/>`_.
+
+Note that ``config.num_buckets`` can also be factorized into a ``list``:math:`(n_{\text{buckets}}^1, n_{\text{buckets}}^2)`. This way instead of assigning the query key embedding vectors to one of :math:`(1,\ldots, n_{\text{buckets}})` they are assigned to one of :math:`(1-1,\ldots, n_{\text{buckets}}^1-1, \ldots, 1-n_{\text{buckets}}^2, \ldots, n_{\text{buckets}}^1-n_{\text{buckets}}^2)`. This is crucial for very long sequences to save memory.
+
+When training a model from scratch, it is recommended to leave ``config.num_buckets=None``, so that depending on the sequence length a good value for ``num_buckets`` is calculated on the fly. This value will then automatically be saved in the config and should be reused for inference.
+
+Using LSH self attention, the memory and time complexity of the query-key matmul operation can be reduced from :math:`\mathcal{O}(n_s \times n_s)` to :math:`\mathcal{O}(n_s \times \log(n_s))`, which usually represents the memory and time bottleneck in a transformer model, with :math:`n_s` being the sequence length.
+
+
+Local Self Attention
+~~~~~~~~~~~~~~~~~~~~
+Local self attention is essentially a "normal" self attention layer with 
+key, query and value projections, but is chunked so that in each chunk of length ``config.local_chunk_length`` the query embedding vectors only attend to the key embedding vectors in its chunk and to the key embedding vectors of ``config.local_num_chunks_before`` previous neighboring chunks and ``config.local_num_chunks_after`` following neighboring chunks.
+
+Using Local self attention, the memory and time complexity of the query-key matmul operation can be reduced from :math:`\mathcal{O}(n_s \times n_s)` to :math:`\mathcal{O}(n_s \times \log(n_s))`, which usually represents the memory and time bottleneck in a transformer model, with :math:`n_s` being the sequence length.
+
+
+Training
+~~~~~~~~~~~~~~~~~~~~
+During training, we must ensure that the sequence length is set to a value that can be divided by the least common multiple of ``config.lsh_chunk_length`` and ``config.local_chunk_length`` and that the parameters of the Axial Positional Encodings are correctly set as described above. Reformer is very memory efficient so that the model can easily be trained on sequences as long as 64000 tokens.
+For training, the ``ReformerModelWithLMHead`` should be used as follows: 
+
+::
+
+  input_ids = tokenizer.encode('This is a sentence from the training data', return_tensors='pt')
+  loss = model(input_ids, labels=input_ids)[0]
+
+
+ReformerConfig
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ReformerConfig
+    :members:
+
+
+ReformerTokenizer
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ReformerTokenizer
+    :members: 
+
+
+ReformerModel
+~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ReformerModel
+    :members:
+
+
+ReformerModelWithLMHead
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ReformerModelWithLMHead
+    :members:
--- a/docs/source/model_doc/retribert.rst
+++ b/docs/source/model_doc/retribert.rst
@@ -0,0 +1,39 @@
+RetriBERT
+----------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~
+
+The RetriBERT model was proposed in the blog post
+`Explain Anything Like I'm Five: A Model for Open Domain Long Form Question Answering <https://yjernite.github.io/lfqa.html>`__.
+RetriBERT is a small model that uses either a single or pair of Bert encoders with lower-dimension projection for dense semantic indexing of text.
+
+Code to train and use the model can be found `here <https://github.com/huggingface/transformers/tree/master/examples/distillation>`_.
+
+
+RetriBertConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.RetriBertConfig
+    :members:
+
+
+RetriBertTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.RetriBertTokenizer
+    :members:
+
+
+RetriBertTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.RetriBertTokenizerFast
+    :members:
+
+
+RetriBertModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.RetriBertModel
+    :members:
--- a/docs/source/model_doc/roberta.rst
+++ b/docs/source/model_doc/roberta.rst
@@ -1,6 +1,9 @@
 RoBERTa
 ----------------------------------------------------

+Overview
+~~~~~~~~~~~~~~~~~~~~~
+
 The RoBERTa model was proposed in `RoBERTa: A Robustly Optimized BERT Pretraining Approach <https://arxiv.org/abs/1907.11692>`_
 by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer,
 Veselin Stoyanov. It is based on Google's BERT model released in 2018.
@@ -28,6 +31,9 @@ Tips:
 - RoBERTa doesn't have `token_type_ids`, you don't need to indicate which token belongs to which segment. Just separate your segments with the separation token `tokenizer.sep_token` (or `</s>`)
 - `Camembert <./camembert.html>`__ is a wrapper around RoBERTa. Refer to this page for usage examples.

+The original code can be found `here <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`_.
+
+
 RobertaConfig
 ~~~~~~~~~~~~~~~~~~~~~

@@ -43,6 +49,13 @@ RobertaTokenizer
        create_token_type_ids_from_sequences, save_vocabulary


+RobertaTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.RobertaTokenizerFast
+    :members: build_inputs_with_special_tokens
+
+
 RobertaModel
 ~~~~~~~~~~~~~~~~~~~~

@@ -64,12 +77,27 @@ RobertaForSequenceClassification
    :members:


+RobertaForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.RobertaForMultipleChoice
+    :members:
+
+
 RobertaForTokenClassification
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.RobertaForTokenClassification
    :members:

+
+RobertaForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.RobertaForQuestionAnswering
+    :members:
+
+
 TFRobertaModel
 ~~~~~~~~~~~~~~~~~~~~

@@ -91,8 +119,22 @@ TFRobertaForSequenceClassification
    :members:


+TFRobertaForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFRobertaForMultipleChoice
+    :members:
+
+
 TFRobertaForTokenClassification
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.TFRobertaForTokenClassification
    :members:
+
+
+TFRobertaForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFRobertaForQuestionAnswering
+    :members:
--- a/docs/source/model_doc/t5.rst
+++ b/docs/source/model_doc/t5.rst
@@ -4,7 +4,8 @@ T5
 file a `Github Issue <https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title>`_

 Overview
-~~~~~
+~~~~~~~~~~~~~~~~~~~~~
+
 The T5 model was presented in `Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer <https://arxiv.org/pdf/1910.10683.pdf>`_ by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu in 
 Here the abstract: 

@@ -14,29 +15,41 @@ Our systematic study compares pre-training objectives, architectures, unlabeled
 By combining the insights from our exploration with scale and our new "Colossal Clean Crawled Corpus", we achieve state-of-the-art results on many benchmarks covering summarization, question answering, text classification, and more. 
 To facilitate future work on transfer learning for NLP, we release our dataset, pre-trained models, and code.*

-The Authors' code can be found `here <https://github.com/google-research/text-to-text-transfer-transformer>`_ .
+Tips:
+
+- T5 is an encoder-decoder model pre-trained on a multi-task mixture of unsupervised 
+  and supervised tasks and for which each task is converted into a text-to-text format.
+  T5 works well on a variety of tasks out-of-the-box by prepending a different prefix to the input corresponding to each task, e.g.: for translation: *translate English to German: ..., summarize: ...*.
+  For more information about which prefix to use, it is easiest to look into Appendix D of the `paper <https://arxiv.org/pdf/1910.10683.pdf>`_ .
+- For sequence to sequence generation, it is recommended to use ``T5ForConditionalGeneration.generate()``. The method takes care of feeding the encoded input via cross-attention layers to the decoder and auto-regressively generates the decoder output.
+- T5 uses relative scalar embeddings. Encoder input padding can be done on the left and on the right.
+
+The original code can be found `here <https://github.com/google-research/text-to-text-transfer-transformer>`_.

 Training
-~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~
+
 T5 is an encoder-decoder model and converts all NLP problems into a text-to-text format. It is trained using teacher forcing.
 This means that for training we always need an input sequence and a target sequence. 
-The input sequence is fed to the model using ``input_ids``. The target sequence is shifted to the right, *i.e.* perprended by a start-sequence token and fed to the decoder using the `decoder_input_ids`. In teacher-forcing style, the target sequence is then appended by the EOS token and corresponds to the ``lm_labels``. The PAD token is hereby used as the start-sequence token.
+The input sequence is fed to the model using ``input_ids``. The target sequence is shifted to the right, *i.e.* prepended by a start-sequence token and fed to the decoder using the ``decoder_input_ids``. In teacher-forcing style, the target sequence is then appended by the EOS token and corresponds to the ``labels``. The PAD token is hereby used as the start-sequence token.
 T5 can be trained / fine-tuned both in a supervised and unsupervised fashion.

 - Unsupervised denoising training
+
  In this setup spans of the input sequence are masked by so-called sentinel tokens (*a.k.a* unique mask tokens) 
  and the output sequence is formed as a concatenation of the same sentinel tokens and the *real* masked tokens. 
-  Each sentinel tokens represents a unique mask token for this sentence and should start with ``<extra_id_1>``, ``<extrac_id_2>``, ... up to ``<extra_id_100>``. As a default 100 sentinel tokens are available in ``T5Tokenizer``.
+  Each sentinel token represents a unique mask token for this sentence and should start with ``<extra_id_1>``, ``<extra_id_2>``, ... up to ``<extra_id_100>``. As a default 100 sentinel tokens are available in ``T5Tokenizer``.
  *E.g.* the sentence "The cute dog walks in the park" with the masks put on "cute dog" and "the" should be processed as follows: 

 ::

  input_ids = tokenizer.encode('The <extra_id_1> walks in <extra_id_2> park', return_tensors='pt')
-  lm_labels = tokenizer.encode('<extra_id_1> cute dog <extra_id_2> the <extra_id_3> </s>', return_tensors='pt')
+  labels = tokenizer.encode('<extra_id_1> cute dog <extra_id_2> the <extra_id_3> </s>', return_tensors='pt')
  # the forward function automatically creates the correct decoder_input_ids
-  model(input_ids=input_ids, lm_labels=lm_labels)
+  model(input_ids=input_ids, labels=labels)

 - Supervised training
+
  In this setup the input sequence and output sequence are standard sequence to sequence input output mapping.
  In translation, *e.g.* the input sequence "The house is wonderful." and output sequence "Das Haus ist wunderbar." should 
  be processed as follows:
@@ -44,18 +57,9 @@ T5 can be trained / fine-tuned both in a supervised and unsupervised fashion.
 ::

  input_ids = tokenizer.encode('translate English to German: The house is wonderful. </s>', return_tensors='pt')
-  lm_labels = tokenizer.encode('Das Haus ist wunderbar. </s>', return_tensors='pt')
+  labels = tokenizer.encode('Das Haus ist wunderbar. </s>', return_tensors='pt')
  # the forward function automatically creates the correct decoder_input_ids
-  model(input_ids=input_ids, lm_labels=lm_labels)
-
-Tips
-~~~~~~~~~~~~~~~~~~~~
- T5 is an encoder-decoder model pre-trained on a multi-task mixture of unsupervised 
-  and supervised tasks and for which each task is converted into a text-to-text format.
-  T5 works well on a variety of tasks out-of-the-box by prepending a different prefix to the input corresponding to each task, e.g.: for translation: *translate English to German: ..., summarize: ...*.
-  For more information about which prefix to use, it is easiest to look into Appendix D of the `paper <https://arxiv.org/pdf/1910.10683.pdf>`_ .
- For sequence to sequence generation, it is recommended to use ``T5ForConditionalGeneration.generate()``. The method takes care of feeding the encoded input via cross-attention layers to the decoder and auto-regressively generates the decoder output.
- T5 uses relative scalar embeddings. Encoder input padding can be done on the left and on the right.
+  model(input_ids=input_ids, labels=labels)


 T5Config
@@ -95,7 +99,7 @@ TFT5Model


 TFT5ForConditionalGeneration
-~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.TFT5ForConditionalGeneration
    :members:
--- a/docs/source/model_doc/transformerxl.rst
+++ b/docs/source/model_doc/transformerxl.rst
@@ -30,6 +30,8 @@ Tips:
  The original implementation trains on SQuAD with padding on the left, therefore the padding defaults are set to left.
 - Transformer-XL is one of the few models that has no sequence length limit.

+The original code can be found `here <https://github.com/kimiyoung/transformer-xl>`_.
+

 TransfoXLConfig
 ~~~~~~~~~~~~~~~~~~~~~
@@ -45,6 +47,13 @@ TransfoXLTokenizer
    :members: save_vocabulary


+TransfoXLTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TransfoXLTokenizerFast
+    :members:
+
+
 TransfoXLModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_doc/xlm.rst
+++ b/docs/source/model_doc/xlm.rst
@@ -30,6 +30,8 @@ Tips:
 - XLM has multilingual checkpoints which leverage a specific `lang` parameter. Check out the
  `multi-lingual <../multilingual.html>`__ page for more information.

+The original code can be found `here <https://github.com/facebookresearch/XLM/>`_.
+

 XLMConfig
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -100,6 +102,21 @@ TFXLMForSequenceClassification
    :members:


+TFXLMForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFXLMForMultipleChoice
+    :members:
+
+
+TFXLMForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFXLMForTokenClassification
+    :members:
+
+
+
 TFXLMForQuestionAnsweringSimple
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_doc/xlmroberta.rst
+++ b/docs/source/model_doc/xlmroberta.rst
@@ -1,6 +1,9 @@
 XLM-RoBERTa
 ------------------------------------------

+Overview
+~~~~~~~~~~~~~~~~~~~~~
+
 The XLM-RoBERTa model was proposed in `Unsupervised Cross-lingual Representation Learning at Scale <https://arxiv.org/abs/1911.02116>`__
 by Alexis Conneau, Kartikay Khandelwal, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán,
 Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. It is based on Facebook's RoBERTa model released in 2019.
@@ -28,6 +31,9 @@ Tips:
 - This implementation is the same as RoBERTa. Refer to the `documentation of RoBERTa <./roberta.html>`__ for usage
  examples as well as the information relative to the inputs and outputs.

+The original code can be found `here <https://github.com/pytorch/fairseq/tree/master/examples/xlmr>`_.
+
+
 XLMRobertaConfig
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -78,6 +84,13 @@ XLMRobertaForTokenClassification
    :members:


+XLMRobertaForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.XLMRobertaForQuestionAnswering
+    :members:
+
+
 TFXLMRobertaModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -99,8 +112,22 @@ TFXLMRobertaForSequenceClassification
    :members:


+TFXLMRobertaForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFXLMRobertaForMultipleChoice
+    :members:
+
+
 TFXLMRobertaForTokenClassification
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.TFXLMRobertaForTokenClassification
    :members:
+
+
+TFXLMRobertaForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFXLMRobertaForQuestionAnswering
+    :members:
--- a/docs/source/model_doc/xlnet.rst
+++ b/docs/source/model_doc/xlnet.rst
@@ -29,9 +29,11 @@ Tips:
  XLNet is pretrained using only a sub-set of the output tokens as target which are selected
  with the `target_mapping` input.
 - To use XLNet for sequential decoding (i.e. not in fully bi-directional setting), use the `perm_mask` and
-  `target_mapping` inputs to control the attention span and outputs (see examples in `examples/run_generation.py`)
+  `target_mapping` inputs to control the attention span and outputs (see examples in `examples/text-generation/run_generation.py`)
 - XLNet is one of the few models that has no sequence length limit.

+The original code can be found `here <https://github.com/zihangdai/xlnet/>`_.
+

 XLNetConfig
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -69,13 +71,6 @@ XLNetForSequenceClassification
    :members:


-XLNetForTokenClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLNetForTokenClassification
-    :members:
-
-
 XLNetForMultipleChoice
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -83,6 +78,13 @@ XLNetForMultipleChoice
    :members:


+XLNetForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.XLNetForTokenClassification
+    :members:
+
+
 XLNetForQuestionAnsweringSimple
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -118,6 +120,20 @@ TFXLNetForSequenceClassification
    :members:


+TFXLNetForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFXLNetForMultipleChoice
+    :members:
+
+
+TFXLNetForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFXLNetForTokenClassification
+    :members:
+
+
 TFXLNetForQuestionAnsweringSimple
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_sharing.md
+++ b/docs/source/model_sharing.md
@@ -1,55 +0,0 @@
-# Model upload and sharing
-
-Starting with `v2.2.2`, you can now upload and share your fine-tuned models with the community, using the <abbr title="Command-line interface">CLI</abbr> that's built-in to the library.
-
-**First, create an account on [https://huggingface.co/join](https://huggingface.co/join)**. Optionally, join an existing organization or create a new one. Then:
-
-```shell
-transformers-cli login
-# log in using the same credentials as on huggingface.co
-```
-Upload your model:
-```shell
-transformers-cli upload ./path/to/pretrained_model/
-
-# ^^ Upload folder containing weights/tokenizer/config
-# saved via `.save_pretrained()`
-
-transformers-cli upload ./config.json [--filename folder/foobar.json]
-
-# ^^ Upload a single file
-# (you can optionally override its filename, which can be nested inside a folder)
-```
-
-If you want your model to be namespaced by your organization name rather than your username, add the following flag to any command:
-```shell
--organization organization_name
-```
-
-Your model will then be accessible through its identifier, a concatenation of your username (or organization name) and the folder name above:
-```python
-"username/pretrained_model"
-# or if an org:
-"organization_name/pretrained_model"
-```
-
-**Please add a README.md model card** to the repo under `model_cards/` with: model description, training params (dataset, preprocessing, hardware used, hyperparameters), evaluation results, intended uses & limitations, etc.
-
-Your model now has a page on huggingface.co/models 🔥
-
-Anyone can load it from code:
-```python
-tokenizer = AutoTokenizer.from_pretrained("namespace/pretrained_model")
-model = AutoModel.from_pretrained("namespace/pretrained_model")
-```
-
-List all your files on S3:
-```shell
-transformers-cli s3 ls
-```
-
-You can also delete unneeded files:
-
-```shell
-transformers-cli s3 rm …
-```
--- a/docs/source/model_sharing.rst
+++ b/docs/source/model_sharing.rst
@@ -0,0 +1,209 @@
+Model sharing and uploading
+===========================
+
+In this page, we will show you how to share a model you have trained or fine-tuned on new data with the community on
+the `model hub <https://huggingface.co/models>`__.
+
+.. note::
+
+    You will need to create an account on `huggingface.co <https://huggingface.co/join>`__ for this.
+
+    Optionally, you can join an existing organization or create a new one.
+
+Prepare your model for uploading
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+We have seen in the :doc:`training tutorial <training>` how to fine-tune a model on a given task. You have probably
+done something similar on your task, either using the model directly in your own training loop or using the
+:class:`~.transformers.Trainer`/:class:`~.transformers.TFTrainer` class. Let's see how you can share the result on
+the `model hub <https://huggingface.co/models>`__.
+
+Basic steps
+^^^^^^^^^^^
+
+.. 
+    When #5258 is merged, we can remove the need to create the directory.
+
+First, pick a directory with the name you want your model to have on the model hub (its full name will then be
+`username/awesome-name-you-picked` or `organization/awesome-name-you-picked`) and create it with either
+
+::
+
+    mkdir path/to/awesome-name-you-picked
+
+or in python
+
+::
+
+    import os
+    os.makedirs("path/to/awesome-name-you-picked")
+
+then you can save your model and tokenizer with:
+
+::
+
+    model.save_pretrained("path/to/awesome-name-you-picked")
+    tokenizer.save_pretrained("path/to/awesome-name-you-picked")
+
+Or, if you're using the Trainer API
+
+::
+
+    trainer.save_model("path/to/awesome-name-you-picked")
+    tokenizer.save_pretrained("path/to/awesome-name-you-picked")
+
+Make your model work on all frameworks
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. 
+    TODO Sylvain: make this automatic during the upload
+
+You probably have your favorite framework, but so will other users! That's why it's best to upload your model with both
+PyTorch `and` TensorFlow checkpoints to make it easier to use (if you skip this step, users will still be able to load
+your model in another framework, but it will be slower, as it will have to be converted on the fly). Don't worry, it's super easy to do (and in a future version,
+it will all be automatic). You will need to install both PyTorch and TensorFlow for this step, but you don't need to
+worry about the GPU, so it should be very easy. Check the
+`TensorFlow installation page <https://www.tensorflow.org/install/pip#tensorflow-2.0-rc-is-available>`__ 
+and/or the `PyTorch installation page <https://pytorch.org/get-started/locally/#start-locally>`__ to see how.
+
+First check that your model class exists in the other framework, that is try to import the same model by either adding
+or removing TF. For instance, if you trained a :class:`~transformers.DistilBertForSequenceClassification`, try to
+type
+
+::
+
+    from transformers import TFDistilBertForSequenceClassification
+
+and if you trained a :class:`~transformers.TFDistilBertForSequenceClassification`, try to
+type
+
+::
+
+    from transformers import DistilBertForSequenceClassification
+
+This will give back an error if your model does not exist in the other framework (something that should be pretty rare
+since we're aiming for full parity between the two frameworks). In this case, skip this and go to the next step.
+
+Now, if you trained your model in PyTorch and have to create a TensorFlow version, adapt the following code to your
+model class:
+
+::
+
+    tf_model = TFDistilBertForSequenceClassification.from_pretrained("path/to/awesome-name-you-picked", from_pt=True)
+    tf_model.save_pretrained("path/to/awesome-name-you-picked")
+
+and if you trained your model in TensorFlow and have to create a PyTorch version, adapt the following code to your
+model class:
+
+::
+
+    pt_model = DistilBertForSequenceClassification.from_pretrained("path/to/awesome-name-you-picked", from_tf=True)
+    pt_model.save_pretrained("path/to/awesome-name-you-picked")
+
+That's all there is to it!
+
+Check the directory before uploading
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Make sure there are no garbage files in the directory you'll upload. It should only have:
+
+- a `config.json` file, which saves the :doc:`configuration <main_classes/configuration>` of your model ;
+- a `pytorch_model.bin` file, which is the PyTorch checkpoint (unless you can't have it for some reason) ;
+- a `tf_model.h5` file, which is the TensorFlow checkpoint (unless you can't have it for some reason) ;
+- a `special_tokens_map.json`, which is part of your :doc:`tokenizer <main_classes/tokenizer>` save;
+- a `tokenizer_config.json`, which is part of your :doc:`tokenizer <main_classes/tokenizer>` save;
+- a `vocab.txt`, which is the vocabulary of your tokenizer, part of your :doc:`tokenizer <main_classes/tokenizer>`
+  save;
+- maybe an `added_tokens.json`, which is part of your :doc:`tokenizer <main_classes/tokenizer>` save.
+
+Other files can safely be deleted.
+
+Upload your model with the CLI
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Now go in a terminal and run the following command. It should be in the virtual environment where you installed 🤗
+Transformers, since that command :obj:`transformers-cli` comes from the library.
+
+::
+
+    transformers-cli login
+
+Then log in using the same credentials as on huggingface.co. To upload your model, just type
+
+::
+
+    transformers-cli upload path/to/awesome-name-you-picked/
+
+This will upload the folder containing the weights, tokenizer and configuration we prepared in the previous section.
+
+If you want to upload a single file (a new version of your model, or the other framework checkpoint you want to add),
+just type:
+
+::
+
+    transformers-cli upload path/to/awesome-name-you-picked/that-file 
+
+or
+
+::
+
+   transformers-cli upload path/to/awesome-name-you-picked/that-file --filename awesome-name-you-picked/new_name
+
+if you want to change its filename.
+
+This uploads the model to your personal account. If you want your model to be namespaced by your organization name
+rather than your username, add the following flag to any command:
+
+::
+
+    --organization organization_name
+
+so for instance:
+
+::
+
+    transformers-cli upload path/to/awesome-name-you-picked/ --organization organization_name
+
+Your model will then be accessible through its identifier, which is, as we saw above,
+`username/awesome-name-you-picked` or `organization/awesome-name-you-picked`.
+
+Add a model card
+^^^^^^^^^^^^^^^^
+
+To make sure everyone knows what your model can do, what its limitations are, and any potential bias or ethical
+considerations, please add a README.md model card to the 🤗 Transformers repo under `model_cards/`. It should be named
+`README.md` and follow `this template <https://github.com/huggingface/model_card>`__.
+
+If your model is fine-tuned from another model coming from the model hub (all 🤗 Transformers pretrained models do),
+don't forget to link to its model card so that people can fully trace how your model was built.
+
+If you have never made a pull request to the 🤗 Transformers repo, look at the
+:doc:`contributing guide <contributing>` to see the steps to follow.
+
+Using your model
+^^^^^^^^^^^^^^^^
+
+Your model now has a page on huggingface.co/models 🔥
+
+Anyone can load it from code:
+
+::
+
+    tokenizer = AutoTokenizer.from_pretrained("namespace/awesome-name-you-picked")
+    model = AutoModel.from_pretrained("namespace/awesome-name-you-picked")
+
+Additional commands
+^^^^^^^^^^^^^^^^^^^
+
+You can list all the files you uploaded on the hub like this:
+
+::
+
+    transformers-cli s3 ls
+
+You can also delete unneeded files with
+
+::
+
+    transformers-cli s3 rm awesome-name-you-picked/filename
+
--- a/docs/source/model_summary.rst
+++ b/docs/source/model_summary.rst
@@ -0,0 +1,618 @@
+Summary of the models
+================================================
+
+This is a summary of the models available in 🤗 Transformers. It assumes you’re familiar with the original 
+`transformer model <https://arxiv.org/abs/1706.03762>`_. For a gentle introduction check the `annotated transformer 
+<http://nlp.seas.harvard.edu/2018/04/03/attention.html>`_. Here we focus on the high-level differences between the
+models. You can check them more in detail in their respective documentation. Also check out the 
+:doc:`pretrained model page </pretrained_models>` to see the checkpoints available for each type of model and all `the 
+community models <https://huggingface.co/models>`_.
+
+Each one of the models in the library falls into one of the following categories:
+
+  * :ref:`autoregressive-models`
+  * :ref:`autoencoding-models`
+  * :ref:`seq-to-seq-models`
+  * :ref:`multimodal-models`
+
+Autoregressive models are pretrained on the classic language modeling task: guess the next token having read all the 
+previous ones. They correspond to the decoder of the original transformer model, and a mask is used on top of the full 
+sentence so that the attention heads can only see what was before in the text, and not what’s after. Although those 
+models can be fine-tuned and achieve great results on many tasks, the most natural application is text generation. 
+A typical example of such models is GPT.
+
+Autoencoding models are pretrained by corrupting the input tokens in some way and trying to reconstruct the original 
+sentence. They correspond to the encoder of the original transformer model in the sense that they get access to the 
+full inputs without any mask. Those models usually build a bidirectional representation of the whole sentence. They can 
+be fine-tuned and achieve great results on many tasks such as text generation, but their most natural application is 
+sentence classification or token classification. A typical example of such models is BERT.
+
+Note that the only difference between autoregressive models and autoencoding models is in the way the model is 
+pretrained. Therefore, the same architecture can be used for both autoregressive and autoencoding models. When a given
+model has been used for both types of pretraining, we have put it in the category corresponding to the article where it
+was first introduced.
+
+Sequence-to-sequence models use both the encoder and the decoder of the original transformer, either for translation 
+tasks or by transforming other tasks to sequence-to-sequence problems. They can be fine-tuned to many tasks but their 
+most natural applications are translation, summarization and question answering. The original transformer model is an 
+example of such a model (only for translation), T5 is an example that can be fine-tuned on other tasks.
+
+Multimodal models mix text inputs with other kinds (like image) and are more specific to a given task.
+
+.. _autoregressive-models:
+
+Autoregressive models
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+As mentioned before, these models rely on the decoder part of the original transformer and use an attention mask so 
+that at each position, the model can only look at the tokens before in the attention heads.
+
+Original GPT
+----------------------------------------------
+
+.. raw:: html
+
+   <a href="https://huggingface.co/models?filter=openai-gpt">
+       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-openai--gpt-blueviolet">
+   </a>
+   <a href="/model_doc/gpt">
+       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-openai--gpt-blueviolet">
+   </a>
+
+`Improving Language Understanding by Generative Pre-Training <https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf>`_, 
+Alec Radford et al.
+
+The first autoregressive model based on the transformer architecture, pretrained on the Book Corpus dataset.
+
+The library provides versions of the model for language modeling and multitask language modeling/multiple choice 
+classification.
+
+GPT-2
+----------------------------------------------
+
+.. raw:: html
+
+   <a href="https://huggingface.co/models?filter=gpt2">
+       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-gpt2-blueviolet">
+   </a>
+   <a href="/model_doc/gpt2">
+       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-gpt2-blueviolet">
+   </a>
+
+`Language Models are Unsupervised Multitask Learners <https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf>`_, 
+Alec Radford et al.
+
+A bigger and better version of GPT, pretrained on WebText (web pages from outgoing links in Reddit with 3 karmas or 
+more).
+
+The library provides versions of the model for language modeling and multitask language modeling/multiple choice 
+classification.
+
+CTRL
+----------------------------------------------
+
+.. raw:: html
+
+   <a href="https://huggingface.co/models?filter=ctrl">
+       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-ctrl-blueviolet">
+   </a>
+   <a href="/model_doc/ctrl">
+       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-ctrl-blueviolet">
+   </a>
+
+`CTRL: A Conditional Transformer Language Model for Controllable Generation <https://arxiv.org/abs/1909.05858>`_, 
+Nitish Shirish Keskar et al.
+
+Same as the GPT model but adds the idea of control codes. Text is generated from a prompt (can be empty) and one (or 
+several) of those control codes which are then used to influence the text generation: generate with the style of 
+a Wikipedia article, a book or a movie review.
+
+The library provides a version of the model for language modeling only.
+
+Transformer-XL
+----------------------------------------------
+
+.. raw:: html
+
+   <a href="https://huggingface.co/models?filter=transfo-xl">
+       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-transfo--xl-blueviolet">
+   </a>
+   <a href="/model_doc/transformerxl">
+       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-transfo--xl-blueviolet">
+   </a>
+
+`Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context <https://arxiv.org/abs/1901.02860>`_, 
+Zihang Dai et al.
+
+Same as a regular GPT model, but introduces a recurrence mechanism for two consecutive segments (similar to regular 
+RNNs with two consecutive inputs). In this context, a segment is a number of consecutive tokens (for instance 512) that 
+may span across multiple documents, and segments are fed in order to the model.
+
+Basically, the hidden states of the previous segment are concatenated to the current input to compute the attention 
+scores. This allows the model to pay attention to information that was in the previous segment as well as the current 
+one. By stacking multiple attention layers, the receptive field can be increased to multiple previous segments.
+
+This changes the positional embeddings to positional relative embeddings (as the regular positional embeddings would 
+give the same results in the current input and the current hidden state at a given position) and needs to make some 
+adjustments in the way attention scores are computed.
+
+The library provides a version of the model for language modeling only.
+
+.. _reformer:
+
+Reformer
+----------------------------------------------
+
+.. raw:: html
+
+   <a href="https://huggingface.co/models?filter=reformer">
+       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-reformer-blueviolet">
+   </a>
+   <a href="/model_doc/reformer">
+       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-reformer-blueviolet">
+   </a>
+
+`Reformer: The Efficient Transformer <https://arxiv.org/abs/2001.04451>`_,
+Nikita Kitaev et al.
+
+An autoregressive transformer model with lots of tricks to reduce memory footprint and compute time. Those tricks 
+include:
+
+  * Use :ref:`Axial position encoding <axial-pos-encoding>` (see below for more details). It’s a mechanism to avoid 
+    having a huge positional encoding matrix (when the sequence length is very big) by factorizing it in smaller 
+    matrices.
+  * Replace traditional attention by :ref:`LSH (locality-sensitive hashing) attention <lsh-attention>` (see below for more 
+    details). It's a technique to avoid computing the full product query-key in the attention layers.
+  * Avoid storing the intermediate results of each layer by using reversible transformer layers to obtain them during 
+    the backward pass (subtracting the residuals from the input of the next layer gives them back) or recomputing them 
+    for results inside a given layer (less efficient than storing them but saves memory).
+  * Compute the feedforward operations by chunks and not on the whole batch.
+
+With those tricks, the model can be fed much larger sentences than traditional transformer autoregressive models.
+
+**Note:** This model could very well be used in an autoencoding setting, there is no checkpoint for such a
+pretraining yet, though.
+
+The library provides a version of the model for language modeling only.
+
+XLNet
+----------------------------------------------
+
+.. raw:: html
+
+   <a href="https://huggingface.co/models?filter=xlnet">
+       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-xlnet-blueviolet">
+   </a>
+   <a href="/model_doc/xlnet">
+       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-xlnet-blueviolet">
+   </a>
+
+`XLNet: Generalized Autoregressive Pretraining for Language Understanding <https://arxiv.org/abs/1906.08237>`_,
+Zhilin Yang et al.
+
+XLNet is not a traditional autoregressive model but uses a training strategy that builds on that. It permutes the 
+tokens in the sentence, then allows the model to use the last n tokens to predict the token n+1. Since this is all done 
+with a mask, the sentence is actually fed in the model in the right order, but instead of masking the first n tokens 
+for n+1, XLNet uses a mask that hides the previous tokens in some given permutation of 1,...,sequence length.
+
+XLNet also uses the same recurrence mechanism as TransformerXL to build long-term dependencies. 
+
+The library provides a version of the model for language modeling, token classification, sentence classification, 
+multiple choice classification and question answering.
+
+.. _autoencoding-models:
+
+Autoencoding models
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+As mentioned before, these models rely on the encoder part of the original transformer and use no mask so the model can
+look at all the tokens in the attention heads. For pretraining, inputs are a corrupted version of the sentence, usually 
+obtained by masking tokens, and targets are the original sentences.
+
+BERT
+----------------------------------------------
+
+.. raw:: html
+
+   <a href="https://huggingface.co/models?filter=bert">
+       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-bert-blueviolet">
+   </a>
+   <a href="/model_doc/bert">
+       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-bert-blueviolet">
+   </a>
+
+`BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding <https://arxiv.org/abs/1810.04805>`_,
+Jacob Devlin et al.
+
+Corrupts the inputs by using random masking, more precisely, during pretraining, a given percentage of tokens (usually 
+15%) are masked by
+ 
+  * a special mask token with probability 0.8
+  * a random token different from the one masked with probability 0.1
+  * the same token with probability 0.1
+
+The model must predict the original sentence, but has a second objective: inputs are two sentences A and B (with a 
+separation token in between). With probability 50%, the sentences are consecutive in the corpus, in the remaining 50% 
+they are not related. The model has to predict if the sentences are consecutive or not.
+
+The library provides a version of the model for language modeling (traditional or masked), next sentence prediction, 
+token classification, sentence classification, multiple choice classification and question answering.
+
+ALBERT
+----------------------------------------------
+
+.. raw:: html
+
+   <a href="https://huggingface.co/models?filter=albert">
+       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-albert-blueviolet">
+   </a>
+   <a href="/model_doc/albert">
+       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-albert-blueviolet">
+   </a>
+
+`ALBERT: A Lite BERT for Self-supervised Learning of Language Representations <https://arxiv.org/abs/1909.11942>`_,
+Zhenzhong Lan et al.
+
+Same as BERT but with a few tweaks:
+
+  * Embedding size E is different from hidden size H justified because the embeddings are context independent (one 
+    embedding vector represents one token) whereas hidden states are context dependent (one hidden state represents a 
+    sequence of tokens) so it's more logical to have H >> E. Also, the embedding matrix is large since it's V x E (V
+    being the vocab size). If E < H, it has fewer parameters.
+  * Layers are split in groups that share parameters (to save memory).
+  * Next sentence prediction is replaced by a sentence ordering prediction: in the inputs, we have two sentences A and B
+    (that are consecutive) and we either feed A followed by B or B followed by A. The model must predict if they have 
+    been swapped or not.
+
+The library provides a version of the model for masked language modeling, token classification, sentence 
+classification, multiple choice classification and question answering.
+
+RoBERTa
+----------------------------------------------
+
+.. raw:: html
+
+   <a href="https://huggingface.co/models?filter=roberta">
+       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-roberta-blueviolet">
+   </a>
+   <a href="/model_doc/roberta">
+       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-roberta-blueviolet">
+   </a>
+
+`RoBERTa: A Robustly Optimized BERT Pretraining Approach <https://arxiv.org/abs/1907.11692>`_,
+Yinhan Liu et al.
+
+Same as BERT with better pretraining tricks:
+
+  * dynamic masking: tokens are masked differently at each epoch whereas BERT does it once and for all
+  * no NSP (next sentence prediction) loss and instead of putting just two sentences together, put a chunk of 
+    contiguous texts together to reach 512 tokens (so the sentences are in an order that may span several documents)
+  * train with larger batches
+  * use BPE with bytes as a subunit and not characters (because of unicode characters)
+
+The library provides a version of the model for masked language modeling, token classification, sentence 
+classification, multiple choice classification and question answering.
+
+DistilBERT
+----------------------------------------------
+
+.. raw:: html
+
+   <a href="https://huggingface.co/models?filter=distilbert">
+       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-distilbert-blueviolet">
+   </a>
+   <a href="/model_doc/distilbert">
+       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-distilbert-blueviolet">
+   </a>
+
+`DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter <https://arxiv.org/abs/1910.01108>`_,
+Victor Sanh et al.
+
+Same as BERT but smaller. Trained by distillation of the pretrained BERT model, meaning it's been trained to predict 
+the same probabilities as the larger model. The actual objective is a combination of:
+
+  * finding the same probabilities as the teacher model
+  * predicting the masked tokens correctly (but no next-sentence objective)
+  * a cosine similarity between the hidden states of the student and the teacher model
+
+The library provides a version of the model for masked language modeling, token classification, sentence classification 
+and question answering.
+
+XLM
+----------------------------------------------
+
+.. raw:: html
+
+   <a href="https://huggingface.co/models?filter=xlm">
+       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-xlm-blueviolet">
+   </a>
+   <a href="/model_doc/xlm">
+       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-xlm-blueviolet">
+   </a>
+
+`Cross-lingual Language Model Pretraining <https://arxiv.org/abs/1901.07291>`_, Guillaume Lample and Alexis Conneau
+
+A transformer model trained on several languages. There are three different types of training for this model and the
+library provides checkpoints for all of them:
+
+  * Causal language modeling (CLM) which is the traditional autoregressive training (so this model could be in the 
+    previous section as well). One of the languages is selected for each training sample, and the model input is a 
+    sentence of 256 tokens that may span several documents in one of those languages.
+  * Masked language modeling (MLM) which is like RoBERTa. One of the languages is selected for each training sample, 
+    and the model input is a sentence of 256 tokens that may span several documents in one of those languages, with
+    dynamic masking of the tokens.
+  * A combination of MLM and translation language modeling (TLM). This consists of concatenating a sentence in two 
+    different languages, with random masking. To predict one of the masked tokens, the model can use both the
+    surrounding context in language 1 as well as the context given by language 2.
+
+Checkpoints refer to which method was used for pretraining by having `clm`, `mlm` or `mlm-tlm` in their names. On top
+of positional embeddings, the model has language embeddings. When training using MLM/CLM, this gives the model an
+indication of the language used, and when training using MLM+TLM, an indication of which part of the input is in which
+language.
+
+The library provides a version of the model for language modeling, token classification, sentence classification and 
+question answering.
+
+XLM-RoBERTa
+----------------------------------------------
+
+.. raw:: html
+
+   <a href="https://huggingface.co/models?filter=xlm-roberta">
+       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-xlm--roberta-blueviolet">
+   </a>
+   <a href="/model_doc/xlmroberta">
+       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-xlm--roberta-blueviolet">
+   </a>
+
+`Unsupervised Cross-lingual Representation Learning at Scale <https://arxiv.org/abs/1911.02116>`_, Alexis Conneau et 
+al.
+
+Uses RoBERTa tricks on the XLM approach, but does not use the translation language modeling objective, only using 
+masked language modeling on sentences coming from one language. However, the model is trained on many more languages 
+(100) and doesn't use the language embeddings, so it's capable of detecting the input language by itself.
+
+The library provides a version of the model for masked language modeling, token classification, sentence 
+classification, multiple choice classification and question answering.
+
+FlauBERT
+----------------------------------------------
+
+.. raw:: html
+
+   <a href="https://huggingface.co/models?filter=flaubert">
+       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-flaubert-blueviolet">
+   </a>
+   <a href="/model_doc/flaubert">
+       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-flaubert-blueviolet">
+   </a>
+
+`FlauBERT: Unsupervised Language Model Pre-training for French <https://arxiv.org/abs/1912.05372>`_, Hang Le et al.
+
+Like RoBERTa, without the sentence ordering prediction (so just trained on the MLM objective).
+
+The library provides a version of the model for language modeling and sentence classification.
+
+ELECTRA
+----------------------------------------------
+
+.. raw:: html
+
+   <a href="https://huggingface.co/models?filter=electra">
+       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-electra-blueviolet">
+   </a>
+   <a href="/model_doc/electra">
+       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-electra-blueviolet">
+   </a>
+
+`ELECTRA: Pre-training Text Encoders as Discriminators Rather Than Generators <https://arxiv.org/abs/2003.10555>`_, 
+Kevin Clark et al.
+
+ELECTRA is a transformer model pretrained with the use of another (small) masked language model. The inputs are 
+corrupted by that language model, which takes an input text that is randomly masked and outputs a text in which ELECTRA 
+has to predict which token is an original and which one has been replaced. Like for GAN training, the small language 
+model is trained for a few steps (but with the original texts as objective, not to fool the ELECTRA model like in a 
+traditional GAN setting) then the ELECTRA model is trained for a few steps.
+
+The library provides a version of the model for masked language modeling, token classification and sentence 
+classification.
+
+.. _longformer:
+
+Longformer
+----------------------------------------------
+
+.. raw:: html
+
+   <a href="https://huggingface.co/models?filter=longformer">
+       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-longformer-blueviolet">
+   </a>
+   <a href="/model_doc/longformer">
+       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-longformer-blueviolet">
+   </a>
+
+`Longformer: The Long-Document Transformer <https://arxiv.org/abs/2004.05150>`_, Iz Beltagy et al.
+
+A transformer model replacing the attention matrices by sparse matrices to go faster. Often, the local context (e.g., 
+what are the two tokens left and right?) is enough to take action for a given token. Some preselected input tokens are 
+still given global attention, but the attention matrix has way less parameters, resulting in a speed-up. See the 
+:ref:`local attention section <local-attention>` for more information.
+
+It is pretrained the same way as RoBERTa otherwise.
+
+**Note:** This model could very well be used in an autoregressive setting, there is no checkpoint for such a
+pretraining yet, though.
+
+The library provides a version of the model for masked language modeling, token classification, sentence 
+classification, multiple choice classification and question answering.
+
+.. _seq-to-seq-models:
+
+Sequence-to-sequence models
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+As mentioned before, these models keep both the encoder and the decoder of the original transformer.
+
+BART
+----------------------------------------------
+
+.. raw:: html
+
+   <a href="https://huggingface.co/models?filter=bart">
+       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-bart-blueviolet">
+   </a>
+   <a href="/model_doc/bart">
+       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-bart-blueviolet">
+   </a>
+
+`BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension 
+<https://arxiv.org/abs/1910.13461>`_, Mike Lewis et al.
+
+Sequence-to-sequence model with an encoder and a decoder. Encoder is fed a corrupted version of the tokens, decoder is 
+fed the tokens (but has a mask to hide the future words like a regular transformers decoder). For the encoder, on the 
+pretraining tasks, a composition of the following transformations is applied:
+
+  * mask random tokens (like in BERT)
+  * delete random tokens
+  * mask a span of k tokens with a single mask token (a span of 0 tokens is an insertion of a mask token)
+  * permute sentences
+  * rotate the document to make it start by a specific token
+
+The library provides a version of this model for conditional generation and sequence classification.
+
+MarianMT
+----------------------------------------------
+
+.. raw:: html
+
+   <a href="https://huggingface.co/models?filter=marian">
+       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-marian-blueviolet">
+   </a>
+   <a href="/model_doc/marian">
+       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-marian-blueviolet">
+   </a>
+
+`Marian: Fast Neural Machine Translation in C++ <https://arxiv.org/abs/1804.00344>`_, Marcin Junczys-Dowmunt et al.
+
+A framework for translation models, using the same models as BART
+
+The library provides a version of this model for conditional generation.
+
+T5
+----------------------------------------------
+
+.. raw:: html
+
+   <a href="https://huggingface.co/models?filter=t5">
+       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-t5-blueviolet">
+   </a>
+   <a href="/model_doc/t5">
+       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-t5-blueviolet">
+   </a>
+
+`Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer <https://arxiv.org/abs/1910.10683>`_, 
+Colin Raffel et al.
+
+Uses the traditional transformer model (except a slight change with the positional embeddings, which are learned at 
+each layer). To be able to operate on all NLP tasks, it transforms them in text-to-text problems by using certain 
+prefixes: “Summarize: …”, “question: …”, “translate English to German: …” and so forth.
+
+The pretraining includes both supervised and self-supervised training. Supervised training is conducted on downstream 
+tasks provided by the GLUE and SuperGLUE benchmarks (changing them to text-to-text tasks as explained above).
+
+Self-supervised training uses corrupted tokens, which means randomly removing 15% of the tokens and
+replacing them by individual sentinel tokens (if several consecutive tokens are marked for removal, they are replaced 
+by one single sentinel token). The input of the encoder is the corrupted sentence, the input of the decoder the 
+original sentence and the target is then the dropped out tokens delimited by their sentinel tokens.
+
+For instance, if we have the sentence “My dog is very cute .”, and we decide to remove the tokens dog, is and cute, the
+input becomes “My <x> very <y> .” and the target is “<x> dog is <y> cute <z>”.
+
+The library provides a version of this model for conditional generation.
+
+.. _multimodal-models:
+
+Multimodal models
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+There is one multimodal model in the library which has not been pretrained in the self-supervised fashion like the 
+others.
+
+MMBT
+----------------------------------------------
+
+`Supervised Multimodal Bitransformers for Classifying Images and Text <https://arxiv.org/abs/1909.02950>`_, Douwe Kiela 
+et al.
+
+A transformers model used in multimodal settings, combining a text and an image to make predictions. The transformer 
+model takes as inputs the embeddings of the tokenized text and the final activations of a pretrained resnet on the
+images (after the pooling layer) that go through a linear layer (to go from number of features at the end of the
+resnet to the hidden state dimension of the transformer).
+
+The different inputs are concatenated, and on top of the positional embeddings, a segment embedding is added to let the 
+model know which part of the input vector corresponds to the text or the image.
+
+The pretrained model only works for classification.
+
+..
+    More information in this :doc:`model documentation </model_doc/mmbt>`.
+    TODO: write this page
+
+More technical aspects
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Full vs sparse attention
+----------------------------------------------
+
+Most transformer models use full attention in the sense that the attention matrix is square. It can be a big 
+computational bottleneck when you have long texts. Longformer and reformer are models that try to be more efficient and 
+use a sparse version of the attention matrix to speed up training.
+
+.. _lsh-attention:
+
+**LSH attention**
+
+:ref:`Reformer <reformer>` uses LSH attention. In the softmax(QK^t), only the biggest elements (in the softmax 
+dimension) of the matrix QK^t are going to give useful contributions. So for each query q in Q, we can only consider 
+the keys k in K that are close to q. A hash function is used to determine if q and k are close. The attention mask is 
+modified to mask the current token (except at the first position) because it will give a query and a key that are equal
+(so very similar to each other). Since the hash can be a bit random, several hash functions are used in practice
+(determined by an n_rounds parameter) and then are averaged together.
+
+.. _local-attention:
+
+**Local attention**
+
+:ref:`Longformer <longformer>` uses local attention: often, the local context (e.g., what are the two tokens left and 
+right?) is enough to take action for a given token. Also, by stacking attention layers that have a small window, the 
+last layer will have a receptive field of more than just the tokens on the window, allowing them to build a 
+representation of the whole sentence.
+
+Some preselected input tokens are also given global attention: for those few tokens, the attention matrix can access 
+all tokens and this process is symmetric: all other tokens have access to those specific tokens (on top of the ones in 
+their local window). This is shown in Figure 2d of the paper, see below for a sample attention mask:
+
+.. image:: imgs/local_attention_mask.png
+   :scale: 50 %
+   :align: center
+
+Using those attention matrices with fewer parameters then allows the model to have inputs having a bigger sequence
+length.
+
+Other tricks
+----------------------------------------------
+
+.. _axial-pos-encoding:
+
+**Axial positional encodings**
+
+:ref:`Reformer <reformer>` uses axial positional encodings: in traditional transformer models, the positional encoding 
+E is a matrix of size :math:`l` by :math:`d`, :math:`l` being the sequence length and :math:`d` the dimension of the 
+hidden state. If you have very long texts, this matrix can be huge and take way too much space on the GPU.
+
+To alleviate that, axial positional encodings consist in factorizing that big matrix E in two smaller matrices E1 and
+E2, with dimensions :math:`l_{1} \times d_{1}` and :math:`l_{2} \times d_{2}`, such that :math:`l_{1} \times l_{2} = l`
+and :math:`d_{1} + d_{2} = d` (with the product for the lengths, this ends up being way smaller). The embedding for 
+time step :math:`j` in E is obtained by concatenating the embeddings for time step :math:`j \% l_{1}` in E1 and
+:math:`j // l_{1}` in E2.
+
--- a/docs/source/multilingual.rst
+++ b/docs/source/multilingual.rst
@@ -36,10 +36,11 @@ Here is an example using the ``xlm-clm-enfr-1024`` checkpoint (Causal language m

 .. code-block::

-    import torch
-    from transformers import XLMTokenizer, XLMWithLMHeadModel
+    >>> import torch
+    >>> from transformers import XLMTokenizer, XLMWithLMHeadModel

-    tokenizer = XLMTokenizer.from_pretrained("xlm-clm-1024-enfr")
+    >>> tokenizer = XLMTokenizer.from_pretrained("xlm-clm-enfr-1024")
+    >>> model = XLMWithLMHeadModel.from_pretrained("xlm-clm-enfr-1024")


 The different languages this model/tokenizer handles, as well as the ids of these languages are visible using the
@@ -47,16 +48,15 @@ The different languages this model/tokenizer handles, as well as the ids of thes

 .. code-block::

-    # Continuation of the previous script
-    print(tokenizer.lang2id)  # {'en': 0, 'fr': 1}
+    >>> print(tokenizer.lang2id)
+    {'en': 0, 'fr': 1}


 These ids should be used when passing a language parameter during a model pass. Let's define our inputs:

 .. code-block::

-    # Continuation of the previous script
-    input_ids = torch.tensor([tokenizer.encode("Wikipedia was used to")]) # batch size of 1
+    >>> input_ids = torch.tensor([tokenizer.encode("Wikipedia was used to")]) # batch size of 1


 We should now define the language embedding by using the previously defined language id. We want to create a tensor
@@ -64,23 +64,21 @@ filled with the appropriate language ids, of the same size as input_ids. For eng

 .. code-block::

-    # Continuation of the previous script
-    language_id = tokenizer.lang2id['en']  # 0
-    langs = torch.tensor([language_id] * input_ids.shape[1])  # torch.tensor([0, 0, 0, ..., 0])
+    >>> language_id = tokenizer.lang2id['en']  # 0
+    >>> langs = torch.tensor([language_id] * input_ids.shape[1])  # torch.tensor([0, 0, 0, ..., 0])

-    # We reshape it to be of size (batch_size, sequence_length)
-    langs = langs.view(1, -1) # is now of shape [1, sequence_length] (we have a batch size of 1)
+    >>> # We reshape it to be of size (batch_size, sequence_length)
+    >>> langs = langs.view(1, -1) # is now of shape [1, sequence_length] (we have a batch size of 1)


 You can then feed it all as input to your model:

 .. code-block::

-    # Continuation of the previous script
-    outputs = model(input_ids, langs=langs)
+    >>> outputs = model(input_ids, langs=langs)


-The example `run_generation.py <https://github.com/huggingface/transformers/blob/master/examples/run_generation.py>`__
+The example `run_generation.py <https://github.com/huggingface/transformers/blob/master/examples/text-generation/run_generation.py>`__
 can generate text using the CLM checkpoints from XLM, using the language embeddings.

 XLM without Language Embeddings
@@ -104,4 +102,16 @@ BERT has two checkpoints that can be used for multi-lingual tasks:
 - ``bert-base-multilingual-cased`` (Masked language modeling + Next sentence prediction, 104 languages)

 These checkpoints do not require language embeddings at inference time. They should identify the language
-used in the context and infer accordingly.
+used in the context and infer accordingly.
+
+XLM-RoBERTa
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+XLM-RoBERTa was trained on 2.5TB of newly created clean CommonCrawl data in 100 languages. It provides strong
+gains over previously released multi-lingual models like mBERT or XLM on downstream tasks like classification,
+sequence labeling and question answering.
+
+Two XLM-RoBERTa checkpoints can be used for multi-lingual tasks:
+
+- ``xlm-roberta-base`` (Masked language modeling, 100 languages)
+- ``xlm-roberta-large`` (Masked language modeling, 100 languages)
--- a/docs/source/notebooks.md
+++ b/docs/source/notebooks.md
@@ -0,0 +1 @@
+../../notebooks/README.md
--- a/docs/source/notebooks.rst
+++ b/docs/source/notebooks.rst
@@ -1,16 +0,0 @@
-Notebooks
-================================================
-
-We include `three Jupyter Notebooks <https://github.com/huggingface/transformers/tree/master/notebooks>`_ that can be used to check that the predictions of the PyTorch model are identical to the predictions of the original TensorFlow model.
-
-
-*
-  The first NoteBook (\ `Comparing-TF-and-PT-models.ipynb <https://github.com/huggingface/transformers/blob/master/notebooks/Comparing-TF-and-PT-models.ipynb>`_\ ) extracts the hidden states of a full sequence on each layers of the TensorFlow and the PyTorch models and computes the standard deviation between them. In the given example, we get a standard deviation of 1.5e-7 to 9e-7 on the various hidden state of the models.
-
-*
-  The second NoteBook (\ `Comparing-TF-and-PT-models-SQuAD.ipynb <https://github.com/huggingface/transformers/blob/master/notebooks/Comparing-TF-and-PT-models-SQuAD.ipynb>`_\ ) compares the loss computed by the TensorFlow and the PyTorch models for identical initialization of the fine-tuning layer of the ``BertForQuestionAnswering`` and computes the standard deviation between them. In the given example, we get a standard deviation of 2.5e-7 between the models.
-
-*
-  The third NoteBook (\ `Comparing-TF-and-PT-models-MLM-NSP.ipynb <https://github.com/huggingface/transformers/blob/master/notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb>`_\ ) compares the predictions computed by the TensorFlow and the PyTorch models for masked token language modeling using the pre-trained masked language modeling model.
-
-Please follow the instructions given in the notebooks to run and modify them.
--- a/docs/source/philosophy.rst
+++ b/docs/source/philosophy.rst
@@ -0,0 +1,73 @@
+Philosophy
+==========
+
+🤗 Transformers is an opinionated library built for:
+
+- NLP researchers and educators seeking to use/study/extend large-scale transformers models
+- hands-on practitioners who want to fine-tune those models and/or serve them in production
+- engineers who just want to download a pretrained model and use it to solve a given NLP task.
+
+The library was designed with two strong goals in mind:
+
+- Be as easy and fast to use as possible:
+
+    - We strongly limited the number of user-facing abstractions to learn, in fact, there are almost no abstractions,
+      just three standard classes required to use each model: :doc:`configuration <main_classes/configuration>`, 
+      :doc:`models <main_classes/model>` and :doc:`tokenizer <main_classes/tokenizer>`.
+    - All of these classes can be initialized in a simple and unified way from pretrained instances by using a common
+      :obj:`from_pretrained()` instantiation method which will take care of downloading (if needed), caching and
+      loading the related class instance and associated data (configurations' hyper-parameters, tokenizers' vocabulary, 
+      and models' weights) from a pretrained checkpoint provided on 
+      `Hugging Face Hub <https://huggingface.co/models>`__ or your own saved checkpoint.
+    - On top of those three base classes, the library provides two APIs: :func:`~transformers.pipeline` for quickly
+      using a model (plus its associated tokenizer and configuration) on a given task and 
+      :func:`~transformers.Trainer`/:func:`~transformers.TFTrainer` to quickly train or fine-tune a given model.
+    - As a consequence, this library is NOT a modular toolbox of building blocks for neural nets. If you want to
+      extend/build-upon the library, just use regular Python/PyTorch/TensorFlow/Keras modules and inherit from the base
+      classes of the library to reuse functionalities like model loading/saving.
+
+- Provide state-of-the-art models with performances as close as possible to the original models:
+
+    - We provide at least one example for each architecture which reproduces a result provided by the official authors
+      of said architecture.
+    - The code is usually as close to the original code base as possible which means some PyTorch code may be not as
+      *pytorchic* as it could be as a result of being converted TensorFlow code and vice versa.
+
+A few other goals:
+
+- Expose the models' internals as consistently as possible:
+
+    - We give access, using a single API, to the full hidden-states and attention weights.
+    - Tokenizer and base model's API are standardized to easily switch between models.
+
+- Incorporate a subjective selection of promising tools for fine-tuning/investigating these models:
+
+    - A simple/consistent way to add new tokens to the vocabulary and embeddings for fine-tuning.
+    - Simple ways to mask and prune transformer heads.
+
+- Switch easily between PyTorch and TensorFlow 2.0, allowing training using one framework and inference using another.
+
+Main concepts
+~~~~~~~~~~~~~
+
+The library is built around three types of classes for each model:
+
+- **Model classes**  such as :class:`~transformers.BertModel`, which are 30+ PyTorch models 
+  (`torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__) or Keras models 
+  (`tf.keras.Model <https://www.tensorflow.org/api_docs/python/tf/keras/Model>`__) that work with the pretrained
+  weights provided in the library.
+- **Configuration classes** such as :class:`~transformers.BertConfig`, which store all the parameters required to build
+  a model. You don't always need to instantiate these yourself. In particular, if you are using a pretrained model
+  without any modification, creating the model will automatically take care of instantiating the configuration (which
+  is part of the model).
+- **Tokenizer classes** such as :class:`~transformers.BertTokenizer`, which store the vocabulary for each model and
+  provide methods for encoding/decoding strings in a list of token embeddings indices to be fed to a model.
+
+All these classes can be instantiated from pretrained instances and saved locally using two methods:
+
+- :obj:`from_pretrained()` lets you instantiate a model/configuration/tokenizer from a pretrained version either
+  provided by the library itself (the supported models are provided in the list :doc:`here <pretrained_models>`)
+  or stored locally (or on a server) by the user,
+- :obj:`save_pretrained()` lets you save a model/configuration/tokenizer locally so that it can be reloaded using
+  :obj:`from_pretrained()`.
+
--- a/docs/source/preprocessing.rst
+++ b/docs/source/preprocessing.rst
@@ -0,0 +1,373 @@
+Preprocessing data
+==================
+
+In this tutorial, we'll explore how to preprocess your data using 🤗 Transformers. The main tool for this is what we
+call a :doc:`tokenizer <main_classes/tokenizer>`. You can build one using the tokenizer class associated to the model
+you would like to use, or directly with the :class:`~transformers.AutoTokenizer` class.
+
+As we saw in the :doc:`quicktour </quicktour>`, the tokenizer will first split a given text into words (or parts of words,
+punctuation symbols, etc.) usually called `tokens`. Then it will convert those `tokens` into numbers, to be able to
+build a tensor out of them and feed them to the model. It will also add any additional inputs the model might expect to
+work properly.
+
+.. note::
+
+    If you plan on using a pretrained model, it's important to use the associated pretrained tokenizer: it will split
+    the text you give it in tokens the same way as for the pretraining corpus, and it will use the same token-to-index
+    correspondence (that we usually call a `vocab`) as during pretraining.
+
+To automatically download the vocab used during pretraining or fine-tuning a given model, you can use the 
+:func:`~transformers.AutoTokenizer.from_pretrained` method:
+
+::
+
+    from transformers import AutoTokenizer
+    tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
+
+Base use
+~~~~~~~~
+
+A :class:`~transformers.PreTrainedTokenizer` has many methods, but the only one you need to remember for preprocessing
+is its ``__call__``: you just need to feed your sentence to your tokenizer object.
+
+::
+
+    encoded_input = tokenizer("Hello, I'm a single sentence!")
+    print(encoded_input)
+
+This will return a dictionary string to list of ints like this one:
+
+::
+
+    {'input_ids': [101, 138, 18696, 155, 1942, 3190, 1144, 1572, 13745, 1104, 159, 9664, 2107, 102], 
+     'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 
+     'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
+
+The `input_ids <glossary.html#input-ids>`__ are the indices corresponding to each token in our sentence. We will see
+below what the `attention_mask <glossary.html#attention-mask>`__ is used for and in
+:ref:`the next section <sentence-pairs>` the goal of `token_type_ids <glossary.html#token-type-ids>`__.
+
+The tokenizer can decode a list of token ids in a proper sentence:
+
+::
+
+    tokenizer.decode(encoded_input["input_ids"])
+
+which should return
+
+::
+
+    "[CLS] Hello, I'm a single sentence! [SEP]"
+
+As you can see, the tokenizer automatically added some special tokens that the model expects. Not all models need
+special tokens; for instance, if we had used `gpt2-medium` instead of `bert-base-cased` to create our tokenizer, we
+would have seen the same sentence as the original one here. You can disable this behavior (which is only advised if you
+have added those special tokens yourself) by passing ``add_special_tokens=False``.
+
+If you have several sentences you want to process, you can do this efficiently by sending them as a list to the
+tokenizer:
+
+::
+
+    batch_sentences = ["Hello I'm a single sentence",
+                       "And another sentence",
+                       "And the very very last one"]
+    encoded_inputs = tokenizer(batch_sentences)
+    print(encoded_inputs)
+
+We get back a dictionary once again, this time with values being lists of lists of ints:
+
+::
+
+    {'input_ids': [[101, 8667, 146, 112, 182, 170, 1423, 5650, 102],
+                   [101, 1262, 1330, 5650, 102],
+                   [101, 1262, 1103, 1304, 1304, 1314, 1141, 102]],
+     'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0],
+                        [0, 0, 0, 0, 0],
+                        [0, 0, 0, 0, 0, 0, 0, 0]],
+     'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1],
+                        [1, 1, 1, 1, 1],
+                        [1, 1, 1, 1, 1, 1, 1, 1]]}
+
+If the purpose of sending several sentences at a time to the tokenizer is to build a batch to feed the model, you will
+probably want:
+
+- To pad each sentence to the maximum length there is in your batch.
+- To truncate each sentence to the maximum length the model can accept (if applicable).
+- To return tensors.
+
+You can do all of this by using the following options when feeding your list of sentences to the tokenizer:
+
+::
+
+    ## PYTORCH CODE
+    batch = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="pt")
+    print(batch)
+    ## TENSORFLOW CODE
+    batch = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="tf")
+    print(batch)
+
+which should now return a dictionary string to tensor like this:
+
+::
+
+    {'input_ids': tensor([[ 101, 8667,  146,  112,  182,  170, 1423, 5650,  102],
+                          [ 101, 1262, 1330, 5650,  102,    0,    0,    0,    0],
+                          [ 101, 1262, 1103, 1304, 1304, 1314, 1141,  102,    0]]),
+     'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0],
+                               [0, 0, 0, 0, 0, 0, 0, 0, 0],
+                               [0, 0, 0, 0, 0, 0, 0, 0, 0]]), 
+     'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1],
+                               [1, 1, 1, 1, 1, 0, 0, 0, 0],
+                               [1, 1, 1, 1, 1, 1, 1, 1, 0]])}
+
+We can now see what the `attention_mask <glossary.html#attention-mask>`__ is all about: it points out which tokens the
+model should pay attention to and which ones it should not (because they represent padding in this case).
+
+
+Note that if your model does not have a maximum length associated to it, the command above will throw a warning. You
+can safely ignore it. You can also pass ``verbose=False`` to stop the tokenizer from throwing those kinds of warnings.
+
+.. _sentence-pairs:
+
+Preprocessing pairs of sentences
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Sometimes you need to feed pairs of sentences to your model. For instance, if you want to classify if two sentences in a
+pair are similar, or for question-answering models, which take a context and a question. For BERT models, the input is
+then represented like this:
+
+::
+
+    [CLS] Sequence A [SEP] Sequence B [SEP]
+
+You can encode a pair of sentences in the format expected by your model by supplying the two sentences as two arguments
+(not a list since a list of two sentences will be interpreted as a batch of two single sentences, as we saw before).
+
+
+::
+
+    encoded_input = tokenizer("How old are you?", "I'm 6 years old")
+    print(encoded_input)
+
+This will once again return a dict string to list of ints:
+
+::
+
+    {'input_ids': [101, 1731, 1385, 1132, 1128, 136, 102, 146, 112, 182, 127, 1201, 1385, 102], 
+     'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1], 
+     'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
+
+This shows us what the `token_type_ids <glossary.html#token-type-ids>`__ are for: they indicate to the model which part
+of the inputs corresponds to the first sentence and which part corresponds to the second sentence. Note that
+`token_type_ids` are not required or handled by all models. By default, a tokenizer will only return the inputs that
+its associated model expects. You can force the return (or the non-return) of any of those special arguments by
+using ``return_attention_mask`` or ``return_token_type_ids``.
+
+If we decode the token ids we obtained, we will see that the special tokens have been properly added.
+
+::
+
+    tokenizer.decode(encoded_input["input_ids"])
+
+will return:
+
+::
+
+    "[CLS] How old are you? [SEP] I'm 6 years old [SEP]"
+
+If you have a list of pairs of sequences you want to process, you should feed them as two lists to your tokenizer: the
+list of first sentences and the list of second sentences:
+
+::
+
+    batch_sentences = ["Hello I'm a single sentence",
+                       "And another sentence",
+                       "And the very very last one"]
+    batch_of_second_sentences = ["I'm a sentence that goes with the first sentence",
+                                 "And I should be encoded with the second sentence",
+                                 "And I go with the very last one"]
+    encoded_inputs = tokenizer(batch_sentences, batch_of_second_sentences)
+    print(encoded_inputs)
+
+will return a dict with the values being list of lists of ints:
+
+::
+
+    {'input_ids': [[101, 8667, 146, 112, 182, 170, 1423, 5650, 102, 146, 112, 182, 170, 5650, 1115, 2947, 1114, 1103, 1148, 5650, 102], 
+                   [101, 1262, 1330, 5650, 102, 1262, 146, 1431, 1129, 12544, 1114, 1103, 1248, 5650, 102], 
+                   [101, 1262, 1103, 1304, 1304, 1314, 1141, 102, 1262, 146, 1301, 1114, 1103, 1304, 1314, 1141, 102]], 
+    'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 
+                       [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 
+                       [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 
+    'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 
+                       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 
+                       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}
+
+To double-check what is fed to the model, we can decode each list in `input_ids` one by one:
+
+::
+
+    for ids in encoded_inputs["input_ids"]:
+        print(tokenizer.decode(ids))
+
+which will return:
+
+::
+
+    [CLS] Hello I'm a single sentence [SEP] I'm a sentence that goes with the first sentence [SEP]
+    [CLS] And another sentence [SEP] And I should be encoded with the second sentence [SEP]
+    [CLS] And the very very last one [SEP] And I go with the very last one [SEP]
+
+Once again, you can automatically pad your inputs to the maximum sentence length in the batch, truncate to the maximum
+length the model can accept and return tensors directly with the following:
+
+::
+
+    ## PYTORCH CODE
+    batch = tokenizer(batch_sentences, batch_of_second_sentences, padding=True, truncation=True, return_tensors="pt")
+    ## TENSORFLOW CODE
+    batch = tokenizer(batch_sentences, batch_of_second_sentences, padding=True, truncation=True, return_tensors="tf")
+
+Everything you always wanted to know about padding and truncation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+We have seen the commands that will work for most cases (pad your batch to the length of the maximum sentence and
+truncate to the maximum length the model can accept). However, the API supports more strategies if you need them. The
+three arguments you need to know for this are :obj:`padding`, :obj:`truncation` and :obj:`max_length`.
+
+- :obj:`padding` controls the padding. It can be a boolean or a string which should be:
+
+    - :obj:`True` or :obj:`'longest'` to pad to the longest sequence in the batch (doing no padding if you only provide
+      a single sequence).
+    - :obj:`'max_length'` to pad to a length specified by the :obj:`max_length` argument or the maximum length accepted
+      by the model if no :obj:`max_length` is provided (``max_length=None``). If you only provide a single sequence,
+      padding will still be applied to it. 
+    - :obj:`False` or :obj:`'do_not_pad'` to not pad the sequences. As we have seen before, this is the default
+      behavior.
+
+- :obj:`truncation` controls the truncation. It can be a boolean or a string which should be:
+
+    - :obj:`True` or :obj:`'only_first'` truncate to a maximum length specified by the :obj:`max_length` argument or
+      the maximum length accepted by the model if no :obj:`max_length` is provided (``max_length=None``). This will
+      only truncate the first sentence of a pair if a pair of sequences (or a batch of pairs of sequences) is provided.
+    - :obj:`'only_second'` truncate to a maximum length specified by the :obj:`max_length` argument or the maximum
+      length accepted by the model if no :obj:`max_length` is provided (``max_length=None``). This will only truncate
+      the second sentence of a pair if a pair of sequences (or a batch of pairs of sequences) is provided.
+    - :obj:`'longest_first'` truncate to a maximum length specified by the :obj:`max_length` argument or the maximum
+      length accepted by the model if no :obj:`max_length` is provided (``max_length=None``). This will truncate token
+      by token, removing a token from the longest sequence in the pair until the proper length is reached.
+    - :obj:`False` or :obj:`'do_not_truncate'` to not truncate the sequences. As we have seen before, this is the
+      default behavior.
+
+- :obj:`max_length` to control the length of the padding/truncation. It can be an integer or :obj:`None`, in which case
+  it will default to the maximum length the model can accept. If the model has no specific maximum input length,
+  truncation/padding to :obj:`max_length` is deactivated.
+
+Here is a table summarizing the recommended way to set up padding and truncation. If you use pairs of input sequences
+in any of the following examples, you can replace :obj:`truncation=True` by a :obj:`STRATEGY` selected in
+:obj:`['only_first', 'only_second', 'longest_first']`, i.e. :obj:`truncation='only_second'` or
+:obj:`truncation='longest_first'` to control how both sequences in the pair are truncated as detailed before.
+
+--------------------------------------+-----------------------------------+---------------------------------------------------------------------------------------------+
+| Truncation                           | Padding                           | Instruction                                                                                 |
+======================================+===================================+=============================================================================================+
+| no truncation                        | no padding                        | :obj:`tokenizer(batch_sentences)`                                                           |
+|                                      +-----------------------------------+---------------------------------------------------------------------------------------------+
+|                                      | padding to max sequence in batch  | :obj:`tokenizer(batch_sentences, padding=True)` or                                          |
+|                                      |                                   | :obj:`tokenizer(batch_sentences, padding='longest')`                                        |
+|                                      +-----------------------------------+---------------------------------------------------------------------------------------------+
+|                                      | padding to max model input length | :obj:`tokenizer(batch_sentences, padding='max_length')`                                     |
+|                                      +-----------------------------------+---------------------------------------------------------------------------------------------+
+|                                      | padding to specific length        | :obj:`tokenizer(batch_sentences, padding='max_length', max_length=42)`                      |
+--------------------------------------+-----------------------------------+---------------------------------------------------------------------------------------------+
+| truncation to max model input length | no padding                        | :obj:`tokenizer(batch_sentences, truncation=True)` or                                       |
+|                                      |                                   | :obj:`tokenizer(batch_sentences, truncation=STRATEGY)`                                      |
+|                                      +-----------------------------------+---------------------------------------------------------------------------------------------+
+|                                      | padding to max sequence in batch  | :obj:`tokenizer(batch_sentences, padding=True, truncation=True)` or                         |
+|                                      |                                   | :obj:`tokenizer(batch_sentences, padding=True, truncation=STRATEGY)`                        |
+|                                      +-----------------------------------+---------------------------------------------------------------------------------------------+
+|                                      | padding to max model input length | :obj:`tokenizer(batch_sentences, padding='max_length', truncation=True)` or                 |
+|                                      |                                   | :obj:`tokenizer(batch_sentences, padding='max_length', truncation=STRATEGY)`                |
+|                                      +-----------------------------------+---------------------------------------------------------------------------------------------+
+|                                      | padding to specific length        | Not possible                                                                                |
+--------------------------------------+-----------------------------------+---------------------------------------------------------------------------------------------+
+| truncation to specific length        | no padding                        | :obj:`tokenizer(batch_sentences, truncation=True, max_length=42)` or                        |
+|                                      |                                   | :obj:`tokenizer(batch_sentences, truncation=STRATEGY, max_length=42)`                       |
+|                                      +-----------------------------------+---------------------------------------------------------------------------------------------+
+|                                      | padding to max sequence in batch  | :obj:`tokenizer(batch_sentences, padding=True, truncation=True, max_length=42)` or          |
+|                                      |                                   | :obj:`tokenizer(batch_sentences, padding=True, truncation=STRATEGY, max_length=42)`         |
+|                                      +-----------------------------------+---------------------------------------------------------------------------------------------+
+|                                      | padding to max model input length | Not possible                                                                                |
+|                                      +-----------------------------------+---------------------------------------------------------------------------------------------+
+|                                      | padding to specific length        | :obj:`tokenizer(batch_sentences, padding='max_length', truncation=True, max_length=42)` or  |
+|                                      |                                   | :obj:`tokenizer(batch_sentences, padding='max_length', truncation=STRATEGY, max_length=42)` |
+--------------------------------------+-----------------------------------+---------------------------------------------------------------------------------------------+
+
+Pre-tokenized inputs
+~~~~~~~~~~~~~~~~~~~~
+
+The tokenizer also accepts pre-tokenized inputs. This is particularly useful when you want to compute labels and extract
+predictions in `named entity recognition (NER) <https://en.wikipedia.org/wiki/Named-entity_recognition>`__ or
+`part-of-speech tagging (POS tagging) <https://en.wikipedia.org/wiki/Part-of-speech_tagging>`__.
+
+If you want to use pre-tokenized inputs, just set :obj:`is_pretokenized=True` when passing your inputs to the
+tokenizer. For instance:
+
+::
+
+    encoded_input = tokenizer(["Hello", "I'm", "a", "single", "sentence"], is_pretokenized=True)
+    print(encoded_input)
+
+will return:
+
+::
+
+    {'input_ids': [101, 8667, 146, 112, 182, 170, 1423, 5650, 102],
+     'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 
+     'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}
+
+Note that the tokenizer still adds the ids of special tokens (if applicable) unless you pass
+``add_special_tokens=False``.
+
+This works exactly as before for batches of sentences or batches of pairs of sentences. You can encode a batch of sentences
+like this:
+
+::
+
+    batch_sentences = [["Hello", "I'm", "a", "single", "sentence"],
+                       ["And", "another", "sentence"],
+                       ["And", "the", "very", "very", "last", "one"]]
+    encoded_inputs = tokenizer(batch_sentences, is_pretokenized=True)
+
+or a batch of sentence pairs like this:
+
+::
+
+    batch_of_second_sentences = [["I'm", "a", "sentence", "that", "goes", "with", "the", "first", "sentence"],
+                                 ["And", "I", "should", "be", "encoded", "with", "the", "second", "sentence"],
+                                 ["And", "I", "go", "with", "the", "very", "last", "one"]]
+    encoded_inputs = tokenizer(batch_sentences, batch_of_second_sentences, is_pretokenized=True)
+
+And you can add padding, truncation as well as directly return tensors like before:
+
+::
+
+    ## PYTORCH CODE
+    batch = tokenizer(batch_sentences,
+                      batch_of_second_sentences,
+                      is_pretokenized=True,
+                      padding=True,
+                      truncation=True,
+                      return_tensors="pt")
+    ## TENSORFLOW CODE
+    batch = tokenizer(batch_sentences,
+                      batch_of_second_sentences,
+                      is_pretokenized=True,
+                      padding=True,
+                      truncation=True,
+                      return_tensors="tf")
--- a/docs/source/pretrained_models.rst
+++ b/docs/source/pretrained_models.rst
@@ -22,10 +22,12 @@ For a list that includes community-uploaded models, refer to `https://huggingfac
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``bert-base-multilingual-uncased``                         | | (Original, not recommended) 12-layer, 768-hidden, 12-heads, 110M parameters.                                                        |
 |                   |                                                            | | Trained on lower-cased text in the top 102 languages with the largest Wikipedias                                                    |
+|                   |                                                            |                                                                                                                                       |
 |                   |                                                            | (see `details <https://github.com/google-research/bert/blob/master/multilingual.md>`__).                                              |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``bert-base-multilingual-cased``                           | | (New, **recommended**) 12-layer, 768-hidden, 12-heads, 110M parameters.                                                             |
 |                   |                                                            | | Trained on cased text in the top 104 languages with the largest Wikipedias                                                          |
+|                   |                                                            |                                                                                                                                       |
 |                   |                                                            | (see `details <https://github.com/google-research/bert/blob/master/multilingual.md>`__).                                              |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``bert-base-chinese``                                      | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
@@ -33,64 +35,79 @@ For a list that includes community-uploaded models, refer to `https://huggingfac
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``bert-base-german-cased``                                 | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
 |                   |                                                            | | Trained on cased German text by Deepset.ai                                                                                          |
+|                   |                                                            |                                                                                                                                       |
 |                   |                                                            | (see `details on deepset.ai website <https://deepset.ai/german-bert>`__).                                                             |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``bert-large-uncased-whole-word-masking``                  | | 24-layer, 1024-hidden, 16-heads, 340M parameters.                                                                                   |
 |                   |                                                            | | Trained on lower-cased English text using Whole-Word-Masking                                                                        |
+|                   |                                                            |                                                                                                                                       |
 |                   |                                                            | (see `details <https://github.com/google-research/bert/#bert>`__).                                                                    |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``bert-large-cased-whole-word-masking``                    | | 24-layer, 1024-hidden, 16-heads, 340M parameters.                                                                                   |
 |                   |                                                            | | Trained on cased English text using Whole-Word-Masking                                                                              |
+|                   |                                                            |                                                                                                                                       |
 |                   |                                                            | (see `details <https://github.com/google-research/bert/#bert>`__).                                                                    |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``bert-large-uncased-whole-word-masking-finetuned-squad``  | | 24-layer, 1024-hidden, 16-heads, 340M parameters.                                                                                   |
 |                   |                                                            | | The ``bert-large-uncased-whole-word-masking`` model fine-tuned on SQuAD                                                             |
+|                   |                                                            |                                                                                                                                       |
 |                   |                                                            | (see details of fine-tuning in the `example section <https://github.com/huggingface/transformers/tree/master/examples>`__).           |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``bert-large-cased-whole-word-masking-finetuned-squad``    | | 24-layer, 1024-hidden, 16-heads, 340M parameters                                                                                    |
 |                   |                                                            | | The ``bert-large-cased-whole-word-masking`` model fine-tuned on SQuAD                                                               |
+|                   |                                                            |                                                                                                                                       |
 |                   |                                                            | (see `details of fine-tuning in the example section <https://huggingface.co/transformers/examples.html>`__)                           |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``bert-base-cased-finetuned-mrpc``                         | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
 |                   |                                                            | | The ``bert-base-cased`` model fine-tuned on MRPC                                                                                    |
+|                   |                                                            |                                                                                                                                       |
 |                   |                                                            | (see `details of fine-tuning in the example section <https://huggingface.co/transformers/examples.html>`__)                           |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``bert-base-german-dbmdz-cased``                           | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
 |                   |                                                            | | Trained on cased German text by DBMDZ                                                                                               |
+|                   |                                                            |                                                                                                                                       |
 |                   |                                                            | (see `details on dbmdz repository <https://github.com/dbmdz/german-bert>`__).                                                         |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``bert-base-german-dbmdz-uncased``                         | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
 |                   |                                                            | | Trained on uncased German text by DBMDZ                                                                                             |
+|                   |                                                            |                                                                                                                                       |
 |                   |                                                            | (see `details on dbmdz repository <https://github.com/dbmdz/german-bert>`__).                                                         |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-base-japanese``                                     | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                   | ``cl-tohoku/bert-base-japanese``                           | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
 |                   |                                                            | | Trained on Japanese text. Text is tokenized with MeCab and WordPiece.                                                               |
 |                   |                                                            | | `MeCab <https://taku910.github.io/mecab/>`__ is required for tokenization.                                                          |
+|                   |                                                            |                                                                                                                                       |
 |                   |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                               |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-base-japanese-whole-word-masking``                  | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                   | ``cl-tohoku/bert-base-japanese-whole-word-masking``        | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
 |                   |                                                            | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized with MeCab and WordPiece.                                      |
 |                   |                                                            | | `MeCab <https://taku910.github.io/mecab/>`__ is required for tokenization.                                                          |
+|                   |                                                            |                                                                                                                                       |
 |                   |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                               |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-base-japanese-char``                                | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                   | ``cl-tohoku/bert-base-japanese-char``                      | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
 |                   |                                                            | | Trained on Japanese text. Text is tokenized into characters.                                                                        |
+|                   |                                                            |                                                                                                                                       |
 |                   |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                               |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-base-japanese-char-whole-word-masking``             | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                   | ``cl-tohoku/bert-base-japanese-char-whole-word-masking``   | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
 |                   |                                                            | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized into characters.                                               |
+|                   |                                                            |                                                                                                                                       |
 |                   |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                               |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-base-finnish-cased-v1``                             | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                   | ``TurkuNLP/bert-base-finnish-cased-v1``                    | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
 |                   |                                                            | | Trained on cased Finnish text.                                                                                                      |
+|                   |                                                            |                                                                                                                                       |
 |                   |                                                            | (see `details on turkunlp.org <http://turkunlp.org/FinBERT/>`__).                                                                     |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-base-finnish-uncased-v1``                           | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                   | ``TurkuNLP/bert-base-finnish-uncased-v1``                  | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
 |                   |                                                            | | Trained on uncased Finnish text.                                                                                                    |
+|                   |                                                            |                                                                                                                                       |
 |                   |                                                            | (see `details on turkunlp.org <http://turkunlp.org/FinBERT/>`__).                                                                     |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-base-dutch-cased``                                  | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                   | ``wietsedv/bert-base-dutch-cased``                         | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
 |                   |                                                            | | Trained on cased Dutch text.                                                                                                        |
+|                   |                                                            |                                                                                                                                       |
 |                   |                                                            | (see `details on wietsedv repository <https://github.com/wietsedv/bertje/>`__).                                                       |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 | GPT               | ``openai-gpt``                                             | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
@@ -149,54 +166,67 @@ For a list that includes community-uploaded models, refer to `https://huggingfac
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 | RoBERTa           | ``roberta-base``                                           | | 12-layer, 768-hidden, 12-heads, 125M parameters                                                                                     |
 |                   |                                                            | | RoBERTa using the BERT-base architecture                                                                                            |
+|                   |                                                            |                                                                                                                                       |
 |                   |                                                            | (see `details <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`__)                                                   |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``roberta-large``                                          | | 24-layer, 1024-hidden, 16-heads, 355M parameters                                                                                    |
 |                   |                                                            | | RoBERTa using the BERT-large architecture                                                                                           |
+|                   |                                                            |                                                                                                                                       |
 |                   |                                                            | (see `details <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`__)                                                   |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``roberta-large-mnli``                                     | | 24-layer, 1024-hidden, 16-heads, 355M parameters                                                                                    |
 |                   |                                                            | | ``roberta-large`` fine-tuned on `MNLI <http://www.nyu.edu/projects/bowman/multinli/>`__.                                            |
+|                   |                                                            |                                                                                                                                       |
 |                   |                                                            | (see `details <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`__)                                                   |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``distilroberta-base``                                     | | 6-layer, 768-hidden, 12-heads, 82M parameters                                                                                       |
 |                   |                                                            | | The DistilRoBERTa model distilled from the RoBERTa model ``roberta-base`` checkpoint.                                               |
+|                   |                                                            |                                                                                                                                       |
 |                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``roberta-base-openai-detector``                           | | 12-layer, 768-hidden, 12-heads, 125M parameters                                                                                     |
 |                   |                                                            | | ``roberta-base`` fine-tuned by OpenAI on the outputs of the 1.5B-parameter GPT-2 model.                                             |
+|                   |                                                            |                                                                                                                                       |
 |                   |                                                            | (see `details <https://github.com/openai/gpt-2-output-dataset/tree/master/detector>`__)                                               |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``roberta-large-openai-detector``                          | | 24-layer, 1024-hidden, 16-heads, 355M parameters                                                                                    |
 |                   |                                                            | | ``roberta-large`` fine-tuned by OpenAI on the outputs of the 1.5B-parameter GPT-2 model.                                            |
+|                   |                                                            |                                                                                                                                       |
 |                   |                                                            | (see `details <https://github.com/openai/gpt-2-output-dataset/tree/master/detector>`__)                                               |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 | DistilBERT        | ``distilbert-base-uncased``                                | | 6-layer, 768-hidden, 12-heads, 66M parameters                                                                                       |
 |                   |                                                            | | The DistilBERT model distilled from the BERT model ``bert-base-uncased`` checkpoint                                                 |
+|                   |                                                            |                                                                                                                                       |
 |                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``distilbert-base-uncased-distilled-squad``                | | 6-layer, 768-hidden, 12-heads, 66M parameters                                                                                       |
 |                   |                                                            | | The DistilBERT model distilled from the BERT model ``bert-base-uncased`` checkpoint, with an additional linear layer.               |
+|                   |                                                            |                                                                                                                                       |
 |                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``distilbert-base-cased``                                  | | 6-layer, 768-hidden, 12-heads, 65M parameters                                                                                       |
 |                   |                                                            | | The DistilBERT model distilled from the BERT model ``bert-base-cased`` checkpoint                                                   |
+|                   |                                                            |                                                                                                                                       |
 |                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``distilbert-base-cased-distilled-squad``                  | | 6-layer, 768-hidden, 12-heads, 65M parameters                                                                                       |
 |                   |                                                            | | The DistilBERT model distilled from the BERT model ``bert-base-cased`` checkpoint, with an additional question answering layer.     |
+|                   |                                                            |                                                                                                                                       |
 |                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``distilgpt2``                                             | | 6-layer, 768-hidden, 12-heads, 82M parameters                                                                                       |
 |                   |                                                            | | The DistilGPT2 model distilled from the GPT2 model ``gpt2`` checkpoint.                                                             |
+|                   |                                                            |                                                                                                                                       |
 |                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``distilbert-base-german-cased``                           | | 6-layer, 768-hidden, 12-heads, 66M parameters                                                                                       |
 |                   |                                                            | | The German DistilBERT model distilled from the German DBMDZ BERT model ``bert-base-german-dbmdz-cased`` checkpoint.                 |
+|                   |                                                            |                                                                                                                                       |
 |                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``distilbert-base-multilingual-cased``                     | | 6-layer, 768-hidden, 12-heads, 134M parameters                                                                                      |
 |                   |                                                            | | The multilingual DistilBERT model distilled from the Multilingual BERT model ``bert-base-multilingual-cased`` checkpoint.           |
+|                   |                                                            |                                                                                                                                       |
 |                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 | CTRL              | ``ctrl``                                                   | | 48-layer, 1280-hidden, 16-heads, 1.6B parameters                                                                                    |
@@ -204,38 +234,47 @@ For a list that includes community-uploaded models, refer to `https://huggingfac
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 | CamemBERT         | ``camembert-base``                                         | | 12-layer, 768-hidden, 12-heads, 110M parameters                                                                                     |
 |                   |                                                            | | CamemBERT using the BERT-base architecture                                                                                          |
+|                   |                                                            |                                                                                                                                       |
 |                   |                                                            | (see `details <https://github.com/pytorch/fairseq/tree/master/examples/camembert>`__)                                                 |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 | ALBERT            | ``albert-base-v1``                                         | | 12 repeating layers, 128 embedding, 768-hidden, 12-heads, 11M parameters                                                            |
 |                   |                                                            | | ALBERT base model                                                                                                                   |
+|                   |                                                            |                                                                                                                                       |
 |                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``albert-large-v1``                                        | | 24 repeating layers, 128 embedding, 1024-hidden, 16-heads, 17M parameters                                                           |
 |                   |                                                            | | ALBERT large model                                                                                                                  |
+|                   |                                                            |                                                                                                                                       |
 |                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``albert-xlarge-v1``                                       | | 24 repeating layers, 128 embedding, 2048-hidden, 16-heads, 58M parameters                                                           |
 |                   |                                                            | | ALBERT xlarge model                                                                                                                 |
+|                   |                                                            |                                                                                                                                       |
 |                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``albert-xxlarge-v1``                                      | | 12 repeating layers, 128 embedding, 4096-hidden, 64-heads, 223M parameters                                                          |
 |                   |                                                            | | ALBERT xxlarge model                                                                                                                |
+|                   |                                                            |                                                                                                                                       |
 |                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``albert-base-v2``                                         | | 12 repeating layers, 128 embedding, 768-hidden, 12-heads, 11M parameters                                                            |
 |                   |                                                            | | ALBERT base model with no dropout, additional training data and longer training                                                     |
+|                   |                                                            |                                                                                                                                       |
 |                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``albert-large-v2``                                        | | 24 repeating layers, 128 embedding, 1024-hidden, 16-heads, 17M parameters                                                           |
 |                   |                                                            | | ALBERT large model with no dropout, additional training data and longer training                                                    |
+|                   |                                                            |                                                                                                                                       |
 |                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``albert-xlarge-v2``                                       | | 24 repeating layers, 128 embedding, 2048-hidden, 16-heads, 58M parameters                                                           |
 |                   |                                                            | | ALBERT xlarge model with no dropout, additional training data and longer training                                                   |
+|                   |                                                            |                                                                                                                                       |
 |                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``albert-xxlarge-v2``                                      | | 12 repeating layers, 128 embedding, 4096-hidden, 64-heads, 223M parameters                                                          |
 |                   |                                                            | | ALBERT xxlarge model with no dropout, additional training data and longer training                                                  |
+|                   |                                                            |                                                                                                                                       |
 |                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 | T5                | ``t5-small``                                               | | ~60M parameters with 6-layers, 512-hidden-state, 2048 feed-forward hidden-state, 8-heads,                                           |
@@ -259,28 +298,62 @@ For a list that includes community-uploaded models, refer to `https://huggingfac
 |                   | ``xlm-roberta-large``                                      | | ~355M parameters with 24-layers, 1024-hidden-state, 4096 feed-forward hidden-state, 16-heads,                                       |
 |                   |                                                            | | Trained on 2.5 TB of newly created clean CommonCrawl data in 100 languages                                                          |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| FlauBERT          | ``flaubert-small-cased``                                   | | 6-layer, 512-hidden, 8-heads, 54M parameters                                                                                        |
+| FlauBERT          | ``flaubert/flaubert_small_cased``                          | | 6-layer, 512-hidden, 8-heads, 54M parameters                                                                                        |
 |                   |                                                            | | FlauBERT small architecture                                                                                                         |
+|                   |                                                            |                                                                                                                                       |
 |                   |                                                            | (see `details <https://github.com/getalp/Flaubert>`__)                                                                                |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``flaubert-base-uncased``                                  | | 12-layer, 768-hidden, 12-heads, 137M parameters                                                                                     |
+|                   | ``flaubert/flaubert_base_uncased``                         | | 12-layer, 768-hidden, 12-heads, 137M parameters                                                                                     |
 |                   |                                                            | | FlauBERT base architecture with uncased vocabulary                                                                                  |
+|                   |                                                            |                                                                                                                                       |
 |                   |                                                            | (see `details <https://github.com/getalp/Flaubert>`__)                                                                                |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``flaubert-base-cased``                                    | | 12-layer, 768-hidden, 12-heads, 138M parameters                                                                                     |
+|                   | ``flaubert/flaubert_base_cased``                           | | 12-layer, 768-hidden, 12-heads, 138M parameters                                                                                     |
 |                   |                                                            | | FlauBERT base architecture with cased vocabulary                                                                                    |
+|                   |                                                            |                                                                                                                                       |
 |                   |                                                            | (see `details <https://github.com/getalp/Flaubert>`__)                                                                                |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``flaubert-large-cased``                                   | | 24-layer, 1024-hidden, 16-heads, 373M parameters                                                                                    |
+|                   | ``flaubert/flaubert_large_cased``                          | | 24-layer, 1024-hidden, 16-heads, 373M parameters                                                                                    |
 |                   |                                                            | | FlauBERT large architecture                                                                                                         |
+|                   |                                                            |                                                                                                                                       |
 |                   |                                                            | (see `details <https://github.com/getalp/Flaubert>`__)                                                                                |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| Bart              | ``bart-large``                                             | | 12-layer, 1024-hidden, 16-heads, 406M parameters                                                                                    |
+| Bart              | ``facebook/bart-large``                                    | | 24-layer, 1024-hidden, 16-heads, 406M parameters                                                                                    |
+|                   |                                                            |                                                                                                                                       |
 |                   |                                                            | (see `details <https://github.com/pytorch/fairseq/tree/master/examples/bart>`_)                                                       |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bart-large-mnli``                                        | | Adds a 2 layer classification head with 1 million parameters                                                                        |
+|                   | ``facebook/bart-base``                                     | | 12-layer, 768-hidden, 16-heads, 139M parameters                                                                                     |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``facebook/bart-large-mnli``                               | | Adds a 2 layer classification head with 1 million parameters                                                                        |
 |                   |                                                            | | bart-large base architecture with a classification head, finetuned on MNLI                                                          |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bart-large-cnn``                                         | | 12-layer, 1024-hidden, 16-heads, 406M parameters       (same as base)                                                               |
+|                   | ``facebook/bart-large-cnn``                                | | 24-layer, 1024-hidden, 16-heads, 406M parameters       (same as large)                                                              |
 |                   |                                                            | | bart-large base architecture finetuned on cnn summarization task                                                                    |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``facebook/mbart-large-en-ro``                             | | 12-layer, 1024-hidden, 16-heads, 880M parameters                                                                                    |
+|                   |                                                            | | bart-large architecture pretrained on cc25 multilingual data, finetuned on WMT English-Romanian translation.                        |
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+| DialoGPT          | ``DialoGPT-small``                                         | | 12-layer, 768-hidden, 12-heads, 124M parameters                                                                                     |
+|                   |                                                            | | Trained on English text: 147M conversation-like exchanges extracted from Reddit.                                                    |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``DialoGPT-medium``                                        | | 24-layer, 1024-hidden, 16-heads, 355M parameters                                                                                    |
+|                   |                                                            | | Trained on English text: 147M conversation-like exchanges extracted from Reddit.                                                    |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``DialoGPT-large``                                         | | 36-layer, 1280-hidden, 20-heads, 774M parameters                                                                                    |
+|                   |                                                            | | Trained on English text: 147M conversation-like exchanges extracted from Reddit.                                                    |
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+| Reformer          | ``reformer-enwik8``                                        | | 12-layer, 1024-hidden, 8-heads, 149M parameters                                                                                     |
+|                   |                                                            | | Trained on English Wikipedia data - enwik8.                                                                                         |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``reformer-crime-and-punishment``                          | | 6-layer, 256-hidden, 2-heads, 3M parameters                                                                                         |
+|                   |                                                            | | Trained on English text: Crime and Punishment novel by Fyodor Dostoyevsky.                                                          |
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+| MarianMT          | ``Helsinki-NLP/opus-mt-{src}-{tgt}``                       | | 12-layer, 512-hidden, 8-heads, ~74M parameter Machine translation models. Parameter counts vary depending on vocab size.            |
+|                   |                                                            | | (see `model list <https://huggingface.co/Helsinki-NLP>`_)                                                                           |
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+| Longformer        | ``allenai/longformer-base-4096``                           | | 12-layer, 768-hidden, 12-heads, ~149M parameters                                                                                    |
+|                   |                                                            | | Starting from RoBERTa-base checkpoint, trained on documents of max length 4,096                                                     |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``allenai/longformer-large-4096``                          | | 24-layer, 1024-hidden, 16-heads, ~435M parameters                                                                                   |
+|                   |                                                            | | Starting from RoBERTa-large checkpoint, trained on documents of max length 4,096                                                    |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
--- a/docs/source/quickstart.md
+++ b/docs/source/quickstart.md
@@ -1,222 +0,0 @@
-# Quickstart
-
-## Philosophy
-
-Transformers is an opinionated library built for NLP researchers seeking to use/study/extend large-scale transformers models.
-
-The library was designed with two strong goals in mind:
-
- be as easy and fast to use as possible:
-
-  - we strongly limited the number of user-facing abstractions to learn, in fact there are almost no abstractions, just three standard classes required to use each model: configuration, models and tokenizer,
-  - all of these classes can be initialized in a simple and unified way from pretrained instances by using a common `from_pretrained()` instantiation method which will take care of downloading (if needed), caching and loading the related class from a pretrained instance supplied in the library or your own saved instance.
-  - as a consequence, this library is NOT a modular toolbox of building blocks for neural nets. If you want to extend/build-upon the library, just use regular Python/PyTorch modules and inherit from the base classes of the library to reuse functionalities like model loading/saving.
-
- provide state-of-the-art models with performances as close as possible to the original models:
-
-  - we provide at least one example for each architecture which reproduces a result provided by the official authors of said architecture,
-  - the code is usually as close to the original code base as possible which means some PyTorch code may be not as *pytorchic* as it could be as a result of being converted TensorFlow code.
-
-A few other goals:
-
- expose the models' internals as consistently as possible:
-
-  - we give access, using a single API to the full hidden-states and attention weights,
-  - tokenizer and base model's API are standardized to easily switch between models.
-
- incorporate a subjective selection of promising tools for fine-tuning/investigating these models:
-
-  - a simple/consistent way to add new tokens to the vocabulary and embeddings for fine-tuning,
-  - simple ways to mask and prune transformer heads.
-
-## Main concepts
-
-The library is build around three type of classes for each models:
-
- **model classes** which are PyTorch models (`torch.nn.Modules`) of the 8 models architectures currently provided in the library, e.g. `BertModel`
- **configuration classes** which store all the parameters required to build a model, e.g. `BertConfig`. You don't always need to instantiate these your-self, in particular if you are using a pretrained model without any modification, creating the model will automatically take care of instantiating the configuration (which is part of the model)
- **tokenizer classes** which store the vocabulary for each model and provide methods for encoding/decoding strings in list of token embeddings indices to be fed to a model, e.g. `BertTokenizer`
-
-All these classes can be instantiated from pretrained instances and saved locally using two methods:
-
- `from_pretrained()` let you instantiate a model/configuration/tokenizer from a pretrained version either provided by the library itself (currently 27 models are provided as listed [here](https://huggingface.co/transformers/pretrained_models.html)) or stored locally (or on a server) by the user,
- `save_pretrained()` let you save a model/configuration/tokenizer locally so that it can be reloaded using `from_pretrained()`.
-
-We'll finish this quickstart tour by going through a few simple quick-start examples to see how we can instantiate and use these classes. The rest of the documentation is organized in two parts:
-
- the **MAIN CLASSES** section details the common functionalities/method/attributes of the three main type of classes (configuration, model, tokenizer) plus some optimization related classes provided as utilities for training,
- the **PACKAGE REFERENCE** section details all the variants of each class for each model architectures and in particular the input/output that you should expect when calling each of them.
-
-## Quick tour: Usage
-
-Here are two examples showcasing a few `Bert` and `GPT2` classes and pre-trained models.
-
-See full API reference for examples for each model class.
-
-### BERT example
-
-Let's start by preparing a tokenized input (a list of token embeddings indices to be fed to Bert) from a text string using `BertTokenizer`
-
-```python
-import torch
-from transformers import BertTokenizer, BertModel, BertForMaskedLM
-
-# OPTIONAL: if you want to have more information on what's happening under the hood, activate the logger as follows
-import logging
-logging.basicConfig(level=logging.INFO)
-
-# Load pre-trained model tokenizer (vocabulary)
-tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-
-# Tokenize input
-text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
-tokenized_text = tokenizer.tokenize(text)
-
-# Mask a token that we will try to predict back with `BertForMaskedLM`
-masked_index = 8
-tokenized_text[masked_index] = '[MASK]'
-assert tokenized_text == ['[CLS]', 'who', 'was', 'jim', 'henson', '?', '[SEP]', 'jim', '[MASK]', 'was', 'a', 'puppet', '##eer', '[SEP]']
-
-# Convert token to vocabulary indices
-indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
-# Define sentence A and B indices associated to 1st and 2nd sentences (see paper)
-segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
-
-# Convert inputs to PyTorch tensors
-tokens_tensor = torch.tensor([indexed_tokens])
-segments_tensors = torch.tensor([segments_ids])
-```
-
-Let's see how we can use `BertModel` to encode our inputs in hidden-states:
-
-```python
-# Load pre-trained model (weights)
-model = BertModel.from_pretrained('bert-base-uncased')
-
-# Set the model in evaluation mode to deactivate the DropOut modules
-# This is IMPORTANT to have reproducible results during evaluation!
-model.eval()
-
-# If you have a GPU, put everything on cuda
-tokens_tensor = tokens_tensor.to('cuda')
-segments_tensors = segments_tensors.to('cuda')
-model.to('cuda')
-
-# Predict hidden states features for each layer
-with torch.no_grad():
-    # See the models docstrings for the detail of the inputs
-    outputs = model(tokens_tensor, token_type_ids=segments_tensors)
-    # Transformers models always output tuples.
-    # See the models docstrings for the detail of all the outputs
-    # In our case, the first element is the hidden state of the last layer of the Bert model
-    encoded_layers = outputs[0]
-# We have encoded our input sequence in a FloatTensor of shape (batch size, sequence length, model hidden dimension)
-assert tuple(encoded_layers.shape) == (1, len(indexed_tokens), model.config.hidden_size)
-```
-
-And how to use `BertForMaskedLM` to predict a masked token:
-
-```python
-# Load pre-trained model (weights)
-model = BertForMaskedLM.from_pretrained('bert-base-uncased')
-model.eval()
-
-# If you have a GPU, put everything on cuda
-tokens_tensor = tokens_tensor.to('cuda')
-segments_tensors = segments_tensors.to('cuda')
-model.to('cuda')
-
-# Predict all tokens
-with torch.no_grad():
-    outputs = model(tokens_tensor, token_type_ids=segments_tensors)
-    predictions = outputs[0]
-
-# confirm we were able to predict 'henson'
-predicted_index = torch.argmax(predictions[0, masked_index]).item()
-predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
-assert predicted_token == 'henson'
-```
-
-### OpenAI GPT-2
-
-Here is a quick-start example using `GPT2Tokenizer` and `GPT2LMHeadModel` class with OpenAI's pre-trained model to predict the next token from a text prompt.
-
-First let's prepare a tokenized input from our text string using `GPT2Tokenizer`
-
-```python
-import torch
-from transformers import GPT2Tokenizer, GPT2LMHeadModel
-
-# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
-import logging
-logging.basicConfig(level=logging.INFO)
-
-# Load pre-trained model tokenizer (vocabulary)
-tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-
-# Encode a text inputs
-text = "Who was Jim Henson ? Jim Henson was a"
-indexed_tokens = tokenizer.encode(text)
-
-# Convert indexed tokens in a PyTorch tensor
-tokens_tensor = torch.tensor([indexed_tokens])
-```
-
-Let's see how to use `GPT2LMHeadModel` to generate the next token following our text:
-
-```python
-# Load pre-trained model (weights)
-model = GPT2LMHeadModel.from_pretrained('gpt2')
-
-# Set the model in evaluation mode to deactivate the DropOut modules
-# This is IMPORTANT to have reproducible results during evaluation!
-model.eval()
-
-# If you have a GPU, put everything on cuda
-tokens_tensor = tokens_tensor.to('cuda')
-model.to('cuda')
-
-# Predict all tokens
-with torch.no_grad():
-    outputs = model(tokens_tensor)
-    predictions = outputs[0]
-
-# get the predicted next sub-word (in our case, the word 'man')
-predicted_index = torch.argmax(predictions[0, -1, :]).item()
-predicted_text = tokenizer.decode(indexed_tokens + [predicted_index])
-assert predicted_text == 'Who was Jim Henson? Jim Henson was a man'
-```
-
-Examples for each model class of each model architecture (Bert, GPT, GPT-2, Transformer-XL, XLNet and XLM) can be found in the [documentation](#documentation).
-
-#### Using the past
-
-GPT-2 as well as some other models (GPT, XLNet, Transfo-XL, CTRL) make use of a `past` or `mems` attribute which can be used to prevent re-computing the key/value pairs when using sequential decoding. It is useful when generating sequences as a big part of the attention mechanism benefits from previous computations.
-
-Here is a fully-working example using the `past` with `GPT2LMHeadModel` and argmax decoding (which should only be used as an example, as argmax decoding introduces a lot of repetition):
-
-```python
-from transformers import GPT2LMHeadModel, GPT2Tokenizer
-import torch
-
-tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
-model = GPT2LMHeadModel.from_pretrained('gpt2')
-
-generated = tokenizer.encode("The Manhattan bridge")
-context = torch.tensor([generated])
-past = None
-
-for i in range(100):
-    print(i)
-    output, past = model(context, past=past)
-    token = torch.argmax(output[..., -1, :])
-
-    generated += [token.tolist()]
-    context = token.unsqueeze(0)
-
-sequence = tokenizer.decode(generated)
-
-print(sequence)
-```
-
-The model only requires a single token as input as all the previous tokens' key/value pairs are contained in the `past`.
--- a/docs/source/quicktour.rst
+++ b/docs/source/quicktour.rst
@@ -0,0 +1,393 @@
+Quick tour
+==========
+
+Let's have a quick look at the 🤗 Transformers library features. The library downloads pretrained models for
+Natural Language Understanding (NLU) tasks, such as analyzing the sentiment of a text, and Natural Language Generation (NLG),
+such as completing a prompt with new text or translating into another language.
+
+First we will see how to easily leverage the pipeline API to quickly use those pretrained models at inference. Then, we
+will dig a little bit more and see how the library gives you access to those models and helps you preprocess your data.
+
+.. note::
+
+    All code examples presented in the documentation have a switch on the top left for PyTorch versus TensorFlow. If
+    not, the code is expected to work for both backends without any change needed.
+
+Getting started on a task with a pipeline
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The easiest way to use a pretrained model on a given task is to use :func:`~transformers.pipeline`. 🤗 Transformers
+provides the following tasks out of the box:
+
+- Sentiment analysis: is a text positive or negative?
+- Text generation (in English): provide a prompt and the model will generate what follows.
+- Named entity recognition (NER): in an input sentence, label each word with the entity it represents (person, place,
+  etc.)
+- Question answering: provide the model with some context and a question, extract the answer from the context.
+- Filling masked text: given a text with masked words (e.g., replaced by ``[MASK]``), fill the blanks.
+- Summarization: generate a summary of a long text.
+- Translation: translate a text into another language.
+- Feature extraction: return a tensor representation of the text.
+
+Let's see how this works for sentiment analysis (the other tasks are all covered in the
+:doc:`task summary </task_summary>`):
+
+.. code-block::
+
+    >>> from transformers import pipeline
+    >>> classifier = pipeline('sentiment-analysis')
+
+When typing this command for the first time, a pretrained model and its tokenizer are downloaded and cached. We will
+look at both later on, but as an introduction the tokenizer's job is to preprocess the text for the model, which is
+then responsible for making predictions. The pipeline groups all of that together and post-processes the predictions to
+make them readable. For instance:
+
+
+.. code-block::
+
+    >>> classifier('We are very happy to show you the 🤗 Transformers library.')
+    [{'label': 'POSITIVE', 'score': 0.9997795224189758}]
+
+That's encouraging! You can use it on a list of sentences, which will be preprocessed then fed to the model as a
+`batch`, returning a list of dictionaries like this one:
+
+.. code-block::
+
+    >>> results = classifier(["We are very happy to show you the 🤗 Transformers library.",
+    ...            "We hope you don't hate it."])
+    >>> for result in results:
+    ...     print(f"label: {result['label']}, with score: {round(result['score'], 4)}")
+    label: POSITIVE, with score: 0.9998
+    label: NEGATIVE, with score: 0.5309
+
+You can see the second sentence has been classified as negative (it needs to be positive or negative) but its score is
+fairly neutral.
+
+By default, the model downloaded for this pipeline is called "distilbert-base-uncased-finetuned-sst-2-english". We can
+look at its `model page <https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english>`__ to get more
+information about it. It uses the :doc:`DistilBERT architecture </model_doc/distilbert>` and has been fine-tuned on a
+dataset called SST-2 for the sentiment analysis task.
+
+Let's say we want to use another model; for instance, one that has been trained on French data. We can search through
+the `model hub <https://huggingface.co/models>`__ that gathers models pretrained on a lot of data by research labs, but
+also community models (usually fine-tuned versions of those big models on a specific dataset). Applying the tags
+"French" and "text-classification" gives back a suggestion "nlptown/bert-base-multilingual-uncased-sentiment". Let's
+see how we can use it.
+
+You can directly pass the name of the model to use to :func:`~transformers.pipeline`:
+
+.. code-block::
+
+    >>> classifier = pipeline('sentiment-analysis', model="nlptown/bert-base-multilingual-uncased-sentiment")
+
+This classifier can now deal with texts in English, French, but also Dutch, German, Italian and Spanish! You can also
+replace that name by a local folder where you have saved a pretrained model (see below). You can also pass a model
+object and its associated tokenizer.
+
+We will need two classes for this. The first is :class:`~transformers.AutoTokenizer`, which we will use to download the
+tokenizer associated with the model we picked and instantiate it. The second is
+:class:`~transformers.AutoModelForSequenceClassification` (or
+:class:`~transformers.TFAutoModelForSequenceClassification` if you are using TensorFlow), which we will use to download
+the model itself. Note that if we were using the library on another task, the class of the model would change. The
+:doc:`task summary </task_summary>` tutorial summarizes which class is used for which task.
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> from transformers import AutoTokenizer, AutoModelForSequenceClassification
+    >>> ## TENSORFLOW CODE
+    >>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
+
+Now, to download the models and tokenizer we found previously, we just have to use the
+:func:`~transformers.AutoModelForSequenceClassification.from_pretrained` method (feel free to replace ``model_name`` by
+any other model from the model hub):
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
+    >>> model = AutoModelForSequenceClassification.from_pretrained(model_name)
+    >>> tokenizer = AutoTokenizer.from_pretrained(model_name)
+    >>> classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)
+    >>> ## TENSORFLOW CODE
+    >>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
+    >>> # This model only exists in PyTorch, so we use the `from_pt` flag to import that model in TensorFlow.
+    >>> model = TFAutoModelForSequenceClassification.from_pretrained(model_name, from_pt=True) 
+    >>> tokenizer = AutoTokenizer.from_pretrained(model_name)
+    >>> classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)
+
+If you don't find a model that has been pretrained on some data similar to yours, you will need to fine-tune a
+pretrained model on your data. We provide :doc:`example scripts </examples>` to do so. Once you're done, don't forget
+to share your fine-tuned model on the hub with the community, using :doc:`this tutorial </model_sharing>`.
+
+.. _pretrained-model:
+
+Under the hood: pretrained models
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Let's now see what happens beneath the hood when using those pipelines. As we saw, the model and tokenizer are created
+using the :obj:`from_pretrained` method:
+
+::
+
+    >>> ## PYTORCH CODE
+    >>> from transformers import AutoTokenizer, AutoModelForSequenceClassification
+    >>> model_name = "distilbert-base-uncased-finetuned-sst-2-english"
+    >>> pt_model = AutoModelForSequenceClassification.from_pretrained(model_name)
+    >>> tokenizer = AutoTokenizer.from_pretrained(model_name)
+    >>> ## TENSORFLOW CODE
+    >>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
+    >>> model_name = "distilbert-base-uncased-finetuned-sst-2-english"
+    >>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(model_name)
+    >>> tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+Using the tokenizer
+^^^^^^^^^^^^^^^^^^^
+
+We mentioned the tokenizer is responsible for the preprocessing of your texts. First, it will split a given text into
+words (or part of words, punctuation symbols, etc.) usually called `tokens`. There are multiple rules that can govern
+that process, which is why we need to instantiate the tokenizer using the name of the model, to make sure we use the
+same rules as when the model was pretrained.
+
+The second step is to convert those `tokens` into numbers, to be able to build a tensor out of them and feed them to
+the model. To do this, the tokenizer has a `vocab`, which is the part we download when we instantiate it with the
+:obj:`from_pretrained` method, since we need to use the same `vocab` as when the model was pretrained.
+
+To apply these steps on a given text, we can just feed it to our tokenizer:
+
+.. code-block::
+
+    >>> inputs = tokenizer("We are very happy to show you the 🤗 Transformers library.")
+
+This returns a dictionary mapping strings to lists of ints. It contains the `ids of the tokens <glossary.html#input-ids>`__,
+as mentioned before, but also additional arguments that will be useful to the model. Here for instance, we also have an
+`attention mask <glossary.html#attention-mask>`__ that the model will use to have a better understanding of the sequence:
+
+
+.. code-block::
+
+    >>> print(inputs)
+    {'input_ids': [101, 2057, 2024, 2200, 3407, 2000, 2265, 2017, 1996, 100, 19081, 3075, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
+
+You can pass a list of sentences directly to your tokenizer. If your goal is to send them through your model as a
+batch, you probably want to pad them all to the same length, truncate them to the maximum length the model can accept
+and get tensors back. You can specify all of that to the tokenizer:
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> pt_batch = tokenizer(
+    ...     ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."],
+    ...     padding=True,
+    ...     truncation=True,
+    ...     return_tensors="pt"
+    ... )
+    >>> ## TENSORFLOW CODE
+    >>> tf_batch = tokenizer(
+    ...     ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."],
+    ...     padding=True,
+    ...     truncation=True,
+    ...     return_tensors="tf"
+    ... )
+
+The padding is automatically applied on the side the model expects it (in this case, on the right), with the
+padding token the model was pretrained with. The attention mask is also adapted to take the padding into account:
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> for key, value in pt_batch.items():
+    ...     print(f"{key}: {value.numpy().tolist()}")
+    input_ids: [[101, 2057, 2024, 2200, 3407, 2000, 2265, 2017, 1996, 100, 19081, 3075, 1012, 102], [101, 2057, 3246, 2017, 2123, 1005, 1056, 5223, 2009, 1012, 102, 0, 0, 0]]
+    attention_mask: [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]]
+    >>> ## TENSORFLOW CODE
+    >>> for key, value in tf_batch.items():
+    ...     print(f"{key}: {value.numpy().tolist()}")
+    input_ids: [[101, 2057, 2024, 2200, 3407, 2000, 2265, 2017, 1996, 100, 19081, 3075, 1012, 102], [101, 2057, 3246, 2017, 2123, 1005, 1056, 5223, 2009, 1012, 102, 0, 0, 0]]
+    attention_mask: [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]]
+
+You can learn more about tokenizers :doc:`here <preprocessing>`.
+
+Using the model
+^^^^^^^^^^^^^^^
+
+Once your input has been preprocessed by the tokenizer, you can directly send it to the model. As we mentioned, it will
+contain all the relevant information the model needs. If you're using a TensorFlow model, you can directly pass the
+dictionary mapping keys to tensors; for a PyTorch model, you need to unpack the dictionary by adding :obj:`**`.
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> pt_outputs = pt_model(**pt_batch)
+    >>> ## TENSORFLOW CODE
+    >>> tf_outputs = tf_model(tf_batch)
+
+In 🤗 Transformers, all outputs are tuples (with only one element potentially). Here, we get a tuple with just the
+final activations of the model.
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> print(pt_outputs)
+    (tensor([[-4.0833,  4.3364],
+            [ 0.0818, -0.0418]], grad_fn=<AddmmBackward>),)
+    >>> ## TENSORFLOW CODE
+    >>> print(tf_outputs)
+    (<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
+    array([[-4.0832963 ,  4.3364134 ],
+           [ 0.08181238, -0.04178794]], dtype=float32)>,)
+
+.. note::
+
+    All 🤗 Transformers models (PyTorch or TensorFlow) return the activations of the model *before* the final
+    activation function (like SoftMax) since this final activation function is often fused with the loss.
+
+Let's apply the SoftMax activation to get predictions.
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> import torch.nn.functional as F
+    >>> pt_predictions = F.softmax(pt_outputs[0], dim=-1)
+    >>> ## TENSORFLOW CODE
+    >>> import tensorflow as tf
+    >>> tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)
+
+We can see we get the numbers from before:
+
+.. code-block::
+
+    >>> ## TENSORFLOW CODE
+    >>> print(tf_predictions)
+    tf.Tensor(
+    [[2.2042994e-04 9.9977952e-01]
+     [5.3086078e-01 4.6913919e-01]], shape=(2, 2), dtype=float32)
+    >>> ## PYTORCH CODE
+    >>> print(pt_predictions)
+    tensor([[2.2043e-04, 9.9978e-01],
+            [5.3086e-01, 4.6914e-01]], grad_fn=<SoftmaxBackward>)
+
+If you have labels, you can provide them to the model, and it will return a tuple with the loss and the final activations.
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> import torch
+    >>> pt_outputs = pt_model(**pt_batch, labels = torch.tensor([1, 0]))
+    >>> ## TENSORFLOW CODE
+    >>> import tensorflow as tf
+    >>> tf_outputs = tf_model(tf_batch, labels = tf.constant([1, 0]))
+
+Models are standard `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__ or
+`tf.keras.Model <https://www.tensorflow.org/api_docs/python/tf/keras/Model>`__ so you can use them in your usual
+training loop. 🤗 Transformers also provides a :class:`~transformers.Trainer` (or :class:`~transformers.TFTrainer` if
+you are using TensorFlow) class to help with your training (taking care of things such as distributed training, mixed
+precision, etc.). See the :doc:`training tutorial <training>` for more details.
+
+Once your model is fine-tuned, you can save it with its tokenizer the following way:
+
+::
+
+    tokenizer.save_pretrained(save_directory)
+    model.save_pretrained(save_directory)
+
+You can then load this model back using the :func:`~transformers.AutoModel.from_pretrained` method by passing the
+directory name instead of the model name. One cool feature of 🤗 Transformers is that you can easily switch between
+PyTorch and TensorFlow: any model saved as before can be loaded back either in PyTorch or TensorFlow. If you are
+loading a saved PyTorch model in a TensorFlow model, use :func:`~transformers.TFAutoModel.from_pretrained` like this:
+
+::
+
+    tokenizer = AutoTokenizer.from_pretrained(save_directory)
+    model = TFAutoModel.from_pretrained(save_directory, from_pt=True)
+
+and if you are loading a saved TensorFlow model in a PyTorch model, you should use the following code:
+
+::
+
+    tokenizer = AutoTokenizer.from_pretrained(save_directory)
+    model = AutoModel.from_pretrained(save_directory, from_tf=True)
+
+Lastly, you can also ask the model to return all hidden states and all attention weights if you need them:
+
+
+::
+
+    >>> ## PYTORCH CODE
+    >>> pt_outputs = pt_model(**pt_batch, output_hidden_states=True, output_attentions=True)
+    >>> all_hidden_states, all_attentions = pt_outputs[-2:]
+    >>> ## TENSORFLOW CODE
+    >>> tf_outputs = tf_model(tf_batch, output_hidden_states=True, output_attentions=True)
+    >>> all_hidden_states, all_attentions = tf_outputs[-2:]
+
+Accessing the code
+^^^^^^^^^^^^^^^^^^
+
+The :obj:`AutoModel` and :obj:`AutoTokenizer` classes are just shortcuts that will automatically work with any
+pretrained model. Behind the scenes, the library has one model class per combination of architecture plus class, so the
+code is easy to access and tweak if you need to.
+
+In our previous example, the model was called "distilbert-base-uncased-finetuned-sst-2-english", which means it's
+using the :doc:`DistilBERT </model_doc/distilbert>` architecture. The model automatically created is then a
+:class:`~transformers.DistilBertForSequenceClassification`. You can look at its documentation for all details relevant
+to that specific model, or browse the source code. This is how you would directly instantiate model and tokenizer
+without the auto magic:
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
+    >>> model_name = "distilbert-base-uncased-finetuned-sst-2-english"
+    >>> model = DistilBertForSequenceClassification.from_pretrained(model_name)
+    >>> tokenizer = DistilBertTokenizer.from_pretrained(model_name)
+    >>> ## TENSORFLOW CODE
+    >>> from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
+    >>> model_name = "distilbert-base-uncased-finetuned-sst-2-english"
+    >>> model = TFDistilBertForSequenceClassification.from_pretrained(model_name)
+    >>> tokenizer = DistilBertTokenizer.from_pretrained(model_name)
+
+Customizing the model
+^^^^^^^^^^^^^^^^^^^^^
+
+If you want to change how the model itself is built, you can define your custom configuration class. Each architecture
+comes with its own relevant configuration (in the case of DistilBERT, :class:`~transformers.DistilBertConfig`) which
+allows you to specify any of the hidden dimension, dropout rate, etc. If you do core modifications, like changing the
+hidden size, you won't be able to use a pretrained model anymore and will need to train from scratch. You would then
+instantiate the model directly from this configuration.
+
+Here we use the predefined vocabulary of DistilBERT (hence load the tokenizer with the
+:func:`~transformers.DistilBertTokenizer.from_pretrained` method) and initialize the model from scratch (hence
+instantiate the model from the configuration instead of using the
+:func:`~transformers.DistilBertForSequenceClassification.from_pretrained` method).
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> from transformers import DistilBertConfig, DistilBertTokenizer, DistilBertForSequenceClassification
+    >>> config = DistilBertConfig(n_heads=8, dim=512, hidden_dim=4*512)
+    >>> tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+    >>> model = DistilBertForSequenceClassification(config)
+    >>> ## TENSORFLOW CODE
+    >>> from transformers import DistilBertConfig, DistilBertTokenizer, TFDistilBertForSequenceClassification
+    >>> config = DistilBertConfig(n_heads=8, dim=512, hidden_dim=4*512)
+    >>> tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+    >>> model = TFDistilBertForSequenceClassification(config)
+
+For something that only changes the head of the model (for instance, the number of labels), you can still use a
+pretrained model for the body. For instance, let's define a classifier for 10 different labels using a pretrained body.
+We could create a configuration with all the default values and just change the number of labels, but more easily, you
+can directly pass any argument a configuration would take to the :func:`from_pretrained` method and it will update the
+default configuration with it:
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> from transformers import DistilBertConfig, DistilBertTokenizer, DistilBertForSequenceClassification
+    >>> model_name = "distilbert-base-uncased"
+    >>> model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=10)
+    >>> tokenizer = DistilBertTokenizer.from_pretrained(model_name)
+    >>> ## TENSORFLOW CODE
+    >>> from transformers import DistilBertConfig, DistilBertTokenizer, TFDistilBertForSequenceClassification
+    >>> model_name = "distilbert-base-uncased"
+    >>> model = TFDistilBertForSequenceClassification.from_pretrained(model_name, num_labels=10)
+    >>> tokenizer = DistilBertTokenizer.from_pretrained(model_name)
--- a/docs/source/serialization.rst
+++ b/docs/source/serialization.rst
@@ -1,190 +0,0 @@
-Loading Google AI or OpenAI pre-trained weights or PyTorch dump
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-``from_pretrained()`` method
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-To load one of Google AI's, OpenAI's pre-trained models or a PyTorch saved model (an instance of ``BertForPreTraining`` saved with ``torch.save()``\ ), the PyTorch model classes and the tokenizer can be instantiated using the ``from_pretrained()`` method:
-
-.. code-block:: python
-
-   model = BERT_CLASS.from_pretrained(PRE_TRAINED_MODEL_NAME_OR_PATH, cache_dir=None, from_tf=False, state_dict=None, *input, **kwargs)
-
-where
-
-
-* ``BERT_CLASS`` is either a tokenizer to load the vocabulary (\ ``BertTokenizer`` or ``OpenAIGPTTokenizer`` classes) or one of the eight BERT or three OpenAI GPT PyTorch model classes (to load the pre-trained weights): ``BertModel``\ , ``BertForMaskedLM``\ , ``BertForNextSentencePrediction``\ , ``BertForPreTraining``\ , ``BertForSequenceClassification``\ , ``BertForTokenClassification``\ , ``BertForMultipleChoice``\ , ``BertForQuestionAnswering``\ , ``OpenAIGPTModel``\ , ``OpenAIGPTLMHeadModel`` or ``OpenAIGPTDoubleHeadsModel``\ , and
-*
-  ``PRE_TRAINED_MODEL_NAME_OR_PATH`` is either:
-
-
-  *
-    the shortcut name of a Google AI's or OpenAI's pre-trained model selected in the list:
-
-
-    * ``bert-base-uncased``: 12-layer, 768-hidden, 12-heads, 110M parameters
-    * ``bert-large-uncased``: 24-layer, 1024-hidden, 16-heads, 340M parameters
-    * ``bert-base-cased``: 12-layer, 768-hidden, 12-heads , 110M parameters
-    * ``bert-large-cased``: 24-layer, 1024-hidden, 16-heads, 340M parameters
-    * ``bert-base-multilingual-uncased``: (Orig, not recommended) 102 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
-    * ``bert-base-multilingual-cased``: **(New, recommended)** 104 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
-    * ``bert-base-chinese``: Chinese Simplified and Traditional, 12-layer, 768-hidden, 12-heads, 110M parameters
-    * ``bert-base-german-cased``: Trained on German data only, 12-layer, 768-hidden, 12-heads, 110M parameters `Performance Evaluation <https://deepset.ai/german-bert>`__
-    * ``bert-large-uncased-whole-word-masking``: 24-layer, 1024-hidden, 16-heads, 340M parameters - Trained with Whole Word Masking (mask all of the the tokens corresponding to a word at once)
-    * ``bert-large-cased-whole-word-masking``: 24-layer, 1024-hidden, 16-heads, 340M parameters - Trained with Whole Word Masking (mask all of the the tokens corresponding to a word at once)
-    * ``bert-large-uncased-whole-word-masking-finetuned-squad``: The ``bert-large-uncased-whole-word-masking`` model finetuned on SQuAD (using the ``run_bert_squad.py`` examples). Results: *exact_match: 86.91579943235573, f1: 93.1532499015869*
-    * ``bert-base-german-dbmdz-cased``: Trained on German data only, 12-layer, 768-hidden, 12-heads, 110M parameters `Performance Evaluation <https://github.com/dbmdz/german-bert>`__
-    * ``bert-base-german-dbmdz-uncased``: Trained on (uncased) German data only, 12-layer, 768-hidden, 12-heads, 110M parameters `Performance Evaluation <https://github.com/dbmdz/german-bert>`__
-    * ``openai-gpt``: OpenAI GPT English model, 12-layer, 768-hidden, 12-heads, 110M parameters
-    * ``gpt2``: OpenAI GPT-2 English model, 12-layer, 768-hidden, 12-heads, 117M parameters
-    * ``gpt2-medium``: OpenAI GPT-2 English model, 24-layer, 1024-hidden, 16-heads, 345M parameters
-    * ``transfo-xl-wt103``: Transformer-XL English model trained on wikitext-103, 18-layer, 1024-hidden, 16-heads, 257M parameters
-
-  *
-    a path or url to a pretrained model archive containing:
-
-
-    * ``bert_config.json`` or ``openai_gpt_config.json`` a configuration file for the model, and
-    * ``pytorch_model.bin`` a PyTorch dump of a pre-trained instance of ``BertForPreTraining``\ , ``OpenAIGPTModel``\ , ``TransfoXLModel``\ , ``GPT2LMHeadModel`` (saved with the usual ``torch.save()``\ )
-
-  If ``PRE_TRAINED_MODEL_NAME_OR_PATH`` is a shortcut name, the pre-trained weights will be downloaded from AWS S3 (see the links `here <https://github.com/huggingface/transformers/blob/master/transformers/modeling_bert.py>`__\ ) and stored in a cache folder to avoid future download (the cache folder can be found at ``~/.pytorch_pretrained_bert/``\ ).
-
-*
-  ``cache_dir`` can be an optional path to a specific directory to download and cache the pre-trained model weights. This option is useful in particular when you are using distributed training: to avoid concurrent access to the same weights you can set for example ``cache_dir='./pretrained_model_{}'.format(args.local_rank)`` (see the section on distributed training for more information).
-
-* ``from_tf``\ : should we load the weights from a locally saved TensorFlow checkpoint
-* ``state_dict``\ : an optional state dictionary (collections.OrderedDict object) to use instead of Google pre-trained models
-* ``*inputs``\ , `**kwargs`: additional input for the specific Bert class (ex: num_labels for BertForSequenceClassification)
-
-``Uncased`` means that the text has been lowercased before WordPiece tokenization, e.g., ``John Smith`` becomes ``john smith``. The Uncased model also strips out any accent markers. ``Cased`` means that the true case and accent markers are preserved. Typically, the Uncased model is better unless you know that case information is important for your task (e.g., Named Entity Recognition or Part-of-Speech tagging). For information about the Multilingual and Chinese model, see the `Multilingual README <https://github.com/google-research/bert/blob/master/multilingual.md>`__ or the original TensorFlow repository.
-
-When using an ``uncased model``\ , make sure to pass ``--do_lower_case`` to the example training scripts (or pass ``do_lower_case=True`` to FullTokenizer if you're using your own script and loading the tokenizer your-self.).
-
-Examples:
-
-.. code-block:: python
-
-   # BERT
-   tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True, do_basic_tokenize=True)
-   model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
-
-   # OpenAI GPT
-   tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
-   model = OpenAIGPTModel.from_pretrained('openai-gpt')
-
-   # Transformer-XL
-   tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
-   model = TransfoXLModel.from_pretrained('transfo-xl-wt103')
-
-   # OpenAI GPT-2
-   tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-   model = GPT2Model.from_pretrained('gpt2')
-
-Cache directory
-~~~~~~~~~~~~~~~
-
-``pytorch_pretrained_bert`` save the pretrained weights in a cache directory which is located at (in this order of priority):
-
-
-* ``cache_dir`` optional arguments to the ``from_pretrained()`` method (see above),
-* shell environment variable ``PYTORCH_PRETRAINED_BERT_CACHE``\ ,
-* PyTorch cache home + ``/pytorch_pretrained_bert/``
-  where PyTorch cache home is defined by (in this order):
-
-  * shell environment variable ``ENV_TORCH_HOME``
-  * shell environment variable ``ENV_XDG_CACHE_HOME`` + ``/torch/``\ )
-  * default: ``~/.cache/torch/``
-
-Usually, if you don't set any specific environment variable, ``pytorch_pretrained_bert`` cache will be at ``~/.cache/torch/pytorch_pretrained_bert/``.
-
-You can alsways safely delete ``pytorch_pretrained_bert`` cache but the pretrained model weights and vocabulary files wil have to be re-downloaded from our S3.
-
-Serialization best-practices
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-This section explain how you can save and re-load a fine-tuned model (BERT, GPT, GPT-2 and Transformer-XL).
-There are three types of files you need to save to be able to reload a fine-tuned model:
-
-
-* the model itself which should be saved following PyTorch serialization `best practices <https://pytorch.org/docs/stable/notes/serialization.html#best-practices>`__\ ,
-* the configuration file of the model which is saved as a JSON file, and
-* the vocabulary (and the merges for the BPE-based models GPT and GPT-2).
-
-The *default filenames* of these files are as follow:
-
-
-* the model weights file: ``pytorch_model.bin``\ ,
-* the configuration file: ``config.json``\ ,
-* the vocabulary file: ``vocab.txt`` for BERT and Transformer-XL, ``vocab.json`` for GPT/GPT-2 (BPE vocabulary),
-* for GPT/GPT-2 (BPE vocabulary) the additional merges file: ``merges.txt``.
-
-**If you save a model using these *default filenames*\ , you can then re-load the model and tokenizer using the ``from_pretrained()`` method.**
-
-Here is the recommended way of saving the model, configuration and vocabulary to an ``output_dir`` directory and reloading the model and tokenizer afterwards:
-
-.. code-block:: python
-
-   from transformers import WEIGHTS_NAME, CONFIG_NAME
-
-   output_dir = "./models/"
-
-   # Step 1: Save a model, configuration and vocabulary that you have fine-tuned
-
-   # If we have a distributed model, save only the encapsulated model
-   # (it was wrapped in PyTorch DistributedDataParallel or DataParallel)
-   model_to_save = model.module if hasattr(model, 'module') else model
-
-   # If we save using the predefined names, we can load using `from_pretrained`
-   output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
-   output_config_file = os.path.join(output_dir, CONFIG_NAME)
-
-   torch.save(model_to_save.state_dict(), output_model_file)
-   model_to_save.config.to_json_file(output_config_file)
-   tokenizer.save_vocabulary(output_dir)
-
-   # Step 2: Re-load the saved model and vocabulary
-
-   # Example for a Bert model
-   model = BertForQuestionAnswering.from_pretrained(output_dir)
-   tokenizer = BertTokenizer.from_pretrained(output_dir, do_lower_case=args.do_lower_case)  # Add specific options if needed
-   # Example for a GPT model
-   model = OpenAIGPTDoubleHeadsModel.from_pretrained(output_dir)
-   tokenizer = OpenAIGPTTokenizer.from_pretrained(output_dir)
-
-Here is another way you can save and reload the model if you want to use specific paths for each type of files:
-
-.. code-block:: python
-
-   output_model_file = "./models/my_own_model_file.bin"
-   output_config_file = "./models/my_own_config_file.bin"
-   output_vocab_file = "./models/my_own_vocab_file.bin"
-
-   # Step 1: Save a model, configuration and vocabulary that you have fine-tuned
-
-   # If we have a distributed model, save only the encapsulated model
-   # (it was wrapped in PyTorch DistributedDataParallel or DataParallel)
-   model_to_save = model.module if hasattr(model, 'module') else model
-
-   torch.save(model_to_save.state_dict(), output_model_file)
-   model_to_save.config.to_json_file(output_config_file)
-   tokenizer.save_vocabulary(output_vocab_file)
-
-   # Step 2: Re-load the saved model and vocabulary
-
-   # We didn't save using the predefined WEIGHTS_NAME, CONFIG_NAME names, we cannot load using `from_pretrained`.
-   # Here is how to do it in this situation:
-
-   # Example for a Bert model
-   config = BertConfig.from_json_file(output_config_file)
-   model = BertForQuestionAnswering(config)
-   state_dict = torch.load(output_model_file)
-   model.load_state_dict(state_dict)
-   tokenizer = BertTokenizer(output_vocab_file, do_lower_case=args.do_lower_case)
-
-   # Example for a GPT model
-   config = OpenAIGPTConfig.from_json_file(output_config_file)
-   model = OpenAIGPTDoubleHeadsModel(config)
-   state_dict = torch.load(output_model_file)
-   model.load_state_dict(state_dict)
-   tokenizer = OpenAIGPTTokenizer(output_vocab_file)
-
--- a/docs/source/task_summary.rst
+++ b/docs/source/task_summary.rst
@@ -0,0 +1,845 @@
+Summary of the tasks
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This page shows the most frequent use-cases when using the library. The models available allow for many different
+configurations and a great versatility in use-cases. The simplest ones are presented here, showcasing usage
+for tasks such as question answering, sequence classification, named entity recognition and others.
+
+These examples leverage auto-models, which are classes that will instantiate a model according to a given checkpoint,
+automatically selecting the correct model architecture. Please check the :class:`~transformers.AutoModel` documentation
+for more information.
+Feel free to modify the code to be more specific and adapt it to your specific use-case.
+
+In order for a model to perform well on a task, it must be loaded from a checkpoint corresponding to that task. These
+checkpoints are usually pre-trained on a large corpus of data and fine-tuned on a specific task. This means the
+following:
+
+- Not all models were fine-tuned on all tasks. If you want to fine-tune a model on a specific task, you can leverage
+  one of the ``run_$TASK.py`` scripts in the
+  `examples <https://github.com/huggingface/transformers/tree/master/examples>`_ directory.
+- Fine-tuned models were fine-tuned on a specific dataset. This dataset may or may not overlap with your use-case
+  and domain. As mentioned previously, you may leverage the
+  `examples <https://github.com/huggingface/transformers/tree/master/examples>`_ scripts to fine-tune your model, or you
+  may create your own training script.
+
+In order to run inference on a task, several mechanisms are made available by the library:
+
+- Pipelines: very easy-to-use abstractions, which require as little as two lines of code.
+- Using a model directly with a tokenizer (PyTorch/TensorFlow): the full inference using the model. Less abstraction,
+  but much more powerful.
+
+Both approaches are showcased here.
+
+.. note::
+
+    All tasks presented here leverage pre-trained checkpoints that were fine-tuned on specific tasks. Loading a
+    checkpoint that was not fine-tuned on a specific task would load only the base transformer layers and not the
+    additional head that is used for the task, initializing the weights of that head randomly.
+
+    This would produce random output.
+
+Sequence Classification
+--------------------------
+
+Sequence classification is the task of classifying sequences according to a given number of classes. An example
+of sequence classification is the GLUE dataset, which is entirely based on that task. If you would like to fine-tune
+a model on a GLUE sequence classification task, you may leverage the
+`run_glue.py <https://github.com/huggingface/transformers/tree/master/examples/text-classification/run_glue.py>`_ or
+`run_tf_glue.py <https://github.com/huggingface/transformers/tree/master/examples/text-classification/run_tf_glue.py>`_ scripts.
+
+Here is an example of using pipelines to do sentiment analysis: identifying if a sequence is positive or negative.
+It leverages a fine-tuned model on sst2, which is a GLUE task.
+
+This returns a label ("POSITIVE" or "NEGATIVE") alongside a score, as follows:
+
+.. code-block::
+
+    >>> from transformers import pipeline
+
+    >>> nlp = pipeline("sentiment-analysis")
+
+    >>> result = nlp("I hate you")[0]
+    >>> print(f"label: {result['label']}, with score: {round(result['score'], 4)}")
+    label: NEGATIVE, with score: 0.9991
+
+    >>> result = nlp("I love you")[0]
+    >>> print(f"label: {result['label']}, with score: {round(result['score'], 4)}")
+    label: POSITIVE, with score: 0.9999
+
+
+Here is an example of doing a sequence classification using a model to determine if two sequences are paraphrases
+of each other. The process is the following:
+
+- Instantiate a tokenizer and a model from the checkpoint name. The model is identified as a BERT model and is loaded
+  with the weights stored in the checkpoint.
+- Build a sequence from the two sentences, with the correct model-specific separators, token type ids
+  and attention masks (:func:`~transformers.PreTrainedTokenizer.encode` and
+  :func:`~transformers.PreTrainedTokenizer.__call__` take care of this)
+- Pass this sequence through the model so that it is classified in one of the two available classes: 0
+  (not a paraphrase) and 1 (is a paraphrase)
+- Compute the softmax of the result to get probabilities over the classes
+- Print the results
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> from transformers import AutoTokenizer, AutoModelForSequenceClassification
+    >>> import torch
+
+    >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
+    >>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc")
+
+    >>> classes = ["not paraphrase", "is paraphrase"]
+
+    >>> sequence_0 = "The company HuggingFace is based in New York City"
+    >>> sequence_1 = "Apples are especially bad for your health"
+    >>> sequence_2 = "HuggingFace's headquarters are situated in Manhattan"
+
+    >>> paraphrase = tokenizer(sequence_0, sequence_2, return_tensors="pt")
+    >>> not_paraphrase = tokenizer(sequence_0, sequence_1, return_tensors="pt")
+
+    >>> paraphrase_classification_logits = model(**paraphrase)[0]
+    >>> not_paraphrase_classification_logits = model(**not_paraphrase)[0]
+
+    >>> paraphrase_results = torch.softmax(paraphrase_classification_logits, dim=1).tolist()[0]
+    >>> not_paraphrase_results = torch.softmax(not_paraphrase_classification_logits, dim=1).tolist()[0]
+
+    >>> # Should be paraphrase
+    >>> for i in range(len(classes)):
+    ...     print(f"{classes[i]}: {int(round(paraphrase_results[i] * 100))}%")
+    not paraphrase: 10%
+    is paraphrase: 90%
+
+    >>> # Should not be paraphrase
+    >>> for i in range(len(classes)):
+    ...     print(f"{classes[i]}: {int(round(not_paraphrase_results[i] * 100))}%")
+    not paraphrase: 94%
+    is paraphrase: 6%
+    >>> ## TENSORFLOW CODE
+    >>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
+    >>> import tensorflow as tf
+
+    >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
+    >>> model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc")
+
+    >>> classes = ["not paraphrase", "is paraphrase"]
+
+    >>> sequence_0 = "The company HuggingFace is based in New York City"
+    >>> sequence_1 = "Apples are especially bad for your health"
+    >>> sequence_2 = "HuggingFace's headquarters are situated in Manhattan"
+
+    >>> paraphrase = tokenizer(sequence_0, sequence_2, return_tensors="tf")
+    >>> not_paraphrase = tokenizer(sequence_0, sequence_1, return_tensors="tf")
+
+    >>> paraphrase_classification_logits = model(paraphrase)[0]
+    >>> not_paraphrase_classification_logits = model(not_paraphrase)[0]
+
+    >>> paraphrase_results = tf.nn.softmax(paraphrase_classification_logits, axis=1).numpy()[0]
+    >>> not_paraphrase_results = tf.nn.softmax(not_paraphrase_classification_logits, axis=1).numpy()[0]
+
+    >>> # Should be paraphrase
+    >>> for i in range(len(classes)):
+    ...     print(f"{classes[i]}: {int(round(paraphrase_results[i] * 100))}%")
+    not paraphrase: 10%
+    is paraphrase: 90%
+
+    >>> # Should not be paraphrase
+    >>> for i in range(len(classes)):
+    ...     print(f"{classes[i]}: {int(round(not_paraphrase_results[i] * 100))}%")
+    not paraphrase: 94%
+    is paraphrase: 6%
+
+Extractive Question Answering
+----------------------------------------------------
+
+Extractive Question Answering is the task of extracting an answer from a text given a question. An example of a
+question answering dataset is the SQuAD dataset, which is entirely based on that task. If you would like to fine-tune
+a model on a SQuAD task, you may leverage the ``run_squad.py`` script.
+
+Here is an example of using pipelines to do question answering: extracting an answer from a text given a question.
+It leverages a fine-tuned model on SQuAD.
+
+.. code-block::
+
+    >>> from transformers import pipeline
+
+    >>> nlp = pipeline("question-answering")
+
+    >>> context = r"""
+    ... Extractive Question Answering is the task of extracting an answer from a text given a question. An example of a
+    ... question answering dataset is the SQuAD dataset, which is entirely based on that task. If you would like to fine-tune
+    ... a model on a SQuAD task, you may leverage the examples/question-answering/run_squad.py script.
+    ... """
+
+This returns an answer extracted from the text, a confidence score, alongside "start" and "end" values which
+are the positions of the extracted answer in the text.
+
+.. code-block::
+
+    >>> result = nlp(question="What is extractive question answering?", context=context)
+    >>> print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")
+    Answer: 'the task of extracting an answer from a text given a question.', score: 0.6226, start: 34, end: 96
+
+    >>> result = nlp(question="What is a good example of a question answering dataset?", context=context)
+    >>> print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")
+    Answer: 'SQuAD dataset,', score: 0.5053, start: 147, end: 161
+
+
+Here is an example of question answering using a model and a tokenizer. The process is the following:
+
+- Instantiate a tokenizer and a model from the checkpoint name. The model is identified as a BERT model and is loaded
+  with the weights stored in the checkpoint.
+- Define a text and a few questions.
+- Iterate over the questions and build a sequence from the text and the current question, with the correct
+  model-specific separators, token type ids and attention masks.
+- Pass this sequence through the model. This outputs a range of scores across the entire sequence tokens (question and
+  text), for both the start and end positions.
+- Compute the softmax of the result to get probabilities over the tokens
+- Fetch the tokens from the identified start and stop values, convert those tokens to a string.
+- Print the results
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> from transformers import AutoTokenizer, AutoModelForQuestionAnswering
+    >>> import torch
+
+    >>> tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
+    >>> model = AutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
+
+    >>> text = r"""
+    ... 🤗 Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose
+    ... architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural
+    ... Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between
+    ... TensorFlow 2.0 and PyTorch.
+    ... """
+
+    >>> questions = [
+    ...     "How many pretrained models are available in 🤗 Transformers?",
+    ...     "What does 🤗 Transformers provide?",
+    ...     "🤗 Transformers provides interoperability between which frameworks?",
+    ... ]
+
+    >>> for question in questions:
+    ...     inputs = tokenizer(question, text, add_special_tokens=True, return_tensors="pt")
+    ...     input_ids = inputs["input_ids"].tolist()[0]
+    ...
+    ...     text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
+    ...     answer_start_scores, answer_end_scores = model(**inputs)
+    ...
+    ...     answer_start = torch.argmax(
+    ...         answer_start_scores
+    ...     )  # Get the most likely beginning of answer with the argmax of the score
+    ...     answer_end = torch.argmax(answer_end_scores) + 1  # Get the most likely end of answer with the argmax of the score
+    ...
+    ...     answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
+    ...
+    ...     print(f"Question: {question}")
+    ...     print(f"Answer: {answer}")
+    Question: How many pretrained models are available in 🤗 Transformers?
+    Answer: over 32 +
+    Question: What does 🤗 Transformers provide?
+    Answer: general - purpose architectures
+    Question: 🤗 Transformers provides interoperability between which frameworks?
+    Answer: tensorflow 2 . 0 and pytorch
+    >>> ## TENSORFLOW CODE
+    >>> from transformers import AutoTokenizer, TFAutoModelForQuestionAnswering
+    >>> import tensorflow as tf
+
+    >>> tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
+    >>> model = TFAutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
+
+    >>> text = r"""
+    ... 🤗 Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose
+    ... architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural
+    ... Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between
+    ... TensorFlow 2.0 and PyTorch.
+    ... """
+
+    >>> questions = [
+    ...     "How many pretrained models are available in 🤗 Transformers?",
+    ...     "What does 🤗 Transformers provide?",
+    ...     "🤗 Transformers provides interoperability between which frameworks?",
+    ... ]
+
+    >>> for question in questions:
+    ...     inputs = tokenizer(question, text, add_special_tokens=True, return_tensors="tf")
+    ...     input_ids = inputs["input_ids"].numpy()[0]
+    ...
+    ...     text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
+    ...     answer_start_scores, answer_end_scores = model(inputs)
+    ...
+    ...     answer_start = tf.argmax(
+    ...         answer_start_scores, axis=1
+    ...     ).numpy()[0]  # Get the most likely beginning of answer with the argmax of the score
+    ...     answer_end = (
+    ...         tf.argmax(answer_end_scores, axis=1) + 1
+    ...     ).numpy()[0]  # Get the most likely end of answer with the argmax of the score
+    ...     answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
+    ...
+    ...     print(f"Question: {question}")
+    ...     print(f"Answer: {answer}")
+    Question: How many pretrained models are available in 🤗 Transformers?
+    Answer: over 32 +
+    Question: What does 🤗 Transformers provide?
+    Answer: general - purpose architectures
+    Question: 🤗 Transformers provides interoperability between which frameworks?
+    Answer: tensorflow 2 . 0 and pytorch
+
+
+
+Language Modeling
+----------------------------------------------------
+
+Language modeling is the task of fitting a model to a corpus, which can be domain specific. All popular transformer
+based models are trained using a variant of language modeling, e.g. BERT with masked language modeling, GPT-2 with
+causal language modeling.
+
+Language modeling can be useful outside of pre-training as well, for example to shift the model distribution to be
+domain-specific: using a language model trained over a very large corpus, and then fine-tuning it to a news dataset
+or on scientific papers e.g. `LysandreJik/arxiv-nlp <https://huggingface.co/lysandre/arxiv-nlp>`__.
+
+Masked Language Modeling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Masked language modeling is the task of masking tokens in a sequence with a masking token, and prompting the model to
+fill that mask with an appropriate token. This allows the model to attend to both the right context (tokens on the
+right of the mask) and the left context (tokens on the left of the mask). Such a training creates a strong basis
+for downstream tasks requiring bi-directional context such as SQuAD (question answering,
+see `Lewis, Liu, Goyal et al. <https://arxiv.org/abs/1910.13461>`__, part 4.2).
+
+Here is an example of using pipelines to replace a mask from a sequence:
+
+.. code-block::
+
+    >>> from transformers import pipeline
+
+    >>> nlp = pipeline("fill-mask")
+
+This outputs the sequences with the mask filled, the confidence score as well as the token id in the tokenizer
+vocabulary:
+
+.. code-block::
+
+    >>> from pprint import pprint
+    >>> pprint(nlp(f"HuggingFace is creating a {nlp.tokenizer.mask_token} that the community uses to solve NLP tasks."))
+    [{'score': 0.1792745739221573,
+      'sequence': '<s>HuggingFace is creating a tool that the community uses to '
+                  'solve NLP tasks.</s>',
+      'token': 3944,
+      'token_str': 'Ġtool'},
+     {'score': 0.11349421739578247,
+      'sequence': '<s>HuggingFace is creating a framework that the community uses '
+                  'to solve NLP tasks.</s>',
+      'token': 7208,
+      'token_str': 'Ġframework'},
+     {'score': 0.05243554711341858,
+      'sequence': '<s>HuggingFace is creating a library that the community uses to '
+                  'solve NLP tasks.</s>',
+      'token': 5560,
+      'token_str': 'Ġlibrary'},
+     {'score': 0.03493533283472061,
+      'sequence': '<s>HuggingFace is creating a database that the community uses '
+                  'to solve NLP tasks.</s>',
+      'token': 8503,
+      'token_str': 'Ġdatabase'},
+     {'score': 0.02860250137746334,
+      'sequence': '<s>HuggingFace is creating a prototype that the community uses '
+                  'to solve NLP tasks.</s>',
+      'token': 17715,
+      'token_str': 'Ġprototype'}]
+
+Here is an example doing masked language modeling using a model and a tokenizer. The process is the following:
+
+- Instantiate a tokenizer and a model from the checkpoint name. The model is identified as a DistilBERT model and is
+  loaded with the weights stored in the checkpoint.
+- Define a sequence with a masked token, placing the :obj:`tokenizer.mask_token` instead of a word.
+- Encode that sequence into IDs and find the position of the masked token in that list of IDs.
+- Retrieve the predictions at the index of the mask token: this tensor has the same size as the vocabulary, and the
+  values are the scores attributed to each token. The model gives a higher score to tokens it deems probable in that
+  context.
+- Retrieve the top 5 tokens using the PyTorch :obj:`topk` or TensorFlow :obj:`top_k` methods.
+- Replace the mask token by the tokens and print the results
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> from transformers import AutoModelWithLMHead, AutoTokenizer
+    >>> import torch
+
+    >>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
+    >>> model = AutoModelWithLMHead.from_pretrained("distilbert-base-cased")
+
+    >>> sequence = f"Distilled models are smaller than the models they mimic. Using them instead of the large versions would help {tokenizer.mask_token} our carbon footprint."
+
+    >>> input = tokenizer.encode(sequence, return_tensors="pt")
+    >>> mask_token_index = torch.where(input == tokenizer.mask_token_id)[1]
+
+    >>> token_logits = model(input)[0]
+    >>> mask_token_logits = token_logits[0, mask_token_index, :]
+
+    >>> top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
+    >>> ## TENSORFLOW CODE
+    >>> from transformers import TFAutoModelWithLMHead, AutoTokenizer
+    >>> import tensorflow as tf
+
+    >>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
+    >>> model = TFAutoModelWithLMHead.from_pretrained("distilbert-base-cased")
+
+    >>> sequence = f"Distilled models are smaller than the models they mimic. Using them instead of the large versions would help {tokenizer.mask_token} our carbon footprint."
+
+    >>> input = tokenizer.encode(sequence, return_tensors="tf")
+    >>> mask_token_index = tf.where(input == tokenizer.mask_token_id)[0, 1]
+
+    >>> token_logits = model(input)[0]
+    >>> mask_token_logits = token_logits[0, mask_token_index, :]
+
+    >>> top_5_tokens = tf.math.top_k(mask_token_logits, 5).indices.numpy()
+
+
+This prints five sequences, with the top 5 tokens predicted by the model:
+
+.. code-block::
+
+    >>> for token in top_5_tokens:
+    ...     print(sequence.replace(tokenizer.mask_token, tokenizer.decode([token])))
+    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help reduce our carbon footprint.
+    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help increase our carbon footprint.
+    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help decrease our carbon footprint.
+    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help offset our carbon footprint.
+    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help improve our carbon footprint.
+
+
+Causal Language Modeling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Causal language modeling is the task of predicting the token following a sequence of tokens. In this situation, the
+model only attends to the left context (tokens on the left of the mask). Such a training is particularly interesting
+for generation tasks.
+
+Usually, the next token is predicted by sampling from the logits of the last hidden state the model produces from the input sequence.
+
+Here is an example using the tokenizer and model and leveraging the :func:`~transformers.PreTrainedModel.top_k_top_p_filtering` method to sample the next token following an input sequence of tokens.
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> from transformers import AutoModelWithLMHead, AutoTokenizer, top_k_top_p_filtering
+    >>> import torch
+    >>> from torch.nn import functional as F
+
+    >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
+    >>> model = AutoModelWithLMHead.from_pretrained("gpt2")
+
+    >>> sequence = f"Hugging Face is based in DUMBO, New York City, and "
+
+    >>> input_ids = tokenizer.encode(sequence, return_tensors="pt")
+
+    >>> # get logits of last hidden state
+    >>> next_token_logits = model(input_ids)[0][:, -1, :]
+
+    >>> # filter
+    >>> filtered_next_token_logits = top_k_top_p_filtering(next_token_logits, top_k=50, top_p=1.0)
+
+    >>> # sample
+    >>> probs = F.softmax(filtered_next_token_logits, dim=-1)
+    >>> next_token = torch.multinomial(probs, num_samples=1)
+
+    >>> generated = torch.cat([input_ids, next_token], dim=-1)
+
+    >>> resulting_string = tokenizer.decode(generated.tolist()[0])
+    >>> ## TENSORFLOW CODE
+    >>> from transformers import TFAutoModelWithLMHead, AutoTokenizer, tf_top_k_top_p_filtering
+    >>> import tensorflow as tf
+
+    >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
+    >>> model = TFAutoModelWithLMHead.from_pretrained("gpt2")
+
+    >>> sequence = f"Hugging Face is based in DUMBO, New York City, and "
+
+    >>> input_ids = tokenizer.encode(sequence, return_tensors="tf")
+
+    >>> # get logits of last hidden state
+    >>> next_token_logits = model(input_ids)[0][:, -1, :]
+
+    >>> # filter
+    >>> filtered_next_token_logits = tf_top_k_top_p_filtering(next_token_logits, top_k=50, top_p=1.0)
+
+    >>> # sample
+    >>> next_token = tf.random.categorical(filtered_next_token_logits, dtype=tf.int32, num_samples=1)
+
+    >>> generated = tf.concat([input_ids, next_token], axis=1)
+
+    >>> resulting_string = tokenizer.decode(generated.numpy().tolist()[0])
+
+
+This outputs a (hopefully) coherent next token following the original sequence, which in our case is the word *has*:
+
+.. code-block::
+
+    print(resulting_string)
+    Hugging Face is based in DUMBO, New York City, and has
+
+In the next section, we show how this functionality is leveraged in :func:`~transformers.PreTrainedModel.generate` to generate multiple tokens up to a user-defined length.
+
+Text Generation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In text generation (*a.k.a* *open-ended text generation*) the goal is to create a coherent portion of text that is a continuation from the given context. As an example, it is shown how *GPT-2* can be used in pipelines to generate text. As a default all models apply *Top-K* sampling when used in pipelines as configured in their respective configurations (see `gpt-2 config <https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json>`_ for example).
+
+.. code-block::
+
+    >>> from transformers import pipeline
+
+    >>> text_generator = pipeline("text-generation")
+    >>> print(text_generator("As far as I am concerned, I will", max_length=50, do_sample=False))
+    [{'generated_text': 'As far as I am concerned, I will be the first to admit that I am not a fan of the idea of a "free market." I think that the idea of a free market is a bit of a stretch. I think that the idea'}]
+
+
+
+Here the model generates a random text with a total maximal length of *50* tokens from context *"As far as I am concerned, I will"*.
+The default arguments of ``PreTrainedModel.generate()`` can directly be overridden in the pipeline as is shown above for the argument ``max_length``.
+
+Here is an example for text generation using XLNet and its tokenizer.
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> from transformers import AutoModelWithLMHead, AutoTokenizer
+
+    >>> model = AutoModelWithLMHead.from_pretrained("xlnet-base-cased")
+    >>> tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")
+
+    >>> # Padding text helps XLNet with short prompts - proposed by Aman Rusia in https://github.com/rusiaaman/XLNet-gen#methodology
+    >>> PADDING_TEXT = """In 1991, the remains of Russian Tsar Nicholas II and his family
+    ... (except for Alexei and Maria) are discovered.
+    ... The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the
+    ... remainder of the story. 1883 Western Siberia,
+    ... a young Grigori Rasputin is asked by his father and a group of men to perform magic.
+    ... Rasputin has a vision and denounces one of the men as a horse thief. Although his
+    ... father initially slaps him for making such an accusation, Rasputin watches as the
+    ... man is chased outside and beaten. Twenty years later, Rasputin sees a vision of
+    ... the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous,
+    ... with people, even a bishop, begging for his blessing. <eod> </s> <eos>"""
+
+    >>> prompt = "Today the weather is really nice and I am planning on "
+    >>> inputs = tokenizer.encode(PADDING_TEXT + prompt, add_special_tokens=False, return_tensors="pt")
+
+    >>> prompt_length = len(tokenizer.decode(inputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True))
+    >>> outputs = model.generate(inputs, max_length=250, do_sample=True, top_p=0.95, top_k=60)
+    >>> generated = prompt + tokenizer.decode(outputs[0])[prompt_length:]
+
+    >>> ## TENSORFLOW CODE
+    >>> from transformers import TFAutoModelWithLMHead, AutoTokenizer
+
+    >>> model = TFAutoModelWithLMHead.from_pretrained("xlnet-base-cased")
+    >>> tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")
+
+    >>> # Padding text helps XLNet with short prompts - proposed by Aman Rusia in https://github.com/rusiaaman/XLNet-gen#methodology
+    >>> PADDING_TEXT = """In 1991, the remains of Russian Tsar Nicholas II and his family
+    ... (except for Alexei and Maria) are discovered.
+    ... The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the
+    ... remainder of the story. 1883 Western Siberia,
+    ... a young Grigori Rasputin is asked by his father and a group of men to perform magic.
+    ... Rasputin has a vision and denounces one of the men as a horse thief. Although his
+    ... father initially slaps him for making such an accusation, Rasputin watches as the
+    ... man is chased outside and beaten. Twenty years later, Rasputin sees a vision of
+    ... the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous,
+    ... with people, even a bishop, begging for his blessing. <eod> </s> <eos>"""
+
+    >>> prompt = "Today the weather is really nice and I am planning on "
+    >>> inputs = tokenizer.encode(PADDING_TEXT + prompt, add_special_tokens=False, return_tensors="tf")
+
+    >>> prompt_length = len(tokenizer.decode(inputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True))
+    >>> outputs = model.generate(inputs, max_length=250, do_sample=True, top_p=0.95, top_k=60)
+    >>> generated = prompt + tokenizer.decode(outputs[0])[prompt_length:]
+
+.. code-block::
+
+    print(generated)
+
+Text generation is currently possible with *GPT-2*, *OpenAi-GPT*, *CTRL*, *XLNet*, *Transfo-XL* and *Reformer* in PyTorch and for most models in Tensorflow as well. As can be seen in the example above *XLNet* and *Transfo-xl* often need to be padded to work well.
+GPT-2 is usually a good choice for *open-ended text generation* because it was trained on millions of webpages with a causal language modeling objective.
+
+For more information on how to apply different decoding strategies for text generation, please also refer to our generation blog post `here <https://huggingface.co/blog/how-to-generate>`_.
+
+
+Named Entity Recognition
+----------------------------------------------------
+
+Named Entity Recognition (NER) is the task of classifying tokens according to a class, for example identifying a
+token as a person, an organisation or a location.
+An example of a named entity recognition dataset is the CoNLL-2003 dataset, which is entirely based on that task.
+If you would like to fine-tune a model on an NER task, you may leverage the `ner/run_ner.py` (PyTorch),
+`ner/run_pl_ner.py` (leveraging pytorch-lightning) or the `ner/run_tf_ner.py` (TensorFlow) scripts.
+
+Here is an example using the pipelines to do named entity recognition, trying to identify tokens as belonging to one
+of 9 classes:
+
+- O, Outside of a named entity
+- B-MISC, Beginning of a miscellaneous entity right after another miscellaneous entity
+- I-MISC, Miscellaneous entity
+- B-PER, Beginning of a person's name right after another person's name
+- I-PER, Person's name
+- B-ORG, Beginning of an organisation right after another organisation
+- I-ORG, Organisation
+- B-LOC, Beginning of a location right after another location
+- I-LOC, Location
+
+It leverages a fine-tuned model on CoNLL-2003, fine-tuned by `@stefan-it <https://github.com/stefan-it>`__ from
+`dbmdz <https://github.com/dbmdz>`__.
+
+.. code-block::
+
+    >>> from transformers import pipeline
+
+    >>> nlp = pipeline("ner")
+
+    >>> sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very"
+    ...            "close to the Manhattan Bridge which is visible from the window."
+
+
+This outputs a list of all words that have been identified as an entity from the 9 classes defined above. Here are the
+expected results:
+
+.. code-block::
+
+    print(nlp(sequence))
+
+    [
+        {'word': 'Hu', 'score': 0.9995632767677307, 'entity': 'I-ORG'},
+        {'word': '##gging', 'score': 0.9915938973426819, 'entity': 'I-ORG'},
+        {'word': 'Face', 'score': 0.9982671737670898, 'entity': 'I-ORG'},
+        {'word': 'Inc', 'score': 0.9994403719902039, 'entity': 'I-ORG'},
+        {'word': 'New', 'score': 0.9994346499443054, 'entity': 'I-LOC'},
+        {'word': 'York', 'score': 0.9993270635604858, 'entity': 'I-LOC'},
+        {'word': 'City', 'score': 0.9993864893913269, 'entity': 'I-LOC'},
+        {'word': 'D', 'score': 0.9825621843338013, 'entity': 'I-LOC'},
+        {'word': '##UM', 'score': 0.936983048915863, 'entity': 'I-LOC'},
+        {'word': '##BO', 'score': 0.8987102508544922, 'entity': 'I-LOC'},
+        {'word': 'Manhattan', 'score': 0.9758241176605225, 'entity': 'I-LOC'},
+        {'word': 'Bridge', 'score': 0.990249514579773, 'entity': 'I-LOC'}
+    ]
+
+Note how the words "Hugging Face" have been identified as an organisation, and "New York City", "DUMBO" and
+"Manhattan Bridge" have been identified as locations.
+
+Here is an example doing named entity recognition using a model and a tokenizer. The process is the following:
+
+- Instantiate a tokenizer and a model from the checkpoint name. The model is identified as a BERT model and
+  is loaded with the weights stored in the checkpoint.
+- Define the label list with which the model was trained.
+- Define a sequence with known entities, such as "Hugging Face" as an organisation and "New York City" as a location.
+- Split words into tokens so that they can be mapped to the predictions. We use a small hack by firstly completely
+  encoding and decoding the sequence, so that we're left with a string that contains the special tokens.
+- Encode that sequence into IDs (special tokens are added automatically).
+- Retrieve the predictions by passing the input to the model and getting the first output. This results in a
+  distribution over the 9 possible classes for each token. We take the argmax to retrieve the most likely class
+  for each token.
+- Zip together each token with its prediction and print it.
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> from transformers import AutoModelForTokenClassification, AutoTokenizer
+    >>> import torch
+
+    >>> model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
+    >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+
+    >>> label_list = [
+    ...     "O",       # Outside of a named entity
+    ...     "B-MISC",  # Beginning of a miscellaneous entity right after another miscellaneous entity
+    ...     "I-MISC",  # Miscellaneous entity
+    ...     "B-PER",   # Beginning of a person's name right after another person's name
+    ...     "I-PER",   # Person's name
+    ...     "B-ORG",   # Beginning of an organisation right after another organisation
+    ...     "I-ORG",   # Organisation
+    ...     "B-LOC",   # Beginning of a location right after another location
+    ...     "I-LOC"    # Location
+    ... ]
+
+    >>> sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very" \
+    ...            "close to the Manhattan Bridge."
+
+    >>> # Bit of a hack to get the tokens with the special tokens
+    >>> tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(sequence)))
+    >>> inputs = tokenizer.encode(sequence, return_tensors="pt")
+
+    >>> outputs = model(inputs)[0]
+    >>> predictions = torch.argmax(outputs, dim=2)
+    >>> ## TENSORFLOW CODE
+    >>> from transformers import TFAutoModelForTokenClassification, AutoTokenizer
+    >>> import tensorflow as tf
+
+    >>> model = TFAutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
+    >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+
+    >>> label_list = [
+    ...     "O",       # Outside of a named entity
+    ...     "B-MISC",  # Beginning of a miscellaneous entity right after another miscellaneous entity
+    ...     "I-MISC",  # Miscellaneous entity
+    ...     "B-PER",   # Beginning of a person's name right after another person's name
+    ...     "I-PER",   # Person's name
+    ...     "B-ORG",   # Beginning of an organisation right after another organisation
+    ...     "I-ORG",   # Organisation
+    ...     "B-LOC",   # Beginning of a location right after another location
+    ...     "I-LOC"    # Location
+    ... ]
+
+    >>> sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very" \
+    ...            "close to the Manhattan Bridge."
+
+    >>> # Bit of a hack to get the tokens with the special tokens
+    >>> tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(sequence)))
+    >>> inputs = tokenizer.encode(sequence, return_tensors="tf")
+
+    >>> outputs = model(inputs)[0]
+    >>> predictions = tf.argmax(outputs, axis=2)
+
+
+This outputs a list of each token mapped to their prediction. Differently from the pipeline, here every token has
+a prediction as we didn't remove the "O" class which means that no particular entity was found on that token. The
+following array should be the output:
+
+.. code-block::
+
+    >>> print([(token, label_list[prediction]) for token, prediction in zip(tokens, predictions[0].numpy())])
+    [('[CLS]', 'O'), ('Hu', 'I-ORG'), ('##gging', 'I-ORG'), ('Face', 'I-ORG'), ('Inc', 'I-ORG'), ('.', 'O'), ('is', 'O'), ('a', 'O'), ('company', 'O'), ('based', 'O'), ('in', 'O'), ('New', 'I-LOC'), ('York', 'I-LOC'), ('City', 'I-LOC'), ('.', 'O'), ('Its', 'O'), ('headquarters', 'O'), ('are', 'O'), ('in', 'O'), ('D', 'I-LOC'), ('##UM', 'I-LOC'), ('##BO', 'I-LOC'), (',', 'O'), ('therefore', 'O'), ('very', 'O'), ('##c', 'O'), ('##lose', 'O'), ('to', 'O'), ('the', 'O'), ('Manhattan', 'I-LOC'), ('Bridge', 'I-LOC'), ('.', 'O'), ('[SEP]', 'O')]
+
+Summarization
+----------------------------------------------------
+
+Summarization is the task of summarizing a text / an article into a shorter text.
+
+An example of a summarization dataset is the CNN / Daily Mail dataset, which consists of long news articles and was created for the task of summarization.
+If you would like to fine-tune a model on a summarization task, you may leverage the ``examples/summarization/bart/run_train.sh`` (leveraging pytorch-lightning) script.
+
+Here is an example using the pipelines to do summarization.
+It leverages a Bart model that was fine-tuned on the CNN / Daily Mail data set.
+
+.. code-block::
+
+    >>> from transformers import pipeline
+
+    >>> summarizer = pipeline("summarization")
+
+    >>> ARTICLE = """ New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York.
+    ... A year later, she got married again in Westchester County, but to a different man and without divorcing her first husband.
+    ... Only 18 days after that marriage, she got hitched yet again. Then, Barrientos declared "I do" five more times, sometimes only within two weeks of each other.
+    ... In 2010, she married once more, this time in the Bronx. In an application for a marriage license, she stated it was her "first and only" marriage.
+    ... Barrientos, now 39, is facing two criminal counts of "offering a false instrument for filing in the first degree," referring to her false statements on the
+    ... 2010 marriage license application, according to court documents.
+    ... Prosecutors said the marriages were part of an immigration scam.
+    ... On Friday, she pleaded not guilty at State Supreme Court in the Bronx, according to her attorney, Christopher Wright, who declined to comment further.
+    ... After leaving court, Barrientos was arrested and charged with theft of service and criminal trespass for allegedly sneaking into the New York subway through an emergency exit, said Detective
+    ... Annette Markowski, a police spokeswoman. In total, Barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002.
+    ... All occurred either in Westchester County, Long Island, New Jersey or the Bronx. She is believed to still be married to four men, and at one time, she was married to eight men at once, prosecutors say.
+    ... Prosecutors said the immigration scam involved some of her husbands, who filed for permanent residence status shortly after the marriages.
+    ... Any divorces happened only after such filings were approved. It was unclear whether any of the men will be prosecuted.
+    ... The case was referred to the Bronx District Attorney\'s Office by Immigration and Customs Enforcement and the Department of Homeland Security\'s
+    ... Investigation Division. Seven of the men are from so-called "red-flagged" countries, including Egypt, Turkey, Georgia, Pakistan and Mali.
+    ... Her eighth husband, Rashid Rajput, was deported in 2006 to his native Pakistan after an investigation by the Joint Terrorism Task Force.
+    ... If convicted, Barrientos faces up to four years in prison.  Her next court appearance is scheduled for May 18.
+    ... """
+
+Because the summarization pipeline depends on the ``PreTrainedModel.generate()`` method, we can override the default arguments
+of ``PreTrainedModel.generate()`` directly in the pipeline as is shown for ``max_length`` and ``min_length`` below.
+This outputs the following summary:
+
+.. code-block::
+
+    >>> print(summarizer(ARTICLE, max_length=130, min_length=30, do_sample=False))
+    [{'summary_text': 'Liana Barrientos, 39, is charged with two counts of "offering a false instrument for filing in the first degree" In total, she has been married 10 times, with nine of her marriages occurring between 1999 and 2002. She is believed to still be married to four men.'}]
+
+Here is an example doing summarization using a model and a tokenizer. The process is the following:
+
+- Instantiate a tokenizer and a model from the checkpoint name. Summarization is usually done using an encoder-decoder model, such as ``Bart`` or ``T5``.
+- Define the article that should be summarized.
+- Leverage the ``PreTrainedModel.generate()`` method.
+- Add the T5 specific prefix "summarize: ".
+
+Here Google's T5 model is used that was only pre-trained on a multi-task mixed data set (including CNN / Daily Mail), but nevertheless yields very good results.
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> from transformers import AutoModelWithLMHead, AutoTokenizer
+
+    >>> model = AutoModelWithLMHead.from_pretrained("t5-base")
+    >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")
+
+    >>> # T5 uses a max_length of 512 so we cut the article to 512 tokens.
+    >>> inputs = tokenizer.encode("summarize: " + ARTICLE, return_tensors="pt", max_length=512)
+    >>> outputs = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
+    >>> ## TENSORFLOW CODE
+    >>> from transformers import TFAutoModelWithLMHead, AutoTokenizer
+
+    >>> model = TFAutoModelWithLMHead.from_pretrained("t5-base")
+    >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")
+
+    >>> # T5 uses a max_length of 512 so we cut the article to 512 tokens.
+    >>> inputs = tokenizer.encode("summarize: " + ARTICLE, return_tensors="tf", max_length=512)
+    >>> outputs = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
+
+Translation
+----------------------------------------------------
+
+Translation is the task of translating a text from one language to another.
+
+An example of a translation dataset is the WMT English to German dataset, which has English sentences as the input data 
+and German sentences as the target data.
+
+Here is an example using the pipelines to do translation.
+It leverages a T5 model that was only pre-trained on a multi-task mixture dataset (including WMT), but yields impressive 
+translation results nevertheless.
+
+.. code-block::
+
+    >>> from transformers import pipeline
+
+    >>> translator = pipeline("translation_en_to_de")
+    >>> print(translator("Hugging Face is a technology company based in New York and Paris", max_length=40))
+    [{'translation_text': 'Hugging Face ist ein Technologieunternehmen mit Sitz in New York und Paris.'}]
+
+Because the translation pipeline depends on the ``PreTrainedModel.generate()`` method, we can override the default arguments
+of ``PreTrainedModel.generate()`` directly in the pipeline as is shown for ``max_length`` above.
+This outputs the following translation into German:
+
+::
+
+  Hugging Face ist ein Technologieunternehmen mit Sitz in New York und Paris.
+  
+Here is an example doing translation using a model and a tokenizer. The process is the following:
+
+- Instantiate a tokenizer and a model from the checkpoint name. Translation is usually done using an encoder-decoder model, such as ``Bart`` or ``T5``.
+- Define the sentence that should be translated.
+- Leverage the ``PreTrainedModel.generate()`` method.
+- Add the T5 specific prefix "translate English to German: "
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> from transformers import AutoModelWithLMHead, AutoTokenizer
+
+    >>> model = AutoModelWithLMHead.from_pretrained("t5-base")
+    >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")
+
+    >>> inputs = tokenizer.encode("translate English to German: Hugging Face is a technology company based in New York and Paris", return_tensors="pt")
+    >>> outputs = model.generate(inputs, max_length=40, num_beams=4, early_stopping=True)
+
+    >>> print(outputs)
+    tensor([[    0, 11560,  3896,  8881,   229,   236,     3, 14366, 15377,   181,
+             11216,    16,   368,  1060,    64,  1919,     5]])
+    >>> ## TENSORFLOW CODE
+    >>> from transformers import TFAutoModelWithLMHead, AutoTokenizer
+
+    >>> model = TFAutoModelWithLMHead.from_pretrained("t5-base")
+    >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")
+
+    >>> inputs = tokenizer.encode("translate English to German: Hugging Face is a technology company based in New York and Paris", return_tensors="tf")
+    >>> outputs = model.generate(inputs, max_length=40, num_beams=4, early_stopping=True)
+
+    >>> print(outputs)
+    tf.Tensor(
+    [[    0 11560  3896  8881   229   236     3 14366 15377   181 11216    16
+        368  1060    64  1919     5]], shape=(1, 17), dtype=int32)
--- a/docs/source/torchscript.rst
+++ b/docs/source/torchscript.rst
@@ -12,7 +12,7 @@ According to Pytorch's documentation: "TorchScript is a way to create serializab
 Pytorch's two modules `JIT and TRACE <https://pytorch.org/docs/stable/jit.html>`_ allow the developer to export
 their model to be re-used in other programs, such as efficiency-oriented C++ programs.

-We have provided an interface that allows the export of `transformers` models to TorchScript so that they can
+We have provided an interface that allows the export of 🤗 Transformers models to TorchScript so that they can
 be reused in a different environment than a Pytorch-based python program. Here we explain how to use our models so that
 they can be exported, and what to be mindful of when using these models with TorchScript.

--- a/docs/source/training.rst
+++ b/docs/source/training.rst
@@ -0,0 +1,323 @@
+Training and fine-tuning
+========================
+
+Model classes in 🤗 Transformers are designed to be compatible with native
+PyTorch and TensorFlow 2 and can be used seamlessly with either. In this
+quickstart, we will show how to fine-tune (or train from scratch) a model
+using the standard training tools available in either framework. We will also
+show how to use our included :func:`~transformers.Trainer` class which
+handles much of the complexity of training for you.
+
+This guide assumes that you are already familiar with loading and using our
+models for inference; otherwise, see the :doc:`task summary <task_summary>`. We also assume
+that you are familiar with training deep neural networks in either PyTorch or
+TF2, and focus specifically on the nuances and tools for training models in
+🤗 Transformers.
+
+Sections:
+
+  * :ref:`pytorch`
+  * :ref:`tensorflow`
+  * :ref:`trainer`
+  * :ref:`additional-resources`
+
+.. _pytorch:
+
+Fine-tuning in native PyTorch
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Model classes in 🤗 Transformers that don't begin with ``TF`` are
+`PyTorch Modules <https://pytorch.org/docs/master/generated/torch.nn.Module.html>`_,
+meaning that you can use them just as you would any model in PyTorch for
+both inference and optimization.
+
+Let's consider the common task of fine-tuning a masked language model like
+BERT on a sequence classification dataset. When we instantiate a model with
+:func:`~transformers.PreTrainedModel.from_pretrained`, the model
+configuration and pre-trained weights
+of the specified model are used to initialize the model. The
+library also includes a number of task-specific final layers or 'heads' whose
+weights are instantiated randomly when not present in the specified
+pre-trained model. For example, instantiating a model with
+``BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)``
+will create a BERT model instance with encoder weights copied from the
+``bert-base-uncased`` model and a randomly initialized sequence
+classification head on top of the encoder with an output size of 2. Models
+are initialized in ``eval`` mode by default. We can call ``model.train()`` to
+put it in train mode.
+
+.. code-block:: python
+
+    from transformers import BertForSequenceClassification
+    model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
+    model.train()
+
+This is useful because it allows us to make use of the pre-trained BERT
+encoder and easily train it on whatever sequence classification dataset we
+choose. We can use any PyTorch optimizer, but our library also provides the
+:func:`~transformers.AdamW` optimizer which implements gradient bias
+correction as well as weight decay.
+
+.. code-block:: python
+
+    from transformers import AdamW
+    optimizer = AdamW(model.parameters(), lr=1e-5)
+
+The optimizer allows us to apply different hyperparameters for specific
+parameter groups. For example, we can apply weight decay to all parameters
+other than bias and layer normalization terms:
+
+.. code-block:: python
+
+    no_decay = ['bias', 'LayerNorm.weight']
+    optimizer_grouped_parameters = [
+        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
+        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+    ]
+    optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5)
+    
+Now we can set up a simple dummy training batch using
+:func:`~transformers.PreTrainedTokenizer.__call__`. This returns a
+:func:`~transformers.BatchEncoding` instance which
+prepares everything we might need to pass to the model.
+
+.. code-block:: python
+
+    from transformers import BertTokenizer
+    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+    text_batch = ["I love Pixar.", "I don't care for Pixar."]
+    encoding = tokenizer(text_batch, return_tensors='pt', padding=True, truncation=True)
+    input_ids = encoding['input_ids']
+    attention_mask = encoding['attention_mask']
+
+When we call a classification model with the ``labels`` argument, the first
+returned element is the Cross Entropy loss between the predictions and the
+passed labels. Having already set up our optimizer, we can then do a
+backwards pass and update the weights:
+
+.. code-block:: python
+
+    labels = torch.tensor([1,0]).unsqueeze(0)
+    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
+    loss = outputs[0]
+    loss.backward()
+    optimizer.step()
+
+Alternatively, you can just get the logits and calculate the loss yourself.
+The following is equivalent to the previous example:
+
+.. code-block:: python
+
+    from torch.nn import functional as F
+    labels = torch.tensor([1,0]).unsqueeze(0)
+    outputs = model(input_ids, attention_mask=attention_mask)
+    loss = F.cross_entropy(labels, outputs[0])
+    loss.backward()
+    optimizer.step()
+
+Of course, you can train on GPU by calling ``to('cuda')`` on the model and
+inputs as usual.
+
+We also provide a few learning rate scheduling tools. With the following, we
+can set up a scheduler which warms up for ``num_warmup_steps`` and then
+linearly decays to 0 by the end of training.
+
+.. code-block:: python
+
+    from transformers import get_linear_schedule_with_warmup
+    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_train_steps)
+
+Then all we have to do is call ``scheduler.step()`` after ``optimizer.step()``.
+
+.. code-block:: python
+
+    ...
+    loss.backward()
+    optimizer.step()
+    scheduler.step()
+
+We highly recommend using :class:`~transformers.Trainer`, discussed below,
+which conveniently handles the moving parts of training 🤗 Transformers models
+with features like mixed precision and easy tensorboard logging.
+
+
+Freezing the encoder
+--------------------
+
+In some cases, you might be interested in keeping the weights of the
+pre-trained encoder frozen and optimizing only the weights of the head
+layers. To do so, simply set the ``requires_grad`` attribute to ``False`` on
+the encoder parameters, which can be accessed with the ``base_model``
+submodule on any task-specific model in the library:
+
+.. code-block:: python
+   
+    for param in model.base_model.parameters():
+        param.requires_grad = False
+
+
+.. _tensorflow:
+
+Fine-tuning in native TensorFlow 2
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Models can also be trained natively in TensorFlow 2. Just as with PyTorch,
+TensorFlow models can be instantiated with
+:func:`~transformers.TFPreTrainedModel.from_pretrained` to load the weights of
+the encoder from a pretrained model.
+
+.. code-block:: python
+
+    from transformers import TFBertForSequenceClassification
+    model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')
+
+Let's use ``tensorflow_datasets`` to load in the `MRPC dataset
+<https://www.tensorflow.org/datasets/catalog/glue#gluemrpc>`_ from GLUE. We
+can then use our built-in
+:func:`~transformers.data.processors.glue.glue_convert_examples_to_features`
+to tokenize MRPC and convert it to a TensorFlow ``Dataset`` object. Note that
+tokenizers are framework-agnostic, so there is no need to prepend ``TF`` to
+the pretrained tokenizer name.
+
+.. code-block:: python
+
+    from transformers import BertTokenizer, glue_convert_examples_to_features
+    import tensorflow_datasets as tfds
+    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+    data = tfds.load('glue/mrpc')
+    train_dataset = glue_convert_examples_to_features(data['train'], tokenizer, max_length=128, task='mrpc')
+    train_dataset = train_dataset.shuffle(100).batch(32).repeat(2)
+
+The model can then be compiled and trained as any Keras model:
+
+.. code-block:: python
+    
+    optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
+    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
+    model.compile(optimizer=optimizer, loss=loss)
+    model.fit(train_dataset, epochs=2, steps_per_epoch=115)
+
+With the tight interoperability between TensorFlow and PyTorch models, you
+can even save the model and then reload it as a PyTorch model (or vice-versa):
+
+.. code-block:: python
+
+    from transformers import BertForSequenceClassification
+    model.save_pretrained('./my_mrpc_model/')
+    pytorch_model = BertForSequenceClassification.from_pretrained('./my_mrpc_model/', from_tf=True)
+
+
+.. _trainer:
+
+Trainer
+^^^^^^^
+
+We also provide a simple but feature-complete training and evaluation
+interface through :class:`~transformers.Trainer` and
+:class:`~transformers.TFTrainer`. You can train, fine-tune,
+and evaluate any 🤗 Transformers model with a wide range of training options and
+with built-in features like logging, gradient accumulation, and mixed
+precision.
+
+.. code-block:: python
+
+    ## PYTORCH CODE
+    from transformers import BertForSequenceClassification, Trainer, TrainingArguments
+
+    model = BertForSequenceClassification.from_pretrained("bert-large-uncased")
+
+    training_args = TrainingArguments(
+        output_dir='./results',          # output directory
+        num_train_epochs=3,              # total # of training epochs
+        per_device_train_batch_size=16,  # batch size per device during training
+        per_device_eval_batch_size=64,   # batch size for evaluation
+        warmup_steps=500,                # number of warmup steps for learning rate scheduler
+        weight_decay=0.01,               # strength of weight decay
+        logging_dir='./logs',            # directory for storing logs
+    )
+
+    trainer = Trainer(
+        model=model,                         # the instantiated 🤗 Transformers model to be trained
+        args=training_args,                  # training arguments, defined above
+        train_dataset=train_dataset,         # training dataset
+        eval_dataset=test_dataset            # evaluation dataset
+    )
+    ## TENSORFLOW CODE
+    from transformers import TFBertForSequenceClassification, TFTrainer, TFTrainingArguments
+
+    model = TFBertForSequenceClassification.from_pretrained("bert-large-uncased")
+
+    training_args = TFTrainingArguments(
+        output_dir='./results',          # output directory
+        num_train_epochs=3,              # total # of training epochs
+        per_device_train_batch_size=16,  # batch size per device during training
+        per_device_eval_batch_size=64,   # batch size for evaluation
+        warmup_steps=500,                # number of warmup steps for learning rate scheduler
+        weight_decay=0.01,               # strength of weight decay
+        logging_dir='./logs',            # directory for storing logs
+    )
+
+    trainer = TFTrainer(
+        model=model,                         # the instantiated 🤗 Transformers model to be trained
+        args=training_args,                  # training arguments, defined above
+        train_dataset=tfds_train_dataset,    # tensorflow_datasets training dataset
+        eval_dataset=tfds_test_dataset       # tensorflow_datasets evaluation dataset
+    )
+
+Now simply call ``trainer.train()`` to train and ``trainer.evaluate()`` to
+evaluate. You can use your own module as well, but the first
+element returned from ``forward`` must be the loss which you wish to
+optimize.
+
+:class:`~transformers.Trainer` uses a built-in default function to collate
+batches and prepare them to be fed into the model. If needed, you can also
+use the ``data_collator`` argument to pass your own collator function which
+takes in the data in the format provided by your dataset and returns a
+batch ready to be fed into the model. Note that
+:class:`~transformers.TFTrainer` expects the passed datasets to be dataset
+objects from ``tensorflow_datasets``.
+
+To calculate metrics in addition to the loss, you can also define
+your own ``compute_metrics`` function and pass it to the trainer.
+
+.. code-block:: python
+
+    from sklearn.metrics import accuracy_score, precision_recall_fscore_support
+
+    def compute_metrics(pred):
+        labels = pred.label_ids
+        preds = pred.predictions.argmax(-1)
+        precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
+        acc = accuracy_score(labels, preds)
+        return {
+            'accuracy': acc,
+            'f1': f1,
+            'precision': precision,
+            'recall': recall
+        }
+
+Finally, you can view the results, including any calculated metrics, by
+launching tensorboard in your specified ``logging_dir`` directory.
+
+
+.. _additional-resources:
+
+Additional resources
+^^^^^^^^^^^^^^^^^^^^
+
+    * `A lightweight colab demo
+      <https://colab.research.google.com/drive/1-JIJlao4dI-Ilww_NnTc0rxtp-ymgDgM?usp=sharing>`_
+      which uses ``Trainer`` for IMDb sentiment classification.
+
+    * `🤗 Transformers Examples <https://github.com/huggingface/transformers/tree/master/examples>`_
+      including scripts for training and fine-tuning on GLUE, SQuAD, and
+      several other tasks.
+
+    * `How to train a language model
+      <https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb>`_,
+      a detailed colab notebook which uses ``Trainer`` to train a masked
+      language model from scratch on Esperanto.
+
+    * `🤗 Transformers Notebooks <./notebooks.html>`_ which contain dozens
+      of example notebooks from the community for training and using
+      🤗 Transformers on a variety of tasks.
--- a/docs/source/usage.rst
+++ b/docs/source/usage.rst
@@ -1,597 +0,0 @@
-Usage
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-This page shows the most frequent use-cases when using the library. The models available allow for many different
-configurations and a great versatility in use-cases. The most simple ones are presented here, showcasing usage
-for tasks such as question answering, sequence classification, named entity recognition and others.
-
-These examples leverage auto-models, which are classes that will instantiate a model according to a given checkpoint,
-automatically selecting the correct model architecture. Please check the :class:`~transformers.AutoModel` documentation
-for more information.
-Feel free to modify the code to be more specific and adapt it to your specific use-case.
-
-In order for a model to perform well on a task, it must be loaded from a checkpoint corresponding to that task. These
-checkpoints are usually pre-trained on a large corpus of data and fine-tuned on a specific task. This means the
-following:
-
- Not all models were fine-tuned on all tasks. If you want to fine-tune a model on a specific task, you can leverage
-  one of the `run_$TASK.py` script in the
-  `examples <https://github.com/huggingface/transformers/tree/master/examples>`_ directory.
- Fine-tuned models were fine-tuned on a specific dataset. This dataset may or may not overlap with your use-case
-  and domain. As mentioned previously, you may leverage the
-  `examples <https://github.com/huggingface/transformers/tree/master/examples>`_ scripts to fine-tune your model, or you
-  may create your own training script.
-
-In order to do an inference on a task, several mechanisms are made available by the library:
-
- Pipelines: very easy-to-use abstractions, which require as little as two lines of code.
- Using a model directly with a tokenizer (PyTorch/TensorFlow): the full inference using the model. Less abstraction,
-  but much more powerful.
-
-Both approaches are showcased here.
-
-.. note::
-
-    All tasks presented here leverage pre-trained checkpoints that were fine-tuned on specific tasks. Loading a
-    checkpoint that was not fine-tuned on a specific task would load only the base transformer layers and not the
-    additional head that is used for the task, initializing the weights of that head randomly.
-
-    This would produce random output.
-
-Sequence Classification
--------------------------
-
-Sequence classification is the task of classifying sequences according to a given number of classes. An example
-of sequence classification is the GLUE dataset, which is entirely based on that task. If you would like to fine-tune
-a model on a GLUE sequence classification task, you may leverage the
-`run_glue.py <https://github.com/huggingface/transformers/tree/master/examples/run_glue.py>`_ or
-`run_tf_glue.py <https://github.com/huggingface/transformers/tree/master/examples/run_tf_glue.py>`_ scripts.
-
-Here is an example using the pipelines do to sentiment analysis: identifying if a sequence is positive or negative.
-It leverages a fine-tuned model on sst2, which is a GLUE task.
-
-::
-
-    from transformers import pipeline
-
-    nlp = pipeline("sentiment-analysis")
-
-    print(nlp("I hate you"))
-    print(nlp("I love you"))
-
-This returns a label ("POSITIVE" or "NEGATIVE") alongside a score, as follows:
-
-::
-
-    [{'label': 'NEGATIVE', 'score': 0.9991129}]
-    [{'label': 'POSITIVE', 'score': 0.99986565}]
-
-
-Here is an example of doing a sequence classification using a model to determine if two sequences are paraphrases
-of each other. The process is the following:
-
- Instantiate a tokenizer and a model from the checkpoint name. The model is identified as a BERT model and loads it
-  with the weights stored in the checkpoint.
- Build a sequence from the two sentences, with the correct model-specific separators token type ids
-  and attention masks (:func:`~transformers.PreTrainedTokenizer.encode` and
-  :func:`~transformers.PreTrainedTokenizer.encode_plus` take care of this)
- Pass this sequence through the model so that it is classified in one of the two available classes: 0
-  (not a paraphrase) and 1 (is a paraphrase)
- Compute the softmax of the result to get probabilities over the classes
- Print the results
-
-::
-
-    ## PYTORCH CODE
-    from transformers import AutoTokenizer, AutoModelForSequenceClassification
-    import torch
-
-    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
-    model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc")
-
-    classes = ["not paraphrase", "is paraphrase"]
-
-    sequence_0 = "The company HuggingFace is based in New York City"
-    sequence_1 = "Apples are especially bad for your health"
-    sequence_2 = "HuggingFace's headquarters are situated in Manhattan"
-
-    paraphrase = tokenizer.encode_plus(sequence_0, sequence_2, return_tensors="pt")
-    not_paraphrase = tokenizer.encode_plus(sequence_0, sequence_1, return_tensors="pt")
-
-    paraphrase_classification_logits = model(**paraphrase)[0]
-    not_paraphrase_classification_logits = model(**not_paraphrase)[0]
-
-    paraphrase_results = torch.softmax(paraphrase_classification_logits, dim=1).tolist()[0]
-    not_paraphrase_results = torch.softmax(not_paraphrase_classification_logits, dim=1).tolist()[0]
-
-    print("Should be paraphrase")
-    for i in range(len(classes)):
-        print(f"{classes[i]}: {round(paraphrase_results[i] * 100)}%")
-
-    print("\nShould not be paraphrase")
-    for i in range(len(classes)):
-        print(f"{classes[i]}: {round(not_paraphrase_results[i] * 100)}%")
-    ## TENSORFLOW CODE
-    from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
-    import tensorflow as tf
-
-    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
-    model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc")
-
-    classes = ["not paraphrase", "is paraphrase"]
-
-    sequence_0 = "The company HuggingFace is based in New York City"
-    sequence_1 = "Apples are especially bad for your health"
-    sequence_2 = "HuggingFace's headquarters are situated in Manhattan"
-
-    paraphrase = tokenizer.encode_plus(sequence_0, sequence_2, return_tensors="tf")
-    not_paraphrase = tokenizer.encode_plus(sequence_0, sequence_1, return_tensors="tf")
-
-    paraphrase_classification_logits = model(paraphrase)[0]
-    not_paraphrase_classification_logits = model(not_paraphrase)[0]
-
-    paraphrase_results = tf.nn.softmax(paraphrase_classification_logits, axis=1).numpy()[0]
-    not_paraphrase_results = tf.nn.softmax(not_paraphrase_classification_logits, axis=1).numpy()[0]
-
-    print("Should be paraphrase")
-    for i in range(len(classes)):
-        print(f"{classes[i]}: {round(paraphrase_results[i] * 100)}%")
-
-    print("\nShould not be paraphrase")
-    for i in range(len(classes)):
-        print(f"{classes[i]}: {round(not_paraphrase_results[i] * 100)}%")
-
-This outputs the following results:
-
-::
-
-    Should be paraphrase
-    not paraphrase: 10%
-    is paraphrase: 90%
-
-    Should not be paraphrase
-    not paraphrase: 94%
-    is paraphrase: 6%
-
-Extractive Question Answering
----------------------------------------------------
-
-Extractive Question Answering is the task of extracting an answer from a text given a question. An example of a
-question answering dataset is the SQuAD dataset, which is entirely based on that task. If you would like to fine-tune
-a model on a SQuAD task, you may leverage the `run_squad.py`.
-
-Here is an example using the pipelines do to question answering: extracting an answer from a text given a question.
-It leverages a fine-tuned model on SQuAD.
-
-::
-
-    from transformers import pipeline
-
-    nlp = pipeline("question-answering")
-
-    context = r"""
-    Extractive Question Answering is the task of extracting an answer from a text given a question. An example of a
-    question answering dataset is the SQuAD dataset, which is entirely based on that task. If you would like to fine-tune
-    a model on a SQuAD task, you may leverage the `run_squad.py`.
-    """
-
-    print(nlp(question="What is extractive question answering?", context=context))
-    print(nlp(question="What is a good example of a question answering dataset?", context=context))
-
-This returns an answer extracted from the text, a confidence score, alongside "start" and "end" values which
-are the positions of the extracted answer in the text.
-
-::
-
-    {'score': 0.622232091629833, 'start': 34, 'end': 96, 'answer': 'the task of extracting an answer from a text given a question.'}
-    {'score': 0.5115299158662765, 'start': 147, 'end': 161, 'answer': 'SQuAD dataset,'}
-
-
-Here is an example of question answering using a model and a tokenizer. The process is the following:
-
- Instantiate a tokenizer and a model from the checkpoint name. The model is identified as a BERT model and loads it
-  with the weights stored in the checkpoint.
- Define a text and a few questions.
- Iterate over the questions and build a sequence from the text and the current question, with the correct
-  model-specific separators token type ids and attention masks
- Pass this sequence through the model. This outputs a range of scores across the entire sequence tokens (question and
-  text), for both the start and end positions.
- Compute the softmax of the result to get probabilities over the tokens
- Fetch the tokens from the identified start and stop values, convert those tokens to a string.
- Print the results
-
-::
-
-    ## PYTORCH CODE
-    from transformers import AutoTokenizer, AutoModelForQuestionAnswering
-    import torch
-
-    tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
-    model = AutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
-
-    text = r"""
-    🤗 Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose
-    architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural
-    Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between
-    TensorFlow 2.0 and PyTorch.
-    """
-
-    questions = [
-        "How many pretrained models are available in Transformers?",
-        "What does Transformers provide?",
-        "Transformers provides interoperability between which frameworks?",
-    ]
-
-    for question in questions:
-        inputs = tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors="pt")
-        input_ids = inputs["input_ids"].tolist()[0]
-
-        text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
-        answer_start_scores, answer_end_scores = model(**inputs)
-
-        answer_start = torch.argmax(
-            answer_start_scores
-        )  # Get the most likely beginning of answer with the argmax of the score
-        answer_end = torch.argmax(answer_end_scores) + 1  # Get the most likely end of answer with the argmax of the score
-
-        answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
-
-        print(f"Question: {question}")
-        print(f"Answer: {answer}\n")
-    ## TENSORFLOW CODE
-    from transformers import AutoTokenizer, TFAutoModelForQuestionAnswering
-    import tensorflow as tf
-
-    tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
-    model = TFAutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
-
-    text = r"""
-    🤗 Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose
-    architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural
-    Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between
-    TensorFlow 2.0 and PyTorch.
-    """
-
-    questions = [
-        "How many pretrained models are available in Transformers?",
-        "What does Transformers provide?",
-        "Transformers provides interoperability between which frameworks?",
-    ]
-
-    for question in questions:
-        inputs = tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors="tf")
-        input_ids = inputs["input_ids"].numpy()[0]
-
-        text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
-        answer_start_scores, answer_end_scores = model(inputs)
-
-        answer_start = tf.argmax(
-            answer_start_scores, axis=1
-        ).numpy()[0]  # Get the most likely beginning of answer with the argmax of the score
-        answer_end = (
-            tf.argmax(answer_end_scores, axis=1) + 1
-        ).numpy()[0]  # Get the most likely end of answer with the argmax of the score
-        answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
-
-        print(f"Question: {question}")
-        print(f"Answer: {answer}\n")
-
-This outputs the questions followed by the predicted answers:
-
-::
-
-    Question: How many pretrained models are available in Transformers?
-    Answer: over 32 +
-
-    Question: What does Transformers provide?
-    Answer: general - purpose architectures
-
-    Question: Transformers provides interoperability between which frameworks?
-    Answer: tensorflow 2 . 0 and pytorch
-
-
-
-Language Modeling
----------------------------------------------------
-
-Language modeling is the task of fitting a model to a corpus, which can be domain specific. All popular transformer
-based models are trained using a variant of language modeling, e.g. BERT with masked language modeling, GPT-2 with
-causal language modeling.
-
-Language modeling can be useful outside of pre-training as well, for example to shift the model distribution to be
-domain-specific: using a language model trained over a very large corpus, and then fine-tuning it to a news dataset
-or on scientific papers e.g. `LysandreJik/arxiv-nlp <https://huggingface.co/lysandre/arxiv-nlp>`__.
-
-Masked Language Modeling
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Masked language modeling is the task of masking tokens in a sequence with a masking token, and prompting the model to
-fill that mask with an appropriate token. This allows the model to attend to both the right context (tokens on the
-right of the mask) and the left context (tokens on the left of the mask). Such a training creates a strong basis
-for downstream tasks requiring bi-directional context such as SQuAD (question answering,
-see `Lewis, Lui, Goyal et al. <https://arxiv.org/abs/1910.13461>`__, part 4.2).
-
-Here is an example of using pipelines to replace a mask from a sequence:
-
-::
-
-    from transformers import pipeline
-
-    nlp = pipeline("fill-mask")
-    print(nlp(f"HuggingFace is creating a {nlp.tokenizer.mask_token} that the community uses to solve NLP tasks."))
-
-This outputs the sequences with the mask filled, the confidence score as well as the token id in the tokenizer
-vocabulary:
-
-::
-
-    [
-        {'sequence': '<s> HuggingFace is creating a tool that the community uses to solve NLP tasks.</s>', 'score': 0.15627853572368622, 'token': 3944},
-        {'sequence': '<s> HuggingFace is creating a framework that the community uses to solve NLP tasks.</s>', 'score': 0.11690319329500198, 'token': 7208},
-        {'sequence': '<s> HuggingFace is creating a library that the community uses to solve NLP tasks.</s>', 'score': 0.058063216507434845, 'token': 5560},
-        {'sequence': '<s> HuggingFace is creating a database that the community uses to solve NLP tasks.</s>', 'score': 0.04211743175983429, 'token': 8503},
-        {'sequence': '<s> HuggingFace is creating a prototype that the community uses to solve NLP tasks.</s>', 'score': 0.024718601256608963, 'token': 17715}
-    ]
-
-Here is an example doing masked language modeling using a model and a tokenizer. The process is the following:
-
- Instantiate a tokenizer and a model from the checkpoint name. The model is identified as a DistilBERT model and
-  loads it with the weights stored in the checkpoint.
- Define a sequence with a masked token, placing the :obj:`tokenizer.mask_token` instead of a word.
- Encode that sequence into IDs and find the position of the masked token in that list of IDs.
- Retrieve the predictions at the index of the mask token: this tensor has the same size as the vocabulary, and the
-  values are the scores attributed to each token. The model gives higher score to tokens he deems probable in that
-  context.
- Retrieve the top 5 tokens using the PyTorch :obj:`topk` or TensorFlow :obj:`top_k` methods.
- Replace the mask token by the tokens and print the results
-
-::
-
-    ## PYTORCH CODE
-    from transformers import AutoModelWithLMHead, AutoTokenizer
-    import torch
-
-    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
-    model = AutoModelWithLMHead.from_pretrained("distilbert-base-cased")
-
-    sequence = f"Distilled models are smaller than the models they mimic. Using them instead of the large versions would help {tokenizer.mask_token} our carbon footprint."
-
-    input = tokenizer.encode(sequence, return_tensors="pt")
-    mask_token_index = torch.where(input == tokenizer.mask_token_id)[1]
-
-    token_logits = model(input)[0]
-    mask_token_logits = token_logits[0, mask_token_index, :]
-
-    top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
-
-    for token in top_5_tokens:
-        print(sequence.replace(tokenizer.mask_token, tokenizer.decode([token])))
-    ## TENSORFLOW CODE
-    from transformers import TFAutoModelWithLMHead, AutoTokenizer
-    import tensorflow as tf
-
-    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
-    model = TFAutoModelWithLMHead.from_pretrained("distilbert-base-cased")
-
-    sequence = f"Distilled models are smaller than the models they mimic. Using them instead of the large versions would help {tokenizer.mask_token} our carbon footprint."
-
-    input = tokenizer.encode(sequence, return_tensors="tf")
-    mask_token_index = tf.where(input == tokenizer.mask_token_id)[0, 1]
-
-    token_logits = model(input)[0]
-    mask_token_logits = token_logits[0, mask_token_index, :]
-
-    top_5_tokens = tf.math.top_k(mask_token_logits, 5).indices.numpy()
-
-    for token in top_5_tokens:
-        print(sequence.replace(tokenizer.mask_token, tokenizer.decode([token])))
-
-This prints five sequences, with the top 5 tokens predicted by the model:
-
-::
-
-    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help reduce our carbon footprint.
-    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help increase our carbon footprint.
-    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help decrease our carbon footprint.
-    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help offset our carbon footprint.
-    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help improve our carbon footprint.
-
-
-Causal Language Modeling
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Causal language modeling is the task of predicting the token following a sequence of tokens. In this situation, the
-model only attends to the left context (tokens on the left of the mask). Such a training is particularly interesting
-for generation tasks.
-
-There is currently no pipeline to do causal language modeling/generation.
-
-Here is an example using the tokenizer and model. leveraging the :func:`~transformers.PreTrainedModel.generate` method
-to generate the tokens following the initial sequence in PyTorch, and creating a simple loop in TensorFlow.
-
-::
-
-    ## PYTORCH CODE
-    from transformers import AutoModelWithLMHead, AutoTokenizer
-
-    tokenizer = AutoTokenizer.from_pretrained("gpt2")
-    model = AutoModelWithLMHead.from_pretrained("gpt2")
-
-    sequence = f"Hugging Face is based in DUMBO, New York City, and is"
-
-    input = tokenizer.encode(sequence, return_tensors="pt")
-    generated = model.generate(input, max_length=50)
-
-    resulting_string = tokenizer.decode(generated.tolist()[0])
-    print(resulting_string)
-    ## TENSORFLOW CODE
-    from transformers import TFAutoModelWithLMHead, AutoTokenizer
-    import tensorflow as tf
-
-    tokenizer = AutoTokenizer.from_pretrained("gpt2")
-    model = TFAutoModelWithLMHead.from_pretrained("gpt2")
-
-    sequence = f"Hugging Face is based in DUMBO, New York City, and is"
-    generated = tokenizer.encode(sequence)
-
-    for i in range(50):
-        predictions = model(tf.constant([generated]))[0]
-        token = tf.argmax(predictions[0], axis=1)[-1].numpy()
-        generated += [token]
-
-    resulting_string = tokenizer.decode(generated)
-    print(resulting_string)
-
-
-This outputs a (hopefully) coherent string from the original sequence, as the
-:func:`~transformers.PreTrainedModel.generate` samples from a top_p/tok_k distribution:
-
-::
-
-    Hugging Face is based in DUMBO, New York City, and is a live-action TV series based on the novel by John
-    Carpenter, and its producers, David Kustlin and Steve Pichar. The film is directed by!
-
-
-Named Entity Recognition
----------------------------------------------------
-
-Named Entity Recognition (NER) is the task of classifying tokens according to a class, for example identifying a
-token as a person, an organisation or a location.
-An example of a named entity recognition dataset is the CoNLL-2003 dataset, which is entirely based on that task.
-If you would like to fine-tune a model on an NER task, you may leverage the `ner/run_ner.py` (PyTorch),
-`ner/run_pl_ner.py` (leveraging pytorch-lightning) or the `ner/run_tf_ner.py` (TensorFlow) scripts.
-
-Here is an example using the pipelines do to named entity recognition, trying to identify tokens as belonging to one
-of 9 classes:
-
- O, Outside of a named entity
- B-MIS, Beginning of a miscellaneous entity right after another miscellaneous entity
- I-MIS, Miscellaneous entity
- B-PER, Beginning of a person's name right after another person's name
- I-PER, Person's name
- B-ORG, Beginning of an organisation right after another organisation
- I-ORG, Organisation
- B-LOC, Beginning of a location right after another location
- I-LOC, Location
-
-It leverages a fine-tuned model on CoNLL-2003, fine-tuned by `@stefan-it <https://github.com/stefan-it>`__ from
-`dbmdz <https://github.com/dbmdz>`__.
-
-::
-
-    from transformers import pipeline
-
-    nlp = pipeline("ner")
-
-    sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very" \
-               "close to the Manhattan Bridge which is visible from the window."
-
-    print(nlp(sequence))
-
-This outputs a list of all words that have been identified as an entity from the 9 classes defined above. Here is the
-expected results:
-
-::
-
-    [
-        {'word': 'Hu', 'score': 0.9995632767677307, 'entity': 'I-ORG'},
-        {'word': '##gging', 'score': 0.9915938973426819, 'entity': 'I-ORG'},
-        {'word': 'Face', 'score': 0.9982671737670898, 'entity': 'I-ORG'},
-        {'word': 'Inc', 'score': 0.9994403719902039, 'entity': 'I-ORG'},
-        {'word': 'New', 'score': 0.9994346499443054, 'entity': 'I-LOC'},
-        {'word': 'York', 'score': 0.9993270635604858, 'entity': 'I-LOC'},
-        {'word': 'City', 'score': 0.9993864893913269, 'entity': 'I-LOC'},
-        {'word': 'D', 'score': 0.9825621843338013, 'entity': 'I-LOC'},
-        {'word': '##UM', 'score': 0.936983048915863, 'entity': 'I-LOC'},
-        {'word': '##BO', 'score': 0.8987102508544922, 'entity': 'I-LOC'},
-        {'word': 'Manhattan', 'score': 0.9758241176605225, 'entity': 'I-LOC'},
-        {'word': 'Bridge', 'score': 0.990249514579773, 'entity': 'I-LOC'}
-    ]
-
-Note how the words "Hugging Face" have been identified as an organisation, and "New York City", "DUMBO" and
-"Manhattan Bridge" have been identified as locations.
-
-Here is an example doing named entity recognition using a model and a tokenizer. The process is the following:
-
- Instantiate a tokenizer and a model from the checkpoint name. The model is identified as a BERT model and
-  loads it with the weights stored in the checkpoint.
- Define the label list with which the model was trained on.
- Define a sequence with known entities, such as "Hugging Face" as an organisation and "New York City" as a location.
- Split words into tokens so that they can be mapped to the predictions. We use a small hack by firstly completely
-  encoding and decoding the sequence, so that we're left with a string that contains the special tokens.
- Encode that sequence into IDs (special tokens are added automatically).
- Retrieve the predictions by passing the input to the model and getting the first output. This results in a
-  distribution over the 9 possible classes for each token. We take the argmax to retrieve the most likely class
-  for each token.
- Zip together each token with its prediction and print it.
-
-::
-
-    ## PYTORCH CODE
-    from transformers import AutoModelForTokenClassification, AutoTokenizer
-    import torch
-
-    model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
-    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
-
-    label_list = [
-        "O",       # Outside of a named entity
-        "B-MISC",  # Beginning of a miscellaneous entity right after another miscellaneous entity
-        "I-MISC",  # Miscellaneous entity
-        "B-PER",   # Beginning of a person's name right after another person's name
-        "I-PER",   # Person's name
-        "B-ORG",   # Beginning of an organisation right after another organisation
-        "I-ORG",   # Organisation
-        "B-LOC",   # Beginning of a location right after another location
-        "I-LOC"    # Location
-    ]
-
-    sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very" \
-               "close to the Manhattan Bridge."
-
-    # Bit of a hack to get the tokens with the special tokens
-    tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(sequence)))
-    inputs = tokenizer.encode(sequence, return_tensors="pt")
-
-    outputs = model(inputs)[0]
-    predictions = torch.argmax(outputs, dim=2)
-
-    print([(token, label_list[prediction]) for token, prediction in zip(tokens, predictions[0].tolist())])
-    ## TENSORFLOW CODE
-    from transformers import TFAutoModelForTokenClassification, AutoTokenizer
-    import tensorflow as tf
-
-    model = TFAutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
-    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
-
-    label_list = [
-        "O",       # Outside of a named entity
-        "B-MISC",  # Beginning of a miscellaneous entity right after another miscellaneous entity
-        "I-MISC",  # Miscellaneous entity
-        "B-PER",   # Beginning of a person's name right after another person's name
-        "I-PER",   # Person's name
-        "B-ORG",   # Beginning of an organisation right after another organisation
-        "I-ORG",   # Organisation
-        "B-LOC",   # Beginning of a location right after another location
-        "I-LOC"    # Location
-    ]
-
-    sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very" \
-               "close to the Manhattan Bridge."
-
-    # Bit of a hack to get the tokens with the special tokens
-    tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(sequence)))
-    inputs = tokenizer.encode(sequence, return_tensors="tf")
-
-    outputs = model(inputs)[0]
-    predictions = tf.argmax(outputs, axis=2)
-
-    print([(token, label_list[prediction]) for token, prediction in zip(tokens, predictions[0].numpy())])
-
-This outputs a list of each token mapped to their prediction. Differently from the pipeline, here every token has
-a prediction as we didn't remove the "0" class which means that no particular entity was found on that token. The
-following array should be the output:
-
-::
-
-    [('[CLS]', 'O'), ('Hu', 'I-ORG'), ('##gging', 'I-ORG'), ('Face', 'I-ORG'), ('Inc', 'I-ORG'), ('.', 'O'), ('is', 'O'), ('a', 'O'), ('company', 'O'), ('based', 'O'), ('in', 'O'), ('New', 'I-LOC'), ('York', 'I-LOC'), ('City', 'I-LOC'), ('.', 'O'), ('Its', 'O'), ('headquarters', 'O'), ('are', 'O'), ('in', 'O'), ('D', 'I-LOC'), ('##UM', 'I-LOC'), ('##BO', 'I-LOC'), (',', 'O'), ('therefore', 'O'), ('very', 'O'), ('##c', 'O'), ('##lose', 'O'), ('to', 'O'), ('the', 'O'), ('Manhattan', 'I-LOC'), ('Bridge', 'I-LOC'), ('.', 'O'), ('[SEP]', 'O')]
--- a/examples/README.md
+++ b/examples/README.md
@@ -1,10 +1,41 @@
-# Examples
+## Examples

-In this section a few examples are put together. All of these examples work for several models, making use of the very
-similar API between the different models.
+Version 2.9 of 🤗 Transformers introduces a new [`Trainer`](https://github.com/huggingface/transformers/blob/master/src/transformers/trainer.py) class for PyTorch, and its equivalent [`TFTrainer`](https://github.com/huggingface/transformers/blob/master/src/transformers/trainer_tf.py) for TF 2.
+Running the examples requires PyTorch 1.3.1+ or TensorFlow 2.1+.
+
+Here is the list of all our examples:
+- **grouped by task** (all official examples work for multiple models)
+- with information on whether they are **built on top of `Trainer`/`TFTrainer`** (if not, they still work, they might just lack some features),
+- whether they also include examples for **`pytorch-lightning`**, which is a great fully-featured, general-purpose training library for PyTorch,
+- links to **Colab notebooks** to walk through the scripts and run them easily,
+- links to **Cloud deployments** to be able to deploy large-scale trainings in the Cloud with little to no setup.
+
+This is still a work-in-progress – in particular documentation is still sparse – so please **contribute improvements/pull requests.**
+
+
+# The Big Table of Tasks
+
+| Task | Example datasets | Trainer support | TFTrainer support | pytorch-lightning | Colab
+|---|---|:---:|:---:|:---:|:---:|
+| [**`language-modeling`**](https://github.com/huggingface/transformers/tree/master/examples/language-modeling)       | Raw text        | ✅ | -  | -  | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb)
+| [**`text-classification`**](https://github.com/huggingface/transformers/tree/master/examples/text-classification)   | GLUE, XNLI      | ✅ | ✅ | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/trainer/01_text_classification.ipynb)
+| [**`token-classification`**](https://github.com/huggingface/transformers/tree/master/examples/token-classification) | CoNLL NER       | ✅ | ✅ | ✅ | -
+| [**`multiple-choice`**](https://github.com/huggingface/transformers/tree/master/examples/multiple-choice)           | SWAG, RACE, ARC | ✅ | ✅ | -  | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ViktorAlm/notebooks/blob/master/MPC_GPU_Demo_for_TF_and_PT.ipynb)
+| [**`question-answering`**](https://github.com/huggingface/transformers/tree/master/examples/question-answering)     | SQuAD           | -  | ✅ | -  | -
+| [**`text-generation`**](https://github.com/huggingface/transformers/tree/master/examples/text-generation)           | -               | n/a | n/a | n/a | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/02_how_to_generate.ipynb)
+| [**`distillation`**](https://github.com/huggingface/transformers/tree/master/examples/distillation)       | All               | -  | -  | -  | -
+| [**`summarization`**](https://github.com/huggingface/transformers/tree/master/examples/seq2seq)     | CNN/Daily Mail    | -  | -  | ✅  | -
+| [**`translation`**](https://github.com/huggingface/transformers/tree/master/examples/seq2seq)         | WMT               | -  | -  | ✅  | -
+| [**`bertology`**](https://github.com/huggingface/transformers/tree/master/examples/bertology)             | -                 | -  | -  | -  | -
+| [**`adversarial`**](https://github.com/huggingface/transformers/tree/master/examples/adversarial)         | HANS              | ✅ | -  | -  | -
+
+
+<br>
+
+## Important note

 **Important**
-To run the latest versions of the examples, you have to install from source and install some specific requirements for the examples.
+To make sure you can successfully run the latest versions of the example scripts, you have to install the library from source and install some example-specific requirements.
 Execute the following steps in a new virtual environment:

 ```bash
@@ -14,608 +45,36 @@ pip install .
 pip install -r ./examples/requirements.txt
 ```

-| Section                    | Description                                                                                                                                                |
-|----------------------------|------------------------------------------------------------------------------------------------------------------------------------------
-| [TensorFlow 2.0 models on GLUE](#TensorFlow-2.0-Bert-models-on-GLUE) | Examples running BERT TensorFlow 2.0 model on the GLUE tasks. |
-| [Language Model training](#language-model-training) | Fine-tuning (or training from scratch) the library models for language modeling on a text dataset. Causal language modeling for GPT/GPT-2, masked language modeling for BERT/RoBERTa. |
-| [Language Generation](#language-generation) | Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, Transformer-XL and XLNet. |
-| [GLUE](#glue) | Examples running BERT/XLM/XLNet/RoBERTa on the 9 GLUE tasks. Examples feature distributed training as well as half-precision. |
-| [SQuAD](#squad) | Using BERT/RoBERTa/XLNet/XLM for question answering, examples with distributed training. |
-| [Multiple Choice](#multiple-choice) | Examples running BERT/XLNet/RoBERTa on the SWAG/RACE/ARC tasks. |
-| [Named Entity Recognition](https://github.com/huggingface/transformers/tree/master/examples/ner) | Using BERT for Named Entity Recognition (NER) on the CoNLL 2003 dataset, examples with distributed training. |
-| [XNLI](#xnli) | Examples running BERT/XLM on the XNLI benchmark. |
-| [Adversarial evaluation of model performances](#adversarial-evaluation-of-model-performances) | Testing a model with adversarial evaluation of natural language inference on the Heuristic Analysis for NLI Systems (HANS) dataset (McCoy et al., 2019.) |
+## One-click Deploy to Cloud (wip)

-## TensorFlow 2.0 Bert models on GLUE
+#### Azure

-Based on the script [`run_tf_glue.py`](https://github.com/huggingface/transformers/blob/master/examples/run_tf_glue.py).
+[![Deploy to Azure](https://aka.ms/deploytoazurebutton)](https://portal.azure.com/#create/Microsoft.Template/uri/https%3A%2F%2Fraw.githubusercontent.com%2FAzure%2Fazure-quickstart-templates%2Fmaster%2F101-storage-account-create%2Fazuredeploy.json)

-Fine-tuning the library TensorFlow 2.0 Bert model for sequence classification on the  MRPC task of the GLUE benchmark: [General Language Understanding Evaluation](https://gluebenchmark.com/).
+## Running on TPUs

-This script has an option for mixed precision (Automatic Mixed Precision / AMP) to run models on Tensor Cores (NVIDIA Volta/Turing GPUs) and future hardware and an option for XLA, which uses the XLA compiler to reduce model runtime.
-Options are toggled using `USE_XLA` or `USE_AMP` variables in the script.
-These options and the below benchmark are provided by @tlkh.
+When using TensorFlow, TPUs are supported out of the box as a `tf.distribute.Strategy`.

-Quick benchmarks from the script (no other modifications):
+When using PyTorch, we support TPUs thanks to `pytorch/xla`. For more context and information on how to set up your TPU environment, refer to Google's documentation and to the
+very detailed [pytorch/xla README](https://github.com/pytorch/xla/blob/master/README.md).

-| GPU    | Mode | Time (2nd epoch) | Val Acc (3 runs) |
-| --------- | -------- | ----------------------- | ----------------------|
-| Titan V | FP32 | 41s | 0.8438/0.8281/0.8333 |
-| Titan V | AMP | 26s | 0.8281/0.8568/0.8411 |
-| V100    | FP32 | 35s | 0.8646/0.8359/0.8464 |
-| V100    | AMP | 22s | 0.8646/0.8385/0.8411 |
-| 1080 Ti | FP32 | 55s | - |
+In this repo, we provide a very simple launcher script named [xla_spawn.py](https://github.com/huggingface/transformers/tree/master/examples/xla_spawn.py) that lets you run our example scripts on multiple TPU cores without any boilerplate.
+Just pass a `--num_cores` flag to this script, then your regular training script with its arguments (this is similar to the `torch.distributed.launch` helper for torch.distributed).

-Mixed precision (AMP) reduces the training time considerably for the same hardware and hyper-parameters (same batch size was used).
-
-## Language model training
-
-Based on the script [`run_language_modeling.py`](https://github.com/huggingface/transformers/blob/master/examples/run_language_modeling.py).
-
-Fine-tuning (or training from scratch) the library models for language modeling on a text dataset for GPT, GPT-2, BERT and RoBERTa (DistilBERT 
-to be added soon). GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa 
-are fine-tuned using a masked language modeling (MLM) loss.
-
-Before running the following example, you should get a file that contains text on which the language model will be
-trained or fine-tuned. A good example of such text is the [WikiText-2 dataset](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/).
-
-We will refer to two different files: `$TRAIN_FILE`, which contains text for training, and `$TEST_FILE`, which contains
-text that will be used for evaluation.
-
-### GPT-2/GPT and causal language modeling
-
-The following example fine-tunes GPT-2 on WikiText-2. We're using the raw WikiText-2 (no tokens were replaced before
-the tokenization). The loss here is that of causal language modeling.
+For example for `run_glue`:

 ```bash
-export TRAIN_FILE=/path/to/dataset/wiki.train.raw
-export TEST_FILE=/path/to/dataset/wiki.test.raw
-
-python run_language_modeling.py \
-    --output_dir=output \
-    --model_type=gpt2 \
-    --model_name_or_path=gpt2 \
-    --do_train \
-    --train_data_file=$TRAIN_FILE \
-    --do_eval \
-    --eval_data_file=$TEST_FILE
+python examples/xla_spawn.py --num_cores 8 \
+	examples/text-classification/run_glue.py \
+	--model_name_or_path bert-base-cased \
+	--task_name mnli \
+	--data_dir ./data/glue_data/MNLI \
+	--output_dir ./models/tpu \
+	--overwrite_output_dir \
+	--do_train \
+	--do_eval \
+	--num_train_epochs 1 \
+	--save_steps 20000
 ```

-This takes about half an hour to train on a single K80 GPU and about one minute for the evaluation to run. It reaches
-a score of ~20 perplexity once fine-tuned on the dataset.
-
-### RoBERTa/BERT and masked language modeling
-
-The following example fine-tunes RoBERTa on WikiText-2. Here too, we're using the raw WikiText-2. The loss is different
-as BERT/RoBERTa have a bidirectional mechanism; we're therefore using the same loss that was used during their
-pre-training: masked language modeling.
-
-In accordance to the RoBERTa paper, we use dynamic masking rather than static masking. The model may, therefore, converge
-slightly slower (over-fitting takes more epochs).
-
-We use the `--mlm` flag so that the script may change its loss function.
-
-```bash
-export TRAIN_FILE=/path/to/dataset/wiki.train.raw
-export TEST_FILE=/path/to/dataset/wiki.test.raw
-
-python run_language_modeling.py \
-    --output_dir=output \
-    --model_type=roberta \
-    --model_name_or_path=roberta-base \
-    --do_train \
-    --train_data_file=$TRAIN_FILE \
-    --do_eval \
-    --eval_data_file=$TEST_FILE \
-    --mlm
-```
-
-## Language generation
-
-Based on the script [`run_generation.py`](https://github.com/huggingface/transformers/blob/master/examples/run_generation.py).
-
-Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, Transformer-XL, XLNet, CTRL.
-A similar script is used for our official demo [Write With Transfomer](https://transformer.huggingface.co), where you
-can try out the different models available in the library.
-
-Example usage:
-
-```bash
-python run_generation.py \
-    --model_type=gpt2 \
-    --model_name_or_path=gpt2
-```
-
-## GLUE
-
-Based on the script [`run_glue.py`](https://github.com/huggingface/transformers/blob/master/examples/run_glue.py).
-
-Fine-tuning the library models for sequence classification on the GLUE benchmark: [General Language Understanding
-Evaluation](https://gluebenchmark.com/). This script can fine-tune the following models: BERT, XLM, XLNet and RoBERTa.
-
-GLUE is made up of a total of 9 different tasks. We get the following results on the dev set of the benchmark with an
-uncased  BERT base model (the checkpoint `bert-base-uncased`). All experiments ran single V100 GPUs with a total train
-batch sizes between 16 and 64. Some of these tasks have a small dataset and training can lead to high variance in the results
-between different runs. We report the median on 5 runs (with different seeds) for each of the metrics.
-
-| Task  | Metric                       | Result      |
-|-------|------------------------------|-------------|
-| CoLA  | Matthew's corr               | 49.23       |
-| SST-2 | Accuracy                     | 91.97       |
-| MRPC  | F1/Accuracy                  | 89.47/85.29 |
-| STS-B | Person/Spearman corr.        | 83.95/83.70 |
-| QQP   | Accuracy/F1                  | 88.40/84.31 |
-| MNLI  | Matched acc./Mismatched acc. | 80.61/81.08 |
-| QNLI  | Accuracy                     | 87.46       |
-| RTE   | Accuracy                     | 61.73       |
-| WNLI  | Accuracy                     | 45.07       |
-
-Some of these results are significantly different from the ones reported on the test set
-of GLUE benchmark on the website. For QQP and WNLI, please refer to [FAQ #12](https://gluebenchmark.com/faq) on the webite.
-
-Before running any one of these GLUE tasks you should download the
-[GLUE data](https://gluebenchmark.com/tasks) by running
-[this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e)
-and unpack it to some directory `$GLUE_DIR`.
-
-```bash
-export GLUE_DIR=/path/to/glue
-export TASK_NAME=MRPC
-
-python run_glue.py \
-  --model_type bert \
-  --model_name_or_path bert-base-cased \
-  --task_name $TASK_NAME \
-  --do_train \
-  --do_eval \
-  --do_lower_case \
-  --data_dir $GLUE_DIR/$TASK_NAME \
-  --max_seq_length 128 \
-  --per_gpu_train_batch_size 32 \
-  --learning_rate 2e-5 \
-  --num_train_epochs 3.0 \
-  --output_dir /tmp/$TASK_NAME/
-```
-
-where task name can be one of CoLA, SST-2, MRPC, STS-B, QQP, MNLI, QNLI, RTE, WNLI.
-
-The dev set results will be present within the text file `eval_results.txt` in the specified output_dir.
-In case of MNLI, since there are two separate dev sets (matched and mismatched), there will be a separate
-output folder called `/tmp/MNLI-MM/` in addition to `/tmp/MNLI/`.
-
-The code has not been tested with half-precision training with apex on any GLUE task apart from MRPC, MNLI,
-CoLA, SST-2. The following section provides details on how to run half-precision training with MRPC. With that being
-said, there shouldn’t be any issues in running half-precision training with the remaining GLUE tasks as well,
-since the data processor for each task inherits from the base class DataProcessor.
-
-### MRPC
-
-#### Fine-tuning example
-
-The following examples fine-tune BERT on the Microsoft Research Paraphrase Corpus (MRPC) corpus and runs in less
-than 10 minutes on a single K-80 and in 27 seconds (!) on single tesla V100 16GB with apex installed.
-
-Before running any one of these GLUE tasks you should download the
-[GLUE data](https://gluebenchmark.com/tasks) by running
-[this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e)
-and unpack it to some directory `$GLUE_DIR`.
-
-```bash
-export GLUE_DIR=/path/to/glue
-
-python run_glue.py \
-  --model_type bert \
-  --model_name_or_path bert-base-cased \
-  --task_name MRPC \
-  --do_train \
-  --do_eval \
-  --do_lower_case \
-  --data_dir $GLUE_DIR/MRPC/ \
-  --max_seq_length 128 \
-  --per_gpu_train_batch_size 32 \
-  --learning_rate 2e-5 \
-  --num_train_epochs 3.0 \
-  --output_dir /tmp/mrpc_output/
-```
-
-Our test ran on a few seeds with [the original implementation hyper-
-parameters](https://github.com/google-research/bert#sentence-and-sentence-pair-classification-tasks) gave evaluation
-results between 84% and 88%.
-
-#### Using Apex and mixed-precision
-
-Using Apex and 16 bit precision, the fine-tuning on MRPC only takes 27 seconds. First install
-[apex](https://github.com/NVIDIA/apex), then run the following example:
-
-```bash
-export GLUE_DIR=/path/to/glue
-
-python run_glue.py \
-  --model_type bert \
-  --model_name_or_path bert-base-cased \
-  --task_name MRPC \
-  --do_train \
-  --do_eval \
-  --do_lower_case \
-  --data_dir $GLUE_DIR/MRPC/ \
-  --max_seq_length 128 \
-  --per_gpu_train_batch_size 32 \
-  --learning_rate 2e-5 \
-  --num_train_epochs 3.0 \
-  --output_dir /tmp/mrpc_output/ \
-  --fp16
-```
-
-#### Distributed training
-
-Here is an example using distributed training on 8 V100 GPUs. The model used is the BERT whole-word-masking and it
-reaches F1 > 92 on MRPC.
-
-```bash
-export GLUE_DIR=/path/to/glue
-
-python -m torch.distributed.launch \
-    --nproc_per_node 8 run_glue.py \
-    --model_type bert \
-    --model_name_or_path bert-base-cased \
-    --task_name MRPC \
-    --do_train \
-    --do_eval \
-    --do_lower_case \
-    --data_dir $GLUE_DIR/MRPC/ \
-    --max_seq_length 128 \
-    --per_gpu_train_batch_size 8 \
-    --learning_rate 2e-5 \
-    --num_train_epochs 3.0 \
-    --output_dir /tmp/mrpc_output/
-```
-
-Training with these hyper-parameters gave us the following results:
-
-```bash
-acc = 0.8823529411764706
-acc_and_f1 = 0.901702786377709
-eval_loss = 0.3418912578906332
-f1 = 0.9210526315789473
-global_step = 174
-loss = 0.07231863956341798
-```
-
-### MNLI
-
-The following example uses the BERT-large, uncased, whole-word-masking model and fine-tunes it on the MNLI task.
-
-```bash
-export GLUE_DIR=/path/to/glue
-
-python -m torch.distributed.launch \
-    --nproc_per_node 8 run_glue.py \
-    --model_type bert \
-    --model_name_or_path bert-base-cased \
-    --task_name mnli \
-    --do_train \
-    --do_eval \
-    --do_lower_case \
-    --data_dir $GLUE_DIR/MNLI/ \
-    --max_seq_length 128 \
-    --per_gpu_train_batch_size 8 \
-    --learning_rate 2e-5 \
-    --num_train_epochs 3.0 \
-    --output_dir output_dir \
-```
-
-The results  are the following:
-
-```bash
-***** Eval results *****
-  acc = 0.8679706601466992
-  eval_loss = 0.4911287787382479
-  global_step = 18408
-  loss = 0.04755385363816904
-
-***** Eval results *****
-  acc = 0.8747965825874695
-  eval_loss = 0.45516540421714036
-  global_step = 18408
-  loss = 0.04755385363816904
-```
-
-## Multiple Choice
-
-Based on the script [`run_multiple_choice.py`]().
-
-#### Fine-tuning on SWAG
-Download [swag](https://github.com/rowanz/swagaf/tree/master/data) data
-
-```bash
-#training on 4 tesla V100(16GB) GPUS
-export SWAG_DIR=/path/to/swag_data_dir
-python ./examples/run_multiple_choice.py \
--model_type roberta \
--task_name swag \
--model_name_or_path roberta-base \
--do_train \
--do_eval \
--do_lower_case \
--data_dir $SWAG_DIR \
--learning_rate 5e-5 \
--num_train_epochs 3 \
--max_seq_length 80 \
--output_dir models_bert/swag_base \
--per_gpu_eval_batch_size=16 \
--per_gpu_train_batch_size=16 \
--gradient_accumulation_steps 2 \
--overwrite_output
-```
-Training with the defined hyper-parameters yields the following results:
-```
-***** Eval results *****
-eval_acc = 0.8338998300509847
-eval_loss = 0.44457291918821606
-```
-
-## SQuAD
-
-Based on the script [`run_squad.py`](https://github.com/huggingface/transformers/blob/master/examples/run_squad.py).
-
-#### Fine-tuning BERT on SQuAD1.0
-
-This example code fine-tunes BERT on the SQuAD1.0 dataset. It runs in 24 min (with BERT-base) or 68 min (with BERT-large)
-on a single tesla V100 16GB. The data for SQuAD can be downloaded with the following links and should be saved in a
-$SQUAD_DIR directory.
-
-* [train-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json)
-* [dev-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json)
-* [evaluate-v1.1.py](https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py)
-
-And for SQuAD2.0, you need to download:
-
- [train-v2.0.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json)
- [dev-v2.0.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json)
- [evaluate-v2.0.py](https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/)
-
-```bash
-export SQUAD_DIR=/path/to/SQUAD
-
-python run_squad.py \
-  --model_type bert \
-  --model_name_or_path bert-base-uncased \
-  --do_train \
-  --do_eval \
-  --do_lower_case \
-  --train_file $SQUAD_DIR/train-v1.1.json \
-  --predict_file $SQUAD_DIR/dev-v1.1.json \
-  --per_gpu_train_batch_size 12 \
-  --learning_rate 3e-5 \
-  --num_train_epochs 2.0 \
-  --max_seq_length 384 \
-  --doc_stride 128 \
-  --output_dir /tmp/debug_squad/
-```
-
-Training with the previously defined hyper-parameters yields the following results:
-
-```bash
-f1 = 88.52
-exact_match = 81.22
-```
-
-#### Distributed training
-
-
-Here is an example using distributed training on 8 V100 GPUs and Bert Whole Word Masking uncased model to reach a F1 > 93 on SQuAD1.1:
-
-```bash
-python -m torch.distributed.launch --nproc_per_node=8 ./examples/run_squad.py \
-    --model_type bert \
-    --model_name_or_path bert-large-uncased-whole-word-masking \
-    --do_train \
-    --do_eval \
-    --do_lower_case \
-    --train_file $SQUAD_DIR/train-v1.1.json \
-    --predict_file $SQUAD_DIR/dev-v1.1.json \
-    --learning_rate 3e-5 \
-    --num_train_epochs 2 \
-    --max_seq_length 384 \
-    --doc_stride 128 \
-    --output_dir ./examples/models/wwm_uncased_finetuned_squad/ \
-    --per_gpu_eval_batch_size=3   \
-    --per_gpu_train_batch_size=3   \
-```
-
-Training with the previously defined hyper-parameters yields the following results:
-
-```bash
-f1 = 93.15
-exact_match = 86.91
-```
-
-This fine-tuned model is available as a checkpoint under the reference
-`bert-large-uncased-whole-word-masking-finetuned-squad`.
-
-#### Fine-tuning XLNet on SQuAD
-
-This example code fine-tunes XLNet on both SQuAD1.0 and SQuAD2.0 dataset. See above to download the data for SQuAD .
-
-##### Command for SQuAD1.0:
-
-```bash
-export SQUAD_DIR=/path/to/SQUAD
-
-python run_squad.py \
-    --model_type xlnet \
-    --model_name_or_path xlnet-large-cased \
-    --do_train \
-    --do_eval \
-    --do_lower_case \
-    --train_file $SQUAD_DIR/train-v1.1.json \
-    --predict_file $SQUAD_DIR/dev-v1.1.json \
-    --learning_rate 3e-5 \
-    --num_train_epochs 2 \
-    --max_seq_length 384 \
-    --doc_stride 128 \
-    --output_dir ./wwm_cased_finetuned_squad/ \
-    --per_gpu_eval_batch_size=4  \
-    --per_gpu_train_batch_size=4   \
-    --save_steps 5000
-```
-
-##### Command for SQuAD2.0:
-
-```bash
-export SQUAD_DIR=/path/to/SQUAD
-
-python run_squad.py \
-    --model_type xlnet \
-    --model_name_or_path xlnet-large-cased \
-    --do_train \
-    --do_eval \
-    --version_2_with_negative \
-    --train_file $SQUAD_DIR/train-v2.0.json \
-    --predict_file $SQUAD_DIR/dev-v2.0.json \
-    --learning_rate 3e-5 \
-    --num_train_epochs 4 \
-    --max_seq_length 384 \
-    --doc_stride 128 \
-    --output_dir ./wwm_cased_finetuned_squad/ \
-    --per_gpu_eval_batch_size=2  \
-    --per_gpu_train_batch_size=2   \
-    --save_steps 5000
-```
-
-Larger batch size may improve the performance while costing more memory.
-
-##### Results for SQuAD1.0 with the previously defined hyper-parameters:
-
-```python
-{
-"exact": 85.45884578997162,
-"f1": 92.5974600601065,
-"total": 10570,
-"HasAns_exact": 85.45884578997162,
-"HasAns_f1": 92.59746006010651,
-"HasAns_total": 10570
-}
-```
-
-##### Results for SQuAD2.0 with the previously defined hyper-parameters:
-
-```python
-{
-"exact": 80.4177545691906,
-"f1": 84.07154997729623,
-"total": 11873,
-"HasAns_exact": 76.73751686909581,
-"HasAns_f1": 84.05558584352873,
-"HasAns_total": 5928,
-"NoAns_exact": 84.0874684608915,
-"NoAns_f1": 84.0874684608915,
-"NoAns_total": 5945
-}
-```
-
-
-
-
-## XNLI
-
-Based on the script [`run_xnli.py`](https://github.com/huggingface/transformers/blob/master/examples/run_xnli.py).
-
-[XNLI](https://www.nyu.edu/projects/bowman/xnli/) is crowd-sourced dataset based on [MultiNLI](http://www.nyu.edu/projects/bowman/multinli/). It is an evaluation benchmark for cross-lingual text representations. Pairs of text are labeled with textual entailment annotations for 15 different languages (including both high-resource language such as English and low-resource languages such as Swahili).
-
-#### Fine-tuning on XNLI
-
-This example code fine-tunes mBERT (multi-lingual BERT) on the XNLI dataset. It runs in 106 mins
-on a single tesla V100 16GB. The data for XNLI can be downloaded with the following links and should be both saved (and un-zipped) in a
-`$XNLI_DIR` directory.
-
-* [XNLI 1.0](https://www.nyu.edu/projects/bowman/xnli/XNLI-1.0.zip)
-* [XNLI-MT 1.0](https://www.nyu.edu/projects/bowman/xnli/XNLI-MT-1.0.zip)
-
-```bash
-export XNLI_DIR=/path/to/XNLI
-
-python run_xnli.py \
-  --model_type bert \
-  --model_name_or_path bert-base-multilingual-cased \
-  --language de \
-  --train_language en \
-  --do_train \
-  --do_eval \
-  --data_dir $XNLI_DIR \
-  --per_gpu_train_batch_size 32 \
-  --learning_rate 5e-5 \
-  --num_train_epochs 2.0 \
-  --max_seq_length 128 \
-  --output_dir /tmp/debug_xnli/ \
-  --save_steps -1
-```
-
-Training with the previously defined hyper-parameters yields the following results on the **test** set:
-
-```bash
-acc = 0.7093812375249501
-```
-
-## MM-IMDb
-
-Based on the script [`run_mmimdb.py`](https://github.com/huggingface/transformers/blob/master/examples/mm-imdb/run_mmimdb.py).
-
-[MM-IMDb](http://lisi1.unal.edu.co/mmimdb/) is a Multimodal dataset with around 26,000 movies including images, plots and other metadata.
-
-### Training on MM-IMDb
-
-```
-python run_mmimdb.py \
-    --data_dir /path/to/mmimdb/dataset/ \
-    --model_type bert \
-    --model_name_or_path bert-base-uncased \
-    --output_dir /path/to/save/dir/ \
-    --do_train \
-    --do_eval \
-    --max_seq_len 512 \
-    --gradient_accumulation_steps 20 \
-    --num_image_embeds 3 \
-    --num_train_epochs 100 \
-    --patience 5
-```
-
-## Adversarial evaluation of model performances
-
-Here is an example on evaluating a model using adversarial evaluation of natural language inference with the Heuristic Analysis for NLI Systems (HANS) dataset [McCoy et al., 2019](https://arxiv.org/abs/1902.01007). The example was gracefully provided by [Nafise Sadat Moosavi](https://github.com/ns-moosavi).
-
-The HANS dataset can be downloaded from [this location](https://github.com/tommccoy1/hans).
-
-This is an example of using test_hans.py:
-
-```bash
-export HANS_DIR=path-to-hans
-export MODEL_TYPE=type-of-the-model-e.g.-bert-roberta-xlnet-etc
-export MODEL_PATH=path-to-the-model-directory-that-is-trained-on-NLI-e.g.-by-using-run_glue.py
-
-python examples/hans/test_hans.py \
-        --task_name hans \
-        --model_type $MODEL_TYPE \
-        --do_eval \
-        --do_lower_case \
-        --data_dir $HANS_DIR \
-        --model_name_or_path $MODEL_PATH \
-        --max_seq_length 128 \
-        --output_dir $MODEL_PATH \
-```
-
-This will create the hans_predictions.txt file in MODEL_PATH, which can then be evaluated using hans/evaluate_heur_output.py from the HANS dataset.
-
-The results of the BERT-base model that is trained on MNLI using batch size 8 and the random seed 42 on the HANS dataset is as follows:
-
-```bash
-Heuristic entailed results:
-lexical_overlap: 0.9702
-subsequence: 0.9942
-constituent: 0.9962
-
-Heuristic non-entailed results:
-lexical_overlap: 0.199
-subsequence: 0.0396
-constituent: 0.118
-```
+Feedback, additional use cases, and benchmarks involving TPUs are welcome; please share them with the community.
--- a/examples/adversarial/README.md
+++ b/examples/adversarial/README.md
@@ -0,0 +1,38 @@
+## Adversarial evaluation of model performances
+
+Here is an example of evaluating a model using adversarial evaluation of natural language inference with the Heuristic Analysis for NLI Systems (HANS) dataset [McCoy et al., 2019](https://arxiv.org/abs/1902.01007). The example was graciously provided by [Nafise Sadat Moosavi](https://github.com/ns-moosavi).
+
+The HANS dataset can be downloaded from [this location](https://github.com/tommccoy1/hans).
+
+This is an example of using run_hans.py:
+
+```bash
+export HANS_DIR=path-to-hans
+export MODEL_TYPE=type-of-the-model-e.g.-bert-roberta-xlnet-etc
+export MODEL_PATH=path-to-the-model-directory-that-is-trained-on-NLI-e.g.-by-using-run_glue.py
+
+python run_hans.py \
+        --task_name hans \
+        --model_type $MODEL_TYPE \
+        --do_eval \
+        --data_dir $HANS_DIR \
+        --model_name_or_path $MODEL_PATH \
+        --max_seq_length 128 \
+        --output_dir $MODEL_PATH
+```
+
+This will create the hans_predictions.txt file in MODEL_PATH, which can then be evaluated using hans/evaluate_heur_output.py from the HANS dataset.
+
+The results of the BERT-base model that is trained on MNLI using batch size 8 and the random seed 42 on the HANS dataset are as follows:
+
+```bash
+Heuristic entailed results:
+lexical_overlap: 0.9702
+subsequence: 0.9942
+constituent: 0.9962
+
+Heuristic non-entailed results:
+lexical_overlap: 0.199
+subsequence: 0.0396
+constituent: 0.118
+```
--- a/examples/adversarial/run_hans.py
+++ b/examples/adversarial/run_hans.py
@@ -0,0 +1,231 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Finetuning the library models for sequence classification on HANS."""
+
+import logging
+import os
+from dataclasses import dataclass, field
+from typing import Dict, List, Optional
+
+import numpy as np
+import torch
+
+from transformers import (
+    AutoConfig,
+    AutoModelForSequenceClassification,
+    AutoTokenizer,
+    HfArgumentParser,
+    Trainer,
+    TrainingArguments,
+    default_data_collator,
+    set_seed,
+)
+from utils_hans import HansDataset, InputFeatures, hans_processors, hans_tasks_num_labels
+
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class ModelArguments:
+    """
+    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
+
+    Only ``model_name_or_path`` is required. ``config_name`` and ``tokenizer_name`` default to
+    ``None``, in which case ``main`` falls back to ``model_name_or_path`` for them.
+    """
+
+    model_name_or_path: str = field(
+        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
+    )
+    config_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
+    )
+    tokenizer_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
+    )
+    # Passed as ``cache_dir`` to every ``from_pretrained`` call in ``main``.
+    cache_dir: Optional[str] = field(
+        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
+    )
+
+
+@dataclass
+class DataTrainingArguments:
+    """
+    Arguments pertaining to what data we are going to input our model for training and eval.
+
+    ``task_name`` and ``data_dir`` have no default and must be supplied; the remaining
+    fields are optional.
+    """
+
+    # The help text enumerates the task names registered in ``hans_processors``.
+    task_name: str = field(
+        metadata={"help": "The name of the task to train selected in the list: " + ", ".join(hans_processors.keys())}
+    )
+    data_dir: str = field(
+        metadata={"help": "The input data dir. Should contain the .tsv files (or other data files) for the task."}
+    )
+    max_seq_length: int = field(
+        default=128,
+        metadata={
+            "help": "The maximum total input sequence length after tokenization. Sequences longer "
+            "than this will be truncated, sequences shorter will be padded."
+        },
+    )
+    overwrite_cache: bool = field(
+        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
+    )
+
+
+def hans_data_collator(features: List[InputFeatures]) -> Dict[str, torch.Tensor]:
+    """
+    Collate ``features`` with the default collator, then drop the "pairID" entry
+    from the resulting batch when it is there.
+    """
+    collated = default_data_collator(features)
+    collated.pop("pairID", None)
+    return collated
+
+
+def main():
+    """
+    Parse the command-line arguments, load the config/tokenizer/model, build the HANS
+    datasets and run training and/or evaluation through ``Trainer``.
+
+    When ``--do_eval`` is set, the predicted labels are written to ``hans_predictions.txt``
+    inside ``training_args.output_dir``.
+
+    Raises:
+        ValueError: if training is requested into an existing, non-empty output directory
+            without ``--overwrite_output_dir``, or if the task name is unknown.
+    """
+    # See all possible arguments in src/transformers/training_args.py
+    # or by passing the --help flag to this script.
+    # We now keep distinct sets of args, for a cleaner separation of concerns.
+
+    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
+    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+
+    # Refuse to train into a non-empty output directory unless explicitly allowed.
+    if (
+        os.path.exists(training_args.output_dir)
+        and os.listdir(training_args.output_dir)
+        and training_args.do_train
+        and not training_args.overwrite_output_dir
+    ):
+        raise ValueError(
+            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
+        )
+
+    # Setup logging
+    # INFO level only when local_rank is -1 or 0; other ranks log at WARN.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
+    )
+    logger.warning(
+        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+        training_args.local_rank,
+        training_args.device,
+        training_args.n_gpu,
+        bool(training_args.local_rank != -1),
+        training_args.fp16,
+    )
+    logger.info("Training/evaluation parameters %s", training_args)
+
+    # Set seed
+    set_seed(training_args.seed)
+
+    # The number of labels is looked up per task; an unknown task is reported as a ValueError.
+    try:
+        num_labels = hans_tasks_num_labels[data_args.task_name]
+    except KeyError:
+        raise ValueError("Task not found: %s" % (data_args.task_name))
+
+    # Load pretrained model and tokenizer
+    #
+    # Distributed training:
+    # The .from_pretrained methods guarantee that only one local process can concurrently
+    # download model & vocab.
+
+    config = AutoConfig.from_pretrained(
+        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
+        num_labels=num_labels,
+        finetuning_task=data_args.task_name,
+        cache_dir=model_args.cache_dir,
+    )
+    tokenizer = AutoTokenizer.from_pretrained(
+        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
+        cache_dir=model_args.cache_dir,
+    )
+    model = AutoModelForSequenceClassification.from_pretrained(
+        model_args.model_name_or_path,
+        # A ".ckpt" substring in the path selects loading from a TensorFlow checkpoint.
+        from_tf=bool(".ckpt" in model_args.model_name_or_path),
+        config=config,
+        cache_dir=model_args.cache_dir,
+    )
+
+    # Get datasets
+    # Each dataset is only built when the corresponding --do_train / --do_eval flag is set.
+    train_dataset = (
+        HansDataset(
+            data_dir=data_args.data_dir,
+            tokenizer=tokenizer,
+            task=data_args.task_name,
+            max_seq_length=data_args.max_seq_length,
+            overwrite_cache=data_args.overwrite_cache,
+        )
+        if training_args.do_train
+        else None
+    )
+    eval_dataset = (
+        HansDataset(
+            data_dir=data_args.data_dir,
+            tokenizer=tokenizer,
+            task=data_args.task_name,
+            max_seq_length=data_args.max_seq_length,
+            overwrite_cache=data_args.overwrite_cache,
+            evaluate=True,
+        )
+        if training_args.do_eval
+        else None
+    )
+
+    # Initialize our Trainer
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=train_dataset,
+        eval_dataset=eval_dataset,
+        data_collator=hans_data_collator,
+    )
+
+    # Training
+    if training_args.do_train:
+        # model_path is only passed when model_name_or_path is a local directory.
+        trainer.train(
+            model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
+        )
+        trainer.save_model()
+        # For convenience, we also re-save the tokenizer to the same directory,
+        # so that you can share your model easily on huggingface.co/models =)
+        if trainer.is_world_master():
+            tokenizer.save_pretrained(training_args.output_dir)
+
+    # Evaluation
+    if training_args.do_eval:
+        logger.info("*** Evaluate ***")
+
+        output = trainer.predict(eval_dataset)
+        preds = output.predictions
+        # Keep the index of the highest-scoring class for each example.
+        preds = np.argmax(preds, axis=1)
+
+        pair_ids = [ex.pairID for ex in eval_dataset]
+        output_eval_file = os.path.join(training_args.output_dir, "hans_predictions.txt")
+        label_list = eval_dataset.get_labels()
+        # Only the world-master process writes the predictions file.
+        if trainer.is_world_master():
+            with open(output_eval_file, "w") as writer:
+                writer.write("pairID,gold_label\n")
+                # One "ex<pairID>,<predicted label>" line per evaluation example.
+                for pid, pred in zip(pair_ids, preds):
+                    writer.write("ex" + str(pid) + "," + label_list[int(pred)] + "\n")
+
+        # NOTE(review): ``_log`` is a private Trainer method; confirm it stays available.
+        trainer._log(output.metrics)
+
+
+def _mp_fn(index):
+    """Run ``main``; ``index`` is accepted but not used by this function."""
+    # For xla_spawn (TPUs)
+    main()
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/adversarial/utils_hans.py
+++ b/examples/adversarial/utils_hans.py
@@ -0,0 +1,331 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os
+from dataclasses import dataclass
+from typing import List, Optional, Union
+
+import tqdm
+from filelock import FileLock
+
+from transformers import (
+    BartTokenizer,
+    BartTokenizerFast,
+    DataProcessor,
+    PreTrainedTokenizer,
+    RobertaTokenizer,
+    RobertaTokenizerFast,
+    XLMRobertaTokenizer,
+    is_tf_available,
+    is_torch_available,
+)
+
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass(frozen=True)
+class InputExample:
+    """
+    A single training/test example for simple sequence classification.
+
+    The dataclass is frozen, so fields cannot be reassigned after construction.
+
+    Args:
+        guid: Unique id for the example.
+        text_a: string. The untokenized text of the first sequence. For single
+            sequence tasks, only this sequence must be specified.
+        text_b: (Optional) string. The untokenized text of the second sequence.
+            Only must be specified for sequence pair tasks.
+        label: (Optional) string. The label of the example. This should be
+            specified for train and dev examples, but not for test examples.
+        pairID: (Optional) string. Unique identifier for the pair of sentences.
+    """
+
+    guid: str
+    text_a: str
+    text_b: Optional[str] = None
+    label: Optional[str] = None
+    pairID: Optional[str] = None
+
+
+@dataclass(frozen=True)
+class InputFeatures:
+    """
+    A single set of features of data.
+    Property names are the same names as the corresponding inputs to a model.
+
+    The dataclass is frozen, so fields cannot be reassigned after construction.
+
+    Args:
+        input_ids: Indices of input sequence tokens in the vocabulary.
+        attention_mask: Mask to avoid performing attention on padding token indices.
+            Mask values selected in ``[0, 1]``:
+            Usually  ``1`` for tokens that are NOT MASKED, ``0`` for MASKED (padded) tokens.
+        token_type_ids: (Optional) Segment token indices to indicate first and second
+            portions of the inputs. Only some models use them.
+        label: (Optional) Label corresponding to the input. Int for classification problems,
+            float for regression problems.
+        pairID: (Optional) Unique identifier for the pair of sentences.
+    """
+
+    input_ids: List[int]
+    attention_mask: Optional[List[int]] = None
+    token_type_ids: Optional[List[int]] = None
+    label: Optional[Union[int, float]] = None
+    # Stored as an int here, whereas ``InputExample.pairID`` is a string.
+    pairID: Optional[int] = None
+
+
+if is_torch_available():
+    import torch
+    from torch.utils.data.dataset import Dataset
+
+    class HansDataset(Dataset):
+        """
+        PyTorch dataset of HANS ``InputFeatures``, cached on disk inside ``data_dir``.
+
+        This will be superseded by a framework-agnostic approach
+        soon.
+
+        Args:
+            data_dir: Directory holding the HANS files; the feature cache is written here too.
+            tokenizer: Tokenizer used to convert the examples to features.
+            task: Key into ``hans_processors`` selecting the processor.
+            max_seq_length: Forwarded as ``max_length`` to the feature conversion.
+            overwrite_cache: Rebuild the features even if a cached file exists.
+            evaluate: Load the dev examples instead of the train examples.
+        """
+
+        features: List[InputFeatures]
+
+        def __init__(
+            self,
+            data_dir: str,
+            tokenizer: PreTrainedTokenizer,
+            task: str,
+            max_seq_length: Optional[int] = None,
+            overwrite_cache=False,
+            evaluate: bool = False,
+        ):
+            processor = hans_processors[task]()
+
+            # The cache file name encodes the split, tokenizer class, max length and task.
+            cached_features_file = os.path.join(
+                data_dir,
+                "cached_{}_{}_{}_{}".format(
+                    "dev" if evaluate else "train", tokenizer.__class__.__name__, str(max_seq_length), task,
+                ),
+            )
+            label_list = processor.get_labels()
+            if tokenizer.__class__ in (
+                RobertaTokenizer,
+                RobertaTokenizerFast,
+                XLMRobertaTokenizer,
+                BartTokenizer,
+                BartTokenizerFast,
+            ):
+                # HACK(label indices are swapped in RoBERTa pretrained model)
+                label_list[1], label_list[2] = label_list[2], label_list[1]
+            self.label_list = label_list
+
+            # Make sure only the first process in distributed training processes the dataset,
+            # and the others will use the cache.
+            lock_path = cached_features_file + ".lock"
+            with FileLock(lock_path):
+
+                if os.path.exists(cached_features_file) and not overwrite_cache:
+                    logger.info(f"Loading features from cached file {cached_features_file}")
+                    # NOTE(review): torch.load unpickles the file; only point data_dir at trusted caches.
+                    self.features = torch.load(cached_features_file)
+                else:
+                    logger.info(f"Creating features from dataset file at {data_dir}")
+
+                    examples = (
+                        processor.get_dev_examples(data_dir) if evaluate else processor.get_train_examples(data_dir)
+                    )
+
+                    # NOTE(review): this message says "Training" even when ``evaluate`` is True.
+                    logger.info("Training examples: %s", len(examples))
+                    self.features = hans_convert_examples_to_features(examples, label_list, max_seq_length, tokenizer)
+                    logger.info("Saving features into cached file %s", cached_features_file)
+                    torch.save(self.features, cached_features_file)
+
+        def __len__(self):
+            """Return the number of cached features."""
+            return len(self.features)
+
+        def __getitem__(self, i) -> InputFeatures:
+            """Return the features stored at position ``i``."""
+            return self.features[i]
+
+        def get_labels(self):
+            """Return the label list, including any tokenizer-specific index swap applied in ``__init__``."""
+            return self.label_list
+
+
+if is_tf_available():
+    import tensorflow as tf
+
+    class TFHansDataset:
+        """
+        TensorFlow counterpart of ``HansDataset``: converts the HANS examples to features
+        and exposes them as a ``tf.data.Dataset`` built from a Python generator.
+
+        This will be superseded by a framework-agnostic approach
+        soon.
+
+        Args:
+            data_dir: Directory holding the HANS files.
+            tokenizer: Tokenizer used to convert the examples to features.
+            task: Key into ``hans_processors`` selecting the processor.
+            max_seq_length: Forwarded as ``max_length`` to the feature conversion.
+            overwrite_cache: Accepted for signature parity with ``HansDataset``; not used here.
+            evaluate: Load the dev examples instead of the train examples.
+        """
+
+        features: List[InputFeatures]
+
+        def __init__(
+            self,
+            data_dir: str,
+            tokenizer: PreTrainedTokenizer,
+            task: str,
+            max_seq_length: Optional[int] = 128,
+            overwrite_cache=False,
+            evaluate: bool = False,
+        ):
+            processor = hans_processors[task]()
+            label_list = processor.get_labels()
+            if tokenizer.__class__ in (
+                RobertaTokenizer,
+                RobertaTokenizerFast,
+                XLMRobertaTokenizer,
+                BartTokenizer,
+                BartTokenizerFast,
+            ):
+                # HACK(label indices are swapped in RoBERTa pretrained model)
+                label_list[1], label_list[2] = label_list[2], label_list[1]
+            self.label_list = label_list
+
+            examples = processor.get_dev_examples(data_dir) if evaluate else processor.get_train_examples(data_dir)
+            self.features = hans_convert_examples_to_features(examples, label_list, max_seq_length, tokenizer)
+
+            def gen():
+                # Yields one (inputs dict, label) pair per feature; every list is one-dimensional.
+                for (ex_index, ex) in tqdm.tqdm(enumerate(self.features), desc="convert examples to features"):
+                    if ex_index % 10000 == 0:
+                        logger.info("Writing example %d of %d" % (ex_index, len(examples)))
+
+                    yield (
+                        {
+                            "example_id": 0,
+                            "input_ids": ex.input_ids,
+                            "attention_mask": ex.attention_mask,
+                            "token_type_ids": ex.token_type_ids,
+                        },
+                        ex.label,
+                    )
+
+            self.dataset = tf.data.Dataset.from_generator(
+                gen,
+                (
+                    {
+                        "example_id": tf.int32,
+                        "input_ids": tf.int32,
+                        "attention_mask": tf.int32,
+                        "token_type_ids": tf.int32,
+                    },
+                    tf.int64,
+                ),
+                (
+                    {
+                        "example_id": tf.TensorShape([]),
+                        # The generator yields flat per-example lists, so the declared shapes
+                        # must be rank 1; a rank-2 shape is incompatible with the yielded values.
+                        "input_ids": tf.TensorShape([None]),
+                        "attention_mask": tf.TensorShape([None]),
+                        "token_type_ids": tf.TensorShape([None]),
+                    },
+                    tf.TensorShape([]),
+                ),
+            )
+
+        def get_dataset(self):
+            """Return the underlying ``tf.data.Dataset``."""
+            return self.dataset
+
+        def __len__(self):
+            """Return the number of converted features."""
+            return len(self.features)
+
+        def __getitem__(self, i) -> InputFeatures:
+            """Return the features stored at position ``i``."""
+            return self.features[i]
+
+        def get_labels(self):
+            """Return the label list, including any tokenizer-specific index swap applied in ``__init__``."""
+            return self.label_list
+
+
+class HansProcessor(DataProcessor):
+    """Processor for the HANS data set."""
+
+    def get_train_examples(self, data_dir):
+        """See base class. Reads ``heuristics_train_set.txt`` from ``data_dir``."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "heuristics_train_set.txt")), "train")
+
+    def get_dev_examples(self, data_dir):
+        """See base class. Reads ``heuristics_evaluation_set.txt`` from ``data_dir``."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "heuristics_evaluation_set.txt")), "dev")
+
+    def get_labels(self):
+        """See base class."""
+        return ["contradiction", "entailment", "neutral"]
+
+    def _create_examples(self, lines, set_type):
+        """Creates examples for the training and dev sets."""
+        examples = []
+        for (i, line) in enumerate(lines):
+            # The first row is skipped (presumably the header row; verify against the data files).
+            if i == 0:
+                continue
+            guid = "%s-%s" % (set_type, line[0])
+            text_a = line[5]
+            text_b = line[6]
+            # Drop a leading "ex" prefix from the pair id when present.
+            pairID = line[7][2:] if line[7].startswith("ex") else line[7]
+            # The label is read from the last column of the row.
+            label = line[-1]
+            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label, pairID=pairID))
+        return examples
+
+
+def hans_convert_examples_to_features(
+    examples: List[InputExample], label_list: List[str], max_length: int, tokenizer: PreTrainedTokenizer,
+):
+    """
+    Converts a list of ``InputExample`` into a list of ``InputFeatures``.
+
+    Args:
+        examples: List of ``InputExamples`` containing the examples.
+        label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method.
+        max_length: Maximum example length; every example is padded or truncated to it.
+        tokenizer: Instance of a tokenizer that will tokenize the examples.
+
+    Returns:
+        A list of task-specific ``InputFeatures`` which can be fed to the model.
+
+    """
+
+    label_map = {label: i for i, label in enumerate(label_list)}
+
+    features = []
+    # ``enumerate`` has no length, so the total is passed explicitly to let tqdm show progress.
+    for (ex_index, example) in tqdm.tqdm(
+        enumerate(examples), total=len(examples), desc="convert examples to features"
+    ):
+        if ex_index % 10000 == 0:
+            logger.info("Writing example %d" % (ex_index))
+
+        inputs = tokenizer(
+            example.text_a,
+            example.text_b,
+            add_special_tokens=True,
+            max_length=max_length,
+            padding="max_length",
+            truncation=True,
+            return_overflowing_tokens=True,
+        )
+
+        # Labels that are not part of ``label_list`` fall back to index 0.
+        label = label_map.get(example.label, 0)
+
+        pairID = int(example.pairID)
+
+        features.append(InputFeatures(**inputs, label=label, pairID=pairID))
+
+    # Log the first few examples next to their features to ease debugging.
+    for i, example in enumerate(examples[:5]):
+        logger.info("*** Example ***")
+        logger.info(f"guid: {example}")
+        logger.info(f"features: {features[i]}")
+
+    return features
+
+
+# Number of classification labels per task name.
+hans_tasks_num_labels = {
+    "hans": 3,
+}
+
+# Processor class per task name; the dataset classes instantiate it via ``hans_processors[task]()``.
+hans_processors = {
+    "hans": HansProcessor,
+}
--- a/examples/benchmarking/plot_csv_file.py
+++ b/examples/benchmarking/plot_csv_file.py
@@ -0,0 +1,160 @@
+import csv
+from collections import defaultdict
+from dataclasses import dataclass, field
+from typing import List, Optional
+
+import matplotlib.pyplot as plt
+import numpy as np
+from matplotlib.ticker import ScalarFormatter
+
+from transformers import HfArgumentParser
+
+
+def list_field(default=None, metadata=None):
+    """Build a dataclass field whose default factory hands back ``default`` itself."""
+
+    def _provide_default():
+        return default
+
+    return field(default_factory=_provide_default, metadata=metadata)
+
+
+@dataclass
+class PlotArguments:
+    """
+    Arguments controlling which benchmark csv file is plotted and how the plot is rendered.
+    """
+
+    csv_file: str = field(metadata={"help": "The csv file to plot."},)
+    plot_along_batch: bool = field(
+        default=False,
+        metadata={"help": "Whether to plot along batch size or sequence length. Defaults to sequence length."},
+    )
+    is_time: bool = field(
+        default=False,
+        metadata={"help": "Whether the csv file has time results or memory results. Defaults to memory results."},
+    )
+    no_log_scale: bool = field(
+        default=False, metadata={"help": "Disable logarithmic scale when plotting"},
+    )
+    is_train: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether the csv file has training results or inference results. Defaults to inference results."
+        },
+    )
+    figure_png_file: Optional[str] = field(
+        default=None, metadata={"help": "Filename under which the plot will be saved. If unused no plot is saved."},
+    )
+    # When given, entries are matched to the csv models by position (order of first appearance).
+    short_model_names: Optional[List[str]] = list_field(
+        default=None, metadata={"help": "List of model names that are used instead of the ones in the csv file."}
+    )
+
+
+def can_convert_to_int(string):
+    """Tell whether ``int(string)`` succeeds without raising ``ValueError``."""
+    try:
+        int(string)
+    except ValueError:
+        return False
+    else:
+        return True
+
+
+def can_convert_to_float(string):
+    """Tell whether ``float(string)`` succeeds without raising ``ValueError``."""
+    try:
+        float(string)
+    except ValueError:
+        return False
+    else:
+        return True
+
+
+class Plot:
+    """
+    Load benchmark results from a csv file and plot them with matplotlib.
+
+    The csv file must provide the columns ``model``, ``batch_size``, ``sequence_length``
+    and ``result``. Rows whose ``result`` is neither an int nor a float still register
+    their batch size and sequence length, but contribute no point to the curves.
+    """
+
+    def __init__(self, args):
+        """Read ``args.csv_file`` and group the results per model name."""
+        self.args = args
+        self.result_dict = defaultdict(lambda: dict(bsz=[], seq_len=[], result={}))
+
+        with open(self.args.csv_file, newline="") as csv_file:
+            reader = csv.DictReader(csv_file)
+            for row in reader:
+                model_name = row["model"]
+                batch_size = int(row["batch_size"])
+                sequence_length = int(row["sequence_length"])
+                model_results = self.result_dict[model_name]
+                model_results["bsz"].append(batch_size)
+                model_results["seq_len"].append(sequence_length)
+                # Results are keyed by (batch_size, sequence_length).
+                if can_convert_to_int(row["result"]):
+                    # value is not None
+                    model_results["result"][(batch_size, sequence_length)] = int(row["result"])
+                elif can_convert_to_float(row["result"]):
+                    # value is not None
+                    model_results["result"][(batch_size, sequence_length)] = float(row["result"])
+
+    def plot(self):
+        """Draw one curve per (model, inner-loop value) and either save or show the figure."""
+        fig, ax = plt.subplots()
+        title_str = "Time usage" if self.args.is_time else "Memory usage"
+        title_str = title_str + " for training" if self.args.is_train else title_str + " for inference"
+
+        if not self.args.no_log_scale:
+            # set logarithm scales
+            ax.set_xscale("log")
+            ax.set_yscale("log")
+
+        for axis in [ax.xaxis, ax.yaxis]:
+            axis.set_major_formatter(ScalarFormatter())
+
+        # The labels only depend on the arguments; computing them up front keeps them
+        # defined even when the csv file holds no rows.
+        (x_axis_label, inner_loop_label) = (
+            ("batch_size", "len") if self.args.plot_along_batch else ("in #tokens", "bsz")
+        )
+
+        for model_name_idx, model_name in enumerate(self.result_dict.keys()):
+            batch_sizes = sorted(set(self.result_dict[model_name]["bsz"]))
+            sequence_lengths = sorted(set(self.result_dict[model_name]["seq_len"]))
+            results = self.result_dict[model_name]["result"]
+
+            (x_axis_values, inner_loop_array) = (
+                (batch_sizes, sequence_lengths) if self.args.plot_along_batch else (sequence_lengths, batch_sizes)
+            )
+
+            label_model_name = (
+                model_name if self.args.short_model_names is None else self.args.short_model_names[model_name_idx]
+            )
+
+            for inner_loop_value in inner_loop_array:
+                # Collect (x, y) pairs together so a missing measurement drops the matching
+                # x value, and so the full x range is available again for the next curve.
+                points = []
+                for x in x_axis_values:
+                    key = (x, inner_loop_value) if self.args.plot_along_batch else (inner_loop_value, x)
+                    if key in results:
+                        points.append((x, results[key]))
+
+                # ``np.int`` no longer exists in recent NumPy; the builtin ``int`` is equivalent.
+                x_axis_array = np.asarray([x for x, _ in points], dtype=int)
+                # Float dtype in both modes so that time results are not truncated to integers.
+                y_axis_array = np.asarray([y for _, y in points], dtype=np.float32)
+
+                plt.scatter(
+                    x_axis_array, y_axis_array, label=f"{label_model_name} - {inner_loop_label}: {inner_loop_value}"
+                )
+                plt.plot(x_axis_array, y_axis_array, "--")
+
+            title_str += f" {label_model_name} vs."
+
+        # Remove the trailing " vs." separator, which is only there if a model was plotted.
+        if title_str.endswith(" vs."):
+            title_str = title_str[:-4]
+        y_axis_label = "Time in s" if self.args.is_time else "Memory in MB"
+
+        # plot
+        plt.title(title_str)
+        plt.xlabel(x_axis_label)
+        plt.ylabel(y_axis_label)
+        plt.legend()
+
+        if self.args.figure_png_file is not None:
+            plt.savefig(self.args.figure_png_file)
+        else:
+            plt.show()
+
+
+def main():
+    """Parse ``PlotArguments`` from the command line and plot the referenced csv file."""
+    parser = HfArgumentParser(PlotArguments)
+    # parse_args_into_dataclasses returns one entry per dataclass; only one was registered.
+    plot_args = parser.parse_args_into_dataclasses()[0]
+    plot = Plot(args=plot_args)
+    plot.plot()
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/benchmarking/run_benchmark.py
+++ b/examples/benchmarking/run_benchmark.py
@@ -0,0 +1,29 @@
+# coding=utf-8
+# Copyright 2020 The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Benchmarking the library on inference and training """
+
+from transformers import HfArgumentParser, PyTorchBenchmark, PyTorchBenchmarkArguments
+
+
+def main():
+    """Parse ``PyTorchBenchmarkArguments`` from the command line and run the PyTorch benchmark."""
+    parser = HfArgumentParser(PyTorchBenchmarkArguments)
+    # parse_args_into_dataclasses returns one entry per dataclass; only one was registered.
+    benchmark_args = parser.parse_args_into_dataclasses()[0]
+    benchmark = PyTorchBenchmark(args=benchmark_args)
+    benchmark.run()
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/benchmarking/run_benchmark_tf.py
+++ b/examples/benchmarking/run_benchmark_tf.py
@@ -0,0 +1,29 @@
+# coding=utf-8
+# Copyright 2018 The HuggingFace Inc. team.
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Benchmarking the library on inference and training in TensorFlow"""
+
+from transformers import HfArgumentParser, TensorFlowBenchmark, TensorFlowBenchmarkArguments
+
+
+def main():
+    """Parse ``TensorFlowBenchmarkArguments`` from the command line and run the TensorFlow benchmark."""
+    parser = HfArgumentParser(TensorFlowBenchmarkArguments)
+    # parse_args_into_dataclasses returns one entry per dataclass; only one was registered.
+    benchmark_args = parser.parse_args_into_dataclasses()[0]
+    benchmark = TensorFlowBenchmark(args=benchmark_args)
+    benchmark.run()
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/benchmarks.py
+++ b/examples/benchmarks.py
@@ -1,664 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Benchmarking the library on inference and training """
-
-# If checking the tensors placement
-# tf.debugging.set_log_device_placement(True)
-
-import argparse
-import csv
-import timeit
-from time import time
-from typing import List
-
-from transformers import (
-    AutoConfig,
-    AutoTokenizer,
-    MemorySummary,
-    is_tf_available,
-    is_torch_available,
-    start_memory_tracing,
-    stop_memory_tracing,
-)
-
-
-if is_tf_available():
-    import tensorflow as tf
-    from transformers import TFAutoModel
-
-if is_torch_available():
-    import torch
-    from transformers import AutoModel
-
-
-input_text = """Bent over their instruments, three hundred Fertilizers were plunged, as
-the Director of Hatcheries and Conditioning entered the room, in the
-
-
-
-scarcely breathing silence, the absent-minded, soliloquizing hum or
-whistle, of absorbed concentration. A troop of newly arrived students,
-very young, pink and callow, followed nervously, rather abjectly, at the
-Director's heels. Each of them carried a notebook, in which, whenever
-the great man spoke, he desperately scribbled. Straight from the
-horse's mouth. It was a rare privilege. The D. H. C. for Central London
-always made a point of personally conducting his new students round
-the various departments.
-
-"Just to give you a general idea," he would explain to them. For of
-course some sort of general idea they must have, if they were to do
-their work intelligently-though as little of one, if they were to be good
-and happy members of society, as possible. For particulars, as every
-one knows, make for virtue and happiness; generalities are intellectu-
-ally necessary evils. Not philosophers but fret-sawyers and stamp col-
-lectors compose the backbone of society.
-
-"To-morrow," he would add, smiling at them with a slightly menacing
-geniality, "you'll be settling down to serious work. You won't have time
-for generalities. Meanwhile ..."
-
-Meanwhile, it was a privilege. Straight from the horse's mouth into the
-notebook. The boys scribbled like mad.
-
-Tall and rather thin but upright, the Director advanced into the room.
-He had a long chin and big rather prominent teeth, just covered, when
-he was not talking, by his full, floridly curved lips. Old, young? Thirty?
-Fifty? Fifty-five? It was hard to say. And anyhow the question didn't
-arise; in this year of stability, A. F. 632, it didn't occur to you to ask it.
-
-"I shall begin at the beginning," said the D.H.C. and the more zealous
-students recorded his intention in their notebooks: Begin at the begin-
-ning. "These," he waved his hand, "are the incubators." And opening
-an insulated door he showed them racks upon racks of numbered test-
-tubes. "The week's supply of ova. Kept," he explained, "at blood heat;
-whereas the male gametes," and here he opened another door, "they
-have to be kept at thirty-five instead of thirty-seven. Full blood heat
-sterilizes." Rams wrapped in theremogene beget no lambs.
-
-Still leaning against the incubators he gave them, while the pencils
-scurried illegibly across the pages, a brief description of the modern
-
-
-
-fertilizing process; spoke first, of course, of its surgical introduc-
-tion-"the operation undergone voluntarily for the good of Society, not
-to mention the fact that it carries a bonus amounting to six months'
-salary"; continued with some account of the technique for preserving
-the excised ovary alive and actively developing; passed on to a consid-
-eration of optimum temperature, salinity, viscosity; referred to the liq-
-uor in which the detached and ripened eggs were kept; and, leading
-his charges to the work tables, actually showed them how this liquor
-was drawn off from the test-tubes; how it was let out drop by drop
-onto the specially warmed slides of the microscopes; how the eggs
-which it contained were inspected for abnormalities, counted and
-transferred to a porous receptacle; how (and he now took them to
-watch the operation) this receptacle was immersed in a warm bouillon
-containing free-swimming spermatozoa-at a minimum concentration
-of one hundred thousand per cubic centimetre, he insisted; and how,
-after ten minutes, the container was lifted out of the liquor and its
-contents re-examined; how, if any of the eggs remained unfertilized, it
-was again immersed, and, if necessary, yet again; how the fertilized
-ova went back to the incubators; where the Alphas and Betas re-
-mained until definitely bottled; while the Gammas, Deltas and Epsilons
-were brought out again, after only thirty-six hours, to undergo Bo-
-kanovsky's Process.
-
-"Bokanovsky's Process," repeated the Director, and the students un-
-derlined the words in their little notebooks.
-
-One egg, one embryo, one adult-normality. But a bokanovskified egg
-will bud, will proliferate, will divide. From eight to ninety-six buds, and
-every bud will grow into a perfectly formed embryo, and every embryo
-into a full-sized adult. Making ninety-six human beings grow where
-only one grew before. Progress.
-
-"Essentially," the D.H.C. concluded, "bokanovskification consists of a
-series of arrests of development. We check the normal growth and,
-paradoxically enough, the egg responds by budding."
-
-Responds by budding. The pencils were busy.
-
-He pointed. On a very slowly moving band a rack-full of test-tubes was
-entering a large metal box, another, rack-full was emerging. Machinery
-faintly purred. It took eight minutes for the tubes to go through, he
-
-
-
-told them. Eight minutes of hard X-rays being about as much as an
-egg can stand. A few died; of the rest, the least susceptible divided
-into two; most put out four buds; some eight; all were returned to the
-incubators, where the buds began to develop; then, after two days,
-were suddenly chilled, chilled and checked. Two, four, eight, the buds
-in their turn budded; and having budded were dosed almost to death
-with alcohol; consequently burgeoned again and having budded-bud
-out of bud out of bud-were thereafter-further arrest being generally
-fatal-left to develop in peace. By which time the original egg was in a
-fair way to becoming anything from eight to ninety-six embryos- a
-prodigious improvement, you will agree, on nature. Identical twins-but
-not in piddling twos and threes as in the old viviparous days, when an
-egg would sometimes accidentally divide; actually by dozens, by
-scores at a time.
-
-"Scores," the Director repeated and flung out his arms, as though he
-were distributing largesse. "Scores."
-
-But one of the students was fool enough to ask where the advantage
-lay.
-
-"My good boy!" The Director wheeled sharply round on him. "Can't you
-see? Can't you see?" He raised a hand; his expression was solemn.
-"Bokanovsky's Process is one of the major instruments of social stabil-
-ity!"
-
-Major instruments of social stability.
-
-Standard men and women; in uniform batches. The whole of a small
-factory staffed with the products of a single bokanovskified egg.
-
-"Ninety-six identical twins working ninety-six identical machines!" The
-voice was almost tremulous with enthusiasm. "You really know where
-you are. For the first time in history." He quoted the planetary motto.
-"Community, Identity, Stability." Grand words. "If we could bo-
-kanovskify indefinitely the whole problem would be solved."
-
-Solved by standard Gammas, unvarying Deltas, uniform Epsilons. Mil-
-lions of identical twins. The principle of mass production at last applied
-to biology.
-
-
-
-"But, alas," the Director shook his head, "we can't bokanovskify indefi-
-nitely."
-
-Ninety-six seemed to be the limit; seventy-two a good average. From
-the same ovary and with gametes of the same male to manufacture as
-many batches of identical twins as possible-that was the best (sadly a
-second best) that they could do. And even that was difficult.
-
-"For in nature it takes thirty years for two hundred eggs to reach ma-
-turity. But our business is to stabilize the population at this moment,
-here and now. Dribbling out twins over a quarter of a century-what
-would be the use of that?"
-
-Obviously, no use at all. But Podsnap's Technique had immensely ac-
-celerated the process of ripening. They could make sure of at least a
-hundred and fifty mature eggs within two years. Fertilize and bo-
-kanovskify-in other words, multiply by seventy-two-and you get an
-average of nearly eleven thousand brothers and sisters in a hundred
-and fifty batches of identical twins, all within two years of the same
-age.
-
-"And in exceptional cases we can make one ovary yield us over fifteen
-thousand adult individuals."
-
-Beckoning to a fair-haired, ruddy young man who happened to be
-passing at the moment. "Mr. Foster," he called. The ruddy young man
-approached. "Can you tell us the record for a single ovary, Mr. Foster?"
-
-"Sixteen thousand and twelve in this Centre," Mr. Foster replied with-
-out hesitation. He spoke very quickly, had a vivacious blue eye, and
-took an evident pleasure in quoting figures. "Sixteen thousand and
-twelve; in one hundred and eighty-nine batches of identicals. But of
-course they've done much better," he rattled on, "in some of the tropi-
-cal Centres. Singapore has often produced over sixteen thousand five
-hundred; and Mombasa has actually touched the seventeen thousand
-mark. But then they have unfair advantages. You should see the way a
-negro ovary responds to pituitary! It's quite astonishing, when you're
-used to working with European material. Still," he added, with a laugh
-(but the light of combat was in his eyes and the lift of his chin was
-challenging), "still, we mean to beat them if we can. I'm working on a
-wonderful Delta-Minus ovary at this moment. Only just eighteen
-
-
-
-months old. Over twelve thousand seven hundred children already, ei-
-ther decanted or in embryo. And still going strong. We'll beat them
-yet."
-
-"That's the spirit I like!" cried the Director, and clapped Mr. Foster on
-the shoulder. "Come along with us, and give these boys the benefit of
-your expert knowledge."
-
-Mr. Foster smiled modestly. "With pleasure." They went.
-In the Bottling Room all was harmonious bustle and ordered activity.
-Flaps of fresh sow's peritoneum ready cut to the proper size came
-shooting up in little lifts from the Organ Store in the sub-basement.
-Whizz and then, click! the lift-hatches hew open; the bottle-liner had
-only to reach out a hand, take the flap, insert, smooth-down, and be-
-fore the lined bottle had had time to travel out of reach along the end-
-less band, whizz, click! another flap of peritoneum had shot up from
-the depths, ready to be slipped into yet another bottle, the next of that
-slow interminable procession on the band.
-
-Next to the Liners stood the Matriculators. The procession advanced;
-one by one the eggs were transferred from their test-tubes to the
-larger containers; deftly the peritoneal lining was slit, the morula
-dropped into place, the saline solution poured in ... and already the
-bottle had passed, and it was the turn of the labellers. Heredity, date
-of fertilization, membership of Bokanovsky Group-details were trans-
-ferred from test-tube to bottle. No longer anonymous, but named,
-identified, the procession marched slowly on; on through an opening in
-the wall, slowly on into the Social Predestination Room.
-"Eighty-eight cubic metres of card-index," said Mr. Foster with relish,
-as they entered."""
-
-
-def create_setup_and_compute(
-    model_names: List[str],
-    batch_sizes: List[int],
-    slice_sizes: List[int],
-    gpu: bool = True,
-    tensorflow: bool = False,
-    average_over: int = 3,
-    no_speed: bool = False,
-    no_memory: bool = False,
-    verbose: bool = False,
-    torchscript: bool = False,
-    xla: bool = False,
-    amp: bool = False,
-    fp16: bool = False,
-    save_to_csv: bool = False,
-    csv_filename: str = f"results_{round(time())}.csv",
-    csv_memory_filename: str = f"memory_{round(time())}.csv",
-):
-    if xla:
-        tf.config.optimizer.set_jit(True)
-    if amp:
-        tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})
-
-    if tensorflow:
-        dictionary = {model_name: {} for model_name in model_names}
-        results = _compute_tensorflow(
-            model_names, batch_sizes, slice_sizes, dictionary, average_over, amp, no_speed, no_memory, verbose
-        )
-    else:
-        device = "cuda" if (gpu and torch.cuda.is_available()) else "cpu"
-        dictionary = {model_name: {} for model_name in model_names}
-        results = _compute_pytorch(
-            model_names,
-            batch_sizes,
-            slice_sizes,
-            dictionary,
-            average_over,
-            device,
-            torchscript,
-            fp16,
-            no_speed,
-            no_memory,
-            verbose,
-        )
-
-    print("=========== RESULTS ===========")
-    for model_name in model_names:
-        print("\t" + f"======= MODEL CHECKPOINT: {model_name} =======")
-        for batch_size in results[model_name]["bs"]:
-            print("\t\t" + f"===== BATCH SIZE: {batch_size} =====")
-            for slice_size in results[model_name]["ss"]:
-                result = results[model_name]["results"][batch_size][slice_size]
-                memory = results[model_name]["memory"][batch_size][slice_size]
-                if isinstance(result, str):
-                    print(f"\t\t{model_name}/{batch_size}/{slice_size}: " f"{result} " f"{memory}")
-                else:
-                    print(
-                        f"\t\t{model_name}/{batch_size}/{slice_size}: "
-                        f"{(round(1000 * result) / 1000)}"
-                        f"s "
-                        f"{memory}"
-                    )
-
-    if save_to_csv:
-        with open(csv_filename, mode="w") as csv_file, open(csv_memory_filename, mode="w") as csv_memory_file:
-            fieldnames = [
-                "model",
-                "1x8",
-                "1x64",
-                "1x128",
-                "1x256",
-                "1x512",
-                "1x1024",
-                "2x8",
-                "2x64",
-                "2x128",
-                "2x256",
-                "2x512",
-                "2x1024",
-                "4x8",
-                "4x64",
-                "4x128",
-                "4x256",
-                "4x512",
-                "4x1024",
-                "8x8",
-                "8x64",
-                "8x128",
-                "8x256",
-                "8x512",
-                "8x1024",
-            ]
-
-            writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
-            writer.writeheader()
-            memory_writer = csv.DictWriter(csv_memory_file, fieldnames=fieldnames)
-            memory_writer.writeheader()
-
-            for model_name in model_names:
-                model_results = {
-                    f"{bs}x{ss}": results[model_name]["results"][bs][ss]
-                    for bs in results[model_name]["results"]
-                    for ss in results[model_name]["results"][bs]
-                }
-                writer.writerow({"model": model_name, **model_results})
-
-                model_memory_results = {
-                    f"{bs}x{ss}": results[model_name]["memory"][bs][ss]
-                    for bs in results[model_name]["memory"]
-                    for ss in results[model_name]["memory"][bs]
-                }
-                memory_writer.writerow({"model": model_name, **model_memory_results})
-
-
-def print_summary_statistics(summary: MemorySummary):
-    print(
-        "\nLines by line memory consumption:\n"
-        + "\n".join(
-            f"{state.frame.filename}:{state.frame.line_number}: mem {state.cpu_gpu}: {state.frame.line_text}"
-            for state in summary.sequential
-        )
-    )
-    print(
-        "\nLines with top memory consumption:\n"
-        + "\n".join(
-            f"=> {state.frame.filename}:{state.frame.line_number}: mem {state.cpu_gpu}: {state.frame.line_text}"
-            for state in summary.cumulative[:6]
-        )
-    )
-    print(
-        "\nLines with lowest memory consumption:\n"
-        + "\n".join(
-            f"=> {state.frame.filename}:{state.frame.line_number}: mem {state.cpu_gpu}: {state.frame.line_text}"
-            for state in summary.cumulative[-6:]
-        )
-    )
-    print(f"\nTotal memory increase: {summary.total}")
-
-
-def _compute_pytorch(
-    model_names,
-    batch_sizes,
-    slice_sizes,
-    dictionary,
-    average_over,
-    device,
-    torchscript,
-    fp16,
-    no_speed,
-    no_memory,
-    verbose,
-):
-    for c, model_name in enumerate(model_names):
-        print(f"{c + 1} / {len(model_names)}")
-        config = AutoConfig.from_pretrained(model_name, torchscript=torchscript)
-        model = AutoModel.from_pretrained(model_name, config=config)
-        tokenizer = AutoTokenizer.from_pretrained(model_name)
-
-        tokenized_sequence = tokenizer.encode(input_text, add_special_tokens=False)
-
-        max_input_size = tokenizer.max_model_input_sizes[model_name]
-
-        dictionary[model_name] = {"bs": batch_sizes, "ss": slice_sizes, "results": {}, "memory": {}}
-        dictionary[model_name]["results"] = {i: {} for i in batch_sizes}
-        dictionary[model_name]["memory"] = {i: {} for i in batch_sizes}
-
-        for batch_size in batch_sizes:
-            if fp16:
-                model.half()
-            model.to(device)
-            model.eval()
-
-            for slice_size in slice_sizes:
-                if max_input_size is not None and slice_size > max_input_size:
-                    dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
-                else:
-                    sequence = torch.tensor(tokenized_sequence[:slice_size], device=device).repeat(batch_size, 1)
-                    try:
-                        if torchscript:
-                            print("Tracing model with sequence size", sequence.shape)
-                            inference = torch.jit.trace(model, sequence)
-                            inference(sequence)
-                        else:
-                            inference = model
-                            inference(sequence)
-
-                        if not no_memory:
-                            # model.add_memory_hooks()  # Forward method tracing (only for PyTorch models)
-
-                            # Line by line memory tracing (all code in the module `transformers`) works for all models/arbitrary code
-                            trace = start_memory_tracing("transformers")
-                            inference(sequence)
-                            summary = stop_memory_tracing(trace)
-
-                            if verbose:
-                                print_summary_statistics(summary)
-
-                            dictionary[model_name]["memory"][batch_size][slice_size] = str(summary.total)
-                        else:
-                            dictionary[model_name]["memory"][batch_size][slice_size] = "N/A"
-
-                        if not no_speed:
-                            print("Going through model with sequence of shape", sequence.shape)
-                            runtimes = timeit.repeat(lambda: inference(sequence), repeat=average_over, number=3)
-                            average_time = sum(runtimes) / float(len(runtimes)) / 3.0
-                            dictionary[model_name]["results"][batch_size][slice_size] = average_time
-                        else:
-                            dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
-
-                    except RuntimeError as e:
-                        print("Doesn't fit on GPU.", e)
-                        torch.cuda.empty_cache()
-                        dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
-                        dictionary[model_name]["memory"][batch_size][slice_size] = "N/A"
-    return dictionary
-
-
-def _compute_tensorflow(
-    model_names, batch_sizes, slice_sizes, dictionary, average_over, amp, no_speed, no_memory, verbose
-):
-    for c, model_name in enumerate(model_names):
-        print(f"{c + 1} / {len(model_names)}")
-        config = AutoConfig.from_pretrained(model_name)
-        model = TFAutoModel.from_pretrained(model_name, config=config)
-        tokenizer = AutoTokenizer.from_pretrained(model_name)
-
-        tokenized_sequence = tokenizer.encode(input_text, add_special_tokens=False)
-
-        max_input_size = tokenizer.max_model_input_sizes[model_name]
-
-        dictionary[model_name] = {"bs": batch_sizes, "ss": slice_sizes, "results": {}, "memory": {}}
-        dictionary[model_name]["results"] = {i: {} for i in batch_sizes}
-        dictionary[model_name]["memory"] = {i: {} for i in batch_sizes}
-
-        print("Using model", model)
-
-        @tf.function
-        def inference(inputs):
-            return model(inputs)
-
-        for batch_size in batch_sizes:
-            for slice_size in slice_sizes:
-                if max_input_size is not None and slice_size > max_input_size:
-                    dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
-                else:
-                    sequence = tf.stack(
-                        [tf.squeeze(tf.constant(tokenized_sequence[:slice_size])[None, :])] * batch_size
-                    )
-
-                    try:
-                        print("Going through model with sequence of shape", sequence.shape)
-                        # To make sure that the model is traced + that the tensors are on the appropriate device
-                        inference(sequence)
-
-                        if not no_memory:
-                            # Line by line memory tracing (all code in the module `transformers`) works for all models/arbitrary code
-                            trace = start_memory_tracing("transformers")
-                            inference(sequence)
-                            summary = stop_memory_tracing(trace)
-
-                            if verbose:
-                                print_summary_statistics(summary)
-
-                            dictionary[model_name]["memory"][batch_size][slice_size] = str(summary.total)
-                        else:
-                            dictionary[model_name]["memory"][batch_size][slice_size] = "N/A"
-
-                        if not no_speed:
-                            runtimes = timeit.repeat(lambda: inference(sequence), repeat=average_over, number=3)
-                            average_time = sum(runtimes) / float(len(runtimes)) / 3.0
-                            dictionary[model_name]["results"][batch_size][slice_size] = average_time
-                        else:
-                            dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
-
-                    except tf.errors.ResourceExhaustedError as e:
-                        print("Doesn't fit on GPU.", e)
-                        dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
-                        dictionary[model_name]["memory"][batch_size][slice_size] = "N/A"
-    return dictionary
-
-
-def main():
-    parser = argparse.ArgumentParser()
-
-    parser.add_argument(
-        "--models",
-        required=False,
-        type=str,
-        default="all",
-        help="Model checkpoints to be provided "
-        "to the AutoModel classes. Leave "
-        "blank to benchmark the base version "
-        "of all available model "
-        "architectures.",
-    )
-    parser.add_argument("--verbose", required=False, action="store_true", help="Verbose memory tracing")
-    parser.add_argument("--no_speed", required=False, action="store_true", help="Don't perform speed measurments")
-    parser.add_argument("--no_memory", required=False, action="store_true", help="Don't perform memory measurments")
-    parser.add_argument(
-        "--torch", required=False, action="store_true", help="Benchmark the Pytorch version of the " "models"
-    )
-    parser.add_argument(
-        "--torch_cuda", required=False, action="store_true", help="Pytorch only: run on available " "cuda devices"
-    )
-    parser.add_argument(
-        "--torchscript",
-        required=False,
-        action="store_true",
-        help="Pytorch only: trace the models " "using torchscript",
-    )
-    parser.add_argument(
-        "--tensorflow",
-        required=False,
-        action="store_true",
-        help="Benchmark the TensorFlow version "
-        "of the models. Will run on GPU if "
-        "the correct dependencies are "
-        "installed",
-    )
-    parser.add_argument("--xla", required=False, action="store_true", help="TensorFlow only: use XLA acceleration.")
-    parser.add_argument(
-        "--amp",
-        required=False,
-        action="store_true",
-        help="TensorFlow only: use automatic mixed precision acceleration.",
-    )
-    parser.add_argument(
-        "--fp16", required=False, action="store_true", help="PyTorch only: use FP16 to accelerate inference."
-    )
-    parser.add_argument(
-        "--keras_predict",
-        required=False,
-        action="store_true",
-        help="Whether to use model.predict " "instead of model() to do a " "forward pass.",
-    )
-    parser.add_argument("--save_to_csv", required=False, action="store_true", help="Save to a CSV file.")
-    parser.add_argument(
-        "--csv_filename", required=False, default=None, help="CSV filename used if saving results to csv."
-    )
-    parser.add_argument(
-        "--average_over", required=False, default=30, type=int, help="Times an experiment will be run."
-    )
-    parser.add_argument("--batch_sizes", nargs="+", type=int, default=[1, 2, 4, 8])
-    parser.add_argument("--slice_sizes", nargs="+", type=int, default=[8, 64, 128, 256, 512, 1024])
-
-    args = parser.parse_args()
-    if args.models == "all":
-        args.models = [
-            "gpt2",
-            "bert-base-cased",
-            "xlnet-base-cased",
-            "xlm-mlm-en-2048",
-            "transfo-xl-wt103",
-            "openai-gpt",
-            "distilbert-base-uncased",
-            "distilgpt2",
-            "roberta-base",
-            "ctrl",
-        ]
-    else:
-        args.models = args.models.split()
-
-    print("Running with arguments", args)
-
-    if args.torch:
-        if is_torch_available():
-            create_setup_and_compute(
-                model_names=args.models,
-                batch_sizes=args.batch_sizes,
-                slice_sizes=args.slice_sizes,
-                tensorflow=False,
-                gpu=args.torch_cuda,
-                torchscript=args.torchscript,
-                fp16=args.fp16,
-                save_to_csv=args.save_to_csv,
-                csv_filename=args.csv_filename,
-                average_over=args.average_over,
-                no_speed=args.no_speed,
-                no_memory=args.no_memory,
-                verbose=args.verbose,
-            )
-        else:
-            raise ImportError("Trying to run a PyTorch benchmark but PyTorch was not found in the environment.")
-
-    if args.tensorflow:
-        if is_tf_available():
-            create_setup_and_compute(
-                model_names=args.models,
-                batch_sizes=args.batch_sizes,
-                slice_sizes=args.slice_sizes,
-                tensorflow=True,
-                xla=args.xla,
-                amp=args.amp,
-                save_to_csv=args.save_to_csv,
-                csv_filename=args.csv_filename,
-                average_over=args.average_over,
-                no_speed=args.no_speed,
-                no_memory=args.no_memory,
-                verbose=args.verbose,
-            )
-        else:
-            raise ImportError("Trying to run a TensorFlow benchmark but TensorFlow was not found in the environment.")
-
-
-if __name__ == "__main__":
-    main()
--- a/examples/bert-loses-patience/README.md
+++ b/examples/bert-loses-patience/README.md
@@ -0,0 +1,89 @@
+# Patience-based Early Exit
+
+Patience-based Early Exit (PABEE) is a plug-and-play inference method for pretrained language models.
+It is currently implemented for BERT and ALBERT. PABEE can make your language model faster and more robust at inference time, and it can even improve the performance of ALBERT on GLUE. The only trade-off is that inference must be run with a batch size of 1.
+Learn more in the paper ["BERT Loses Patience: Fast and Robust Inference with Early Exit"](https://arxiv.org/abs/2006.04152) and the official [GitHub repo](https://github.com/JetRunner/PABEE).
+
+![PABEE](https://github.com/JetRunner/PABEE/raw/master/bert-loses-patience.png)
+
+## Training
+
+You can fine-tune a pretrained language model (BERT or ALBERT) and train the internal classifiers as shown below. The example uses ALBERT with the `albert-base-v2` checkpoint; to use BERT instead, pass a BERT checkpoint such as `bert-base-uncased` and set `--model_type` to match:
+```bash
+export GLUE_DIR=/path/to/glue_data
+export TASK_NAME=MRPC
+
+python ./run_glue_with_pabee.py \
+  --model_type albert \
+  --model_name_or_path albert-base-v2 \
+  --task_name $TASK_NAME \
+  --do_train \
+  --do_eval \
+  --do_lower_case \
+  --data_dir "$GLUE_DIR/$TASK_NAME" \
+  --max_seq_length 128 \
+  --per_gpu_train_batch_size 32 \
+  --per_gpu_eval_batch_size 32 \
+  --learning_rate 2e-5 \
+  --save_steps 50 \
+  --logging_steps 50 \
+  --num_train_epochs 5 \
+  --output_dir /path/to/save/ \
+  --evaluate_during_training
+```
+
+## Inference
+
+You can run inference with different patience settings as follows:
+```bash
+export GLUE_DIR=/path/to/glue_data
+export TASK_NAME=MRPC
+
+python ./run_glue_with_pabee.py \
+  --model_type albert \
+  --model_name_or_path /path/to/save/ \
+  --task_name $TASK_NAME \
+  --do_eval \
+  --do_lower_case \
+  --data_dir "$GLUE_DIR/$TASK_NAME" \
+  --max_seq_length 128 \
+  --per_gpu_eval_batch_size 1 \
+  --learning_rate 2e-5 \
+  --logging_steps 50 \
+  --num_train_epochs 15 \
+  --output_dir /path/to/save/ \
+  --eval_all_checkpoints \
+  --patience 3,4,5,6,7,8
+```
+where `patience` can be a comma-separated list of patience settings. Evaluating several values in one run helps determine which patience works best.
+
+When evaluating on a regression task (STS-B), you may add `--regression_threshold 0.1` to define the regression threshold.
+
+## Results
+On the GLUE dev set:
+
+| Model        | \#Param | Speed  | CoLA  | MNLI  | MRPC  | QNLI  | QQP   | RTE   | SST\-2 | STS\-B |
+|--------------|---------|--------|-------|-------|-------|-------|-------|-------|--------|--------|
+| ALBERT\-base | 12M     |        | 58\.9 | 84\.6 | 89\.5 | 91\.7 | 89\.6 | 78\.6 | 92\.8  | 89\.5  |
+| \+PABEE      | 12M     | 1\.57x | 61\.2 | 85\.1 | 90\.0 | 91\.8 | 89\.6 | 80\.1 | 93\.0  | 90\.1  |
+
+| Model         | \#Param | Speed\-up | MNLI  | SST\-2 | STS\-B |
+|---------------|---------|-----------|-------|--------|--------|
+| BERT\-base    | 108M    |           | 84\.5 | 92\.1  | 88\.9  |
+| \+PABEE       | 108M    | 1\.62x    | 83\.6 | 92\.0  | 88\.7  |
+| ALBERT\-large | 18M     |           | 86\.4 | 94\.9  | 90\.4  |
+| \+PABEE       | 18M     | 2\.42x    | 86\.8 | 95\.2  | 90\.6  |
+
+
+## Citation
+If you find this resource useful, please consider citing the following paper:
+```bibtex
+@misc{zhou2020bert,
+    title={BERT Loses Patience: Fast and Robust Inference with Early Exit},
+    author={Wangchunshu Zhou and Canwen Xu and Tao Ge and Julian McAuley and Ke Xu and Furu Wei},
+    year={2020},
+    eprint={2006.04152},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL}
+}
+```
--- a/examples/bert-loses-patience/pabee/__init__.py
+++ b/examples/bert-loses-patience/pabee/__init__.py
--- a/examples/bert-loses-patience/pabee/modeling_pabee_albert.py
+++ b/examples/bert-loses-patience/pabee/modeling_pabee_albert.py
@@ -0,0 +1,310 @@
+# coding=utf-8
+# Copyright 2020 Google AI, Google Brain, the HuggingFace Inc. team and Microsoft Corporation.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch ALBERT model with Patience-based Early Exit. """
+
+import logging
+
+import torch
+import torch.nn as nn
+from torch.nn import CrossEntropyLoss, MSELoss
+
+from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from transformers.modeling_albert import (
+    ALBERT_INPUTS_DOCSTRING,
+    ALBERT_START_DOCSTRING,
+    AlbertModel,
+    AlbertPreTrainedModel,
+    AlbertTransformer,
+)
+
+
+logger = logging.getLogger(__name__)
+
+
+class AlbertTransformerWithPabee(AlbertTransformer):
+    def adaptive_forward(self, hidden_states, current_layer, attention_mask=None, head_mask=None):
+        if current_layer == 0:
+            hidden_states = self.embedding_hidden_mapping_in(hidden_states)
+        else:
+            hidden_states = hidden_states[0]
+
+        layers_per_group = int(self.config.num_hidden_layers / self.config.num_hidden_groups)
+
+        # Index of the hidden group
+        group_idx = int(current_layer / (self.config.num_hidden_layers / self.config.num_hidden_groups))
+
+        layer_group_output = self.albert_layer_groups[group_idx](
+            hidden_states,
+            attention_mask,
+            head_mask[group_idx * layers_per_group : (group_idx + 1) * layers_per_group],
+        )
+        hidden_states = layer_group_output[0]
+
+        return (hidden_states,)
+
+
+@add_start_docstrings(
+    "The bare ALBERT Model transformer with PABEE outputting raw hidden-states without any specific head on top.",
+    ALBERT_START_DOCSTRING,
+)
+class AlbertModelWithPabee(AlbertModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.encoder = AlbertTransformerWithPabee(config)
+
+        self.init_weights()
+        self.patience = 0
+        self.inference_instances_num = 0
+        self.inference_layers_num = 0
+
+        self.regression_threshold = 0
+
+    def set_regression_threshold(self, threshold):
+        self.regression_threshold = threshold
+
+    def set_patience(self, patience):
+        self.patience = patience
+
+    def reset_stats(self):
+        self.inference_instances_num = 0
+        self.inference_layers_num = 0
+
+    def log_stats(self):
+        avg_inf_layers = self.inference_layers_num / self.inference_instances_num
+        message = f"*** Patience = {self.patience} Avg. Inference Layers = {avg_inf_layers:.2f} Speed Up = {1 - avg_inf_layers / self.config.num_hidden_layers:.2f} ***"
+        print(message)
+
+    @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        output_dropout=None,
+        output_layers=None,
+        regression=False,
+    ):
+        r"""
+    Return:
+        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
+        last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+            Sequence of hidden-states at the output of the last layer of the model.
+        pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`):
+            Last layer hidden-state of the first token of the sequence (classification token)
+            further processed by a Linear layer and a Tanh activation function. The Linear
+            layer weights are trained from the next sentence prediction (classification)
+            objective during pre-training.
+
+            This output is usually *not* a good summary
+            of the semantic content of the input, you're often better with averaging or pooling
+            the sequence of hidden-states for the whole input sequence.
+        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
+            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
+
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+        """
+
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            input_shape = input_ids.size()
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        device = input_ids.device if input_ids is not None else inputs_embeds.device
+
+        if attention_mask is None:
+            attention_mask = torch.ones(input_shape, device=device)
+        if token_type_ids is None:
+            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
+
+        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
+        extended_attention_mask = extended_attention_mask.to(dtype=self.dtype)  # fp16 compatibility
+        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
+        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
+        embedding_output = self.embeddings(
+            input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
+        )
+        encoder_outputs = embedding_output
+
+        if self.training:
+            res = []
+            for i in range(self.config.num_hidden_layers):
+                encoder_outputs = self.encoder.adaptive_forward(
+                    encoder_outputs, current_layer=i, attention_mask=extended_attention_mask, head_mask=head_mask,
+                )
+
+                pooled_output = self.pooler_activation(self.pooler(encoder_outputs[0][:, 0]))
+                logits = output_layers[i](output_dropout(pooled_output))
+                res.append(logits)
+        elif self.patience == 0:  # Use all layers for inference
+            encoder_outputs = self.encoder(encoder_outputs, extended_attention_mask, head_mask=head_mask)
+            pooled_output = self.pooler_activation(self.pooler(encoder_outputs[0][:, 0]))
+            res = [output_layers[self.config.num_hidden_layers - 1](pooled_output)]
+        else:
+            patient_counter = 0
+            patient_result = None
+            calculated_layer_num = 0
+            for i in range(self.config.num_hidden_layers):
+                calculated_layer_num += 1
+                encoder_outputs = self.encoder.adaptive_forward(
+                    encoder_outputs, current_layer=i, attention_mask=extended_attention_mask, head_mask=head_mask,
+                )
+
+                pooled_output = self.pooler_activation(self.pooler(encoder_outputs[0][:, 0]))
+                logits = output_layers[i](pooled_output)
+                if regression:
+                    labels = logits.detach()
+                    if patient_result is not None:
+                        patient_labels = patient_result.detach()
+                    if (patient_result is not None) and torch.abs(patient_result - labels) < self.regression_threshold:
+                        patient_counter += 1
+                    else:
+                        patient_counter = 0
+                else:
+                    labels = logits.detach().argmax(dim=1)
+                    if patient_result is not None:
+                        patient_labels = patient_result.detach().argmax(dim=1)
+                    if (patient_result is not None) and torch.all(labels.eq(patient_labels)):
+                        patient_counter += 1
+                    else:
+                        patient_counter = 0
+
+                patient_result = logits
+                if patient_counter == self.patience:
+                    break
+            res = [patient_result]
+            self.inference_layers_num += calculated_layer_num
+            self.inference_instances_num += 1
+
+        return res
+
+
+@add_start_docstrings(
+    """Albert Model transformer with PABEE and a sequence classification/regression head on top (a linear layer on top of
+    the pooled output) e.g. for GLUE tasks. """,
+    ALBERT_START_DOCSTRING,
+)
+class AlbertForSequenceClassificationWithPabee(AlbertPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+
+        self.albert = AlbertModelWithPabee(config)
+        self.dropout = nn.Dropout(config.classifier_dropout_prob)
+        self.classifiers = nn.ModuleList(
+            [nn.Linear(config.hidden_size, self.config.num_labels) for _ in range(config.num_hidden_layers)]
+        )
+
+        self.init_weights()
+
+    @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+    ):
+        r"""
+        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+            Labels for computing the sequence classification/regression loss.
+            Indices should be in ``[0, ..., config.num_labels - 1]``.
+            If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
+            If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
+
+    Returns:
+        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
+        loss: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Classification (or regression if config.num_labels==1) loss.
+        logits ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
+            Classification (or regression if config.num_labels==1) scores (before SoftMax).
+        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
+            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
+
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+
+        Examples::
+
+            from transformers import AlbertTokenizer
+            from pabee import AlbertForSequenceClassificationWithPabee
+            import torch
+
+            tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
+            model = AlbertForSequenceClassificationWithPabee.from_pretrained('albert-base-v2')
+            input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+            labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
+            outputs = model(input_ids, labels=labels)
+            loss, logits = outputs[:2]
+
+        """
+
+        logits = self.albert(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_dropout=self.dropout,
+            output_layers=self.classifiers,
+            regression=self.num_labels == 1,
+        )
+
+        outputs = (logits[-1],)
+
+        if labels is not None:
+            total_loss = None
+            total_weights = 0
+            for ix, logits_item in enumerate(logits):
+                if self.num_labels == 1:
+                    #  We are doing regression
+                    loss_fct = MSELoss()
+                    loss = loss_fct(logits_item.view(-1), labels.view(-1))
+                else:
+                    loss_fct = CrossEntropyLoss()
+                    loss = loss_fct(logits_item.view(-1, self.num_labels), labels.view(-1))
+                if total_loss is None:
+                    total_loss = loss
+                else:
+                    total_loss += loss * (ix + 1)
+                total_weights += ix + 1
+            outputs = (total_loss / total_weights,) + outputs
+
+        return outputs
--- a/examples/bert-loses-patience/pabee/modeling_pabee_bert.py
+++ b/examples/bert-loses-patience/pabee/modeling_pabee_bert.py
@@ -0,0 +1,342 @@
+# coding=utf-8
+# Copyright 2020 The Google AI Language Team Authors, The HuggingFace Inc. team and Microsoft Corporation.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch BERT model with Patience-based Early Exit. """
+
+
+import logging
+
+import torch
+from torch import nn
+from torch.nn import CrossEntropyLoss, MSELoss
+
+from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from transformers.modeling_bert import (
+    BERT_INPUTS_DOCSTRING,
+    BERT_START_DOCSTRING,
+    BertEncoder,
+    BertModel,
+    BertPreTrainedModel,
+)
+
+
+logger = logging.getLogger(__name__)
+
+
+class BertEncoderWithPabee(BertEncoder):
+    def adaptive_forward(self, hidden_states, current_layer, attention_mask=None, head_mask=None):
+        layer_outputs = self.layer[current_layer](hidden_states, attention_mask, head_mask[current_layer])
+
+        hidden_states = layer_outputs[0]
+
+        return hidden_states
+
+
+@add_start_docstrings(
+    "The bare Bert Model transformer with PABEE outputting raw hidden-states without any specific head on top.",
+    BERT_START_DOCSTRING,
+)
+class BertModelWithPabee(BertModel):
+    """
+
+    The model can behave as an encoder (with only self-attention) as well
+    as a decoder, in which case a layer of cross-attention is added between
+    the self-attention layers, following the architecture described in `Attention is all you need`_ by Ashish Vaswani,
+    Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
+
+    To behave as an decoder the model needs to be initialized with the
+    :obj:`is_decoder` argument of the configuration set to :obj:`True`; an
+    :obj:`encoder_hidden_states` is expected as an input to the forward pass.
+
+    .. _`Attention is all you need`:
+        https://arxiv.org/abs/1706.03762
+
+    """
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.encoder = BertEncoderWithPabee(config)
+
+        self.init_weights()
+        self.patience = 0
+        self.inference_instances_num = 0
+        self.inference_layers_num = 0
+
+        self.regression_threshold = 0
+
+    def set_regression_threshold(self, threshold):
+        self.regression_threshold = threshold
+
+    def set_patience(self, patience):
+        self.patience = patience
+
+    def reset_stats(self):
+        self.inference_instances_num = 0
+        self.inference_layers_num = 0
+
+    def log_stats(self):
+        avg_inf_layers = self.inference_layers_num / self.inference_instances_num
+        message = f"*** Patience = {self.patience} Avg. Inference Layers = {avg_inf_layers:.2f} Speed Up = {1 - avg_inf_layers / self.config.num_hidden_layers:.2f} ***"
+        print(message)
+
+    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING)
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        output_dropout=None,
+        output_layers=None,
+        regression=False,
+    ):
+        r"""
+    Return:
+        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
+        last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+            Sequence of hidden-states at the output of the last layer of the model.
+        pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`):
+            Last layer hidden-state of the first token of the sequence (classification token)
+            further processed by a Linear layer and a Tanh activation function. The Linear
+            layer weights are trained from the next sentence prediction (classification)
+            objective during pre-training.
+
+            This output is usually *not* a good summary
+            of the semantic content of the input, you're often better with averaging or pooling
+            the sequence of hidden-states for the whole input sequence.
+        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
+            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
+
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+        """
+
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            input_shape = input_ids.size()
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        device = input_ids.device if input_ids is not None else inputs_embeds.device
+
+        if attention_mask is None:
+            attention_mask = torch.ones(input_shape, device=device)
+        if token_type_ids is None:
+            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
+
+        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+        # ourselves in which case we just need to make it broadcastable to all heads.
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
+
+        # If a 2D ou 3D attention mask is provided for the cross-attention
+        # we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length]
+        if self.config.is_decoder and encoder_hidden_states is not None:
+            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
+            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
+            if encoder_attention_mask is None:
+                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
+            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
+        else:
+            encoder_extended_attention_mask = None
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
+        embedding_output = self.embeddings(
+            input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
+        )
+        encoder_outputs = embedding_output
+
+        if self.training:
+            res = []
+            for i in range(self.config.num_hidden_layers):
+                encoder_outputs = self.encoder.adaptive_forward(
+                    encoder_outputs, current_layer=i, attention_mask=extended_attention_mask, head_mask=head_mask
+                )
+
+                pooled_output = self.pooler(encoder_outputs)
+                logits = output_layers[i](output_dropout(pooled_output))
+                res.append(logits)
+        elif self.patience == 0:  # Use all layers for inference
+            encoder_outputs = self.encoder(
+                embedding_output,
+                attention_mask=extended_attention_mask,
+                head_mask=head_mask,
+                encoder_hidden_states=encoder_hidden_states,
+                encoder_attention_mask=encoder_extended_attention_mask,
+            )
+            pooled_output = self.pooler(encoder_outputs[0])
+            res = [output_layers[self.config.num_hidden_layers - 1](pooled_output)]
+        else:
+            patient_counter = 0
+            patient_result = None
+            calculated_layer_num = 0
+            for i in range(self.config.num_hidden_layers):
+                calculated_layer_num += 1
+                encoder_outputs = self.encoder.adaptive_forward(
+                    encoder_outputs, current_layer=i, attention_mask=extended_attention_mask, head_mask=head_mask
+                )
+
+                pooled_output = self.pooler(encoder_outputs)
+                logits = output_layers[i](pooled_output)
+                if regression:
+                    labels = logits.detach()
+                    if patient_result is not None:
+                        patient_labels = patient_result.detach()
+                    if (patient_result is not None) and torch.abs(patient_result - labels) < self.regression_threshold:
+                        patient_counter += 1
+                    else:
+                        patient_counter = 0
+                else:
+                    labels = logits.detach().argmax(dim=1)
+                    if patient_result is not None:
+                        patient_labels = patient_result.detach().argmax(dim=1)
+                    if (patient_result is not None) and torch.all(labels.eq(patient_labels)):
+                        patient_counter += 1
+                    else:
+                        patient_counter = 0
+
+                patient_result = logits
+                if patient_counter == self.patience:
+                    break
+            res = [patient_result]
+            self.inference_layers_num += calculated_layer_num
+            self.inference_instances_num += 1
+
+        return res
+
+
+@add_start_docstrings(
+    """Bert Model transformer with PABEE and a sequence classification/regression head on top (a linear layer on top of
+    the pooled output) e.g. for GLUE tasks. """,
+    BERT_START_DOCSTRING,
+)
+class BertForSequenceClassificationWithPabee(BertPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+
+        self.bert = BertModelWithPabee(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifiers = nn.ModuleList(
+            [nn.Linear(config.hidden_size, self.config.num_labels) for _ in range(config.num_hidden_layers)]
+        )
+
+        self.init_weights()
+
+    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING)
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+    ):
+        r"""
+        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+            Labels for computing the sequence classification/regression loss.
+            Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
+            If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+
+    Returns:
+        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
+        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
+            Classification (or regression if config.num_labels==1) loss.
+        logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
+            Classification (or regression if config.num_labels==1) scores (before SoftMax).
+        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
+            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
+
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+
+    Examples::
+
+        from transformers import BertTokenizer, BertForSequenceClassification
+        from pabee import BertForSequenceClassificationWithPabee
+        import torch
+
+        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        model = BertForSequenceClassificationWithPabee.from_pretrained('bert-base-uncased')
+
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
+        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids, labels=labels)
+
+        loss, logits = outputs[:2]
+
+        """
+
+        logits = self.bert(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_dropout=self.dropout,
+            output_layers=self.classifiers,
+            regression=self.num_labels == 1,
+        )
+
+        outputs = (logits[-1],)
+
+        if labels is not None:
+            total_loss = None
+            total_weights = 0
+            for ix, logits_item in enumerate(logits):
+                if self.num_labels == 1:
+                    #  We are doing regression
+                    loss_fct = MSELoss()
+                    loss = loss_fct(logits_item.view(-1), labels.view(-1))
+                else:
+                    loss_fct = CrossEntropyLoss()
+                    loss = loss_fct(logits_item.view(-1, self.num_labels), labels.view(-1))
+                if total_loss is None:
+                    total_loss = loss
+                else:
+                    total_loss += loss * (ix + 1)
+                total_weights += ix + 1
+            outputs = (total_loss / total_weights,) + outputs
+
+        return outputs
--- a/examples/bert-loses-patience/run_glue_with_pabee.py
+++ b/examples/bert-loses-patience/run_glue_with_pabee.py
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright 2020 The Google AI Language Team Authors, The HuggingFace Inc. team and Microsoft Corporation.
 # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -13,7 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" Finetuning the library models for sequence classification on GLUE (Bert, XLM, XLNet, RoBERTa, Albert, XLM-RoBERTa)."""
+""" Training and inference using the library models for sequence classification on GLUE (Bert, Albert) with PABEE."""


 import argparse
@@ -29,13 +29,15 @@ from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, Tenso
 from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm, trange

+from pabee.modeling_pabee_albert import AlbertForSequenceClassificationWithPabee
+from pabee.modeling_pabee_bert import BertForSequenceClassificationWithPabee
 from transformers import (
-    MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
    WEIGHTS_NAME,
    AdamW,
-    AutoConfig,
-    AutoModelForSequenceClassification,
-    AutoTokenizer,
+    AlbertConfig,
+    AlbertTokenizer,
+    BertConfig,
+    BertTokenizer,
    get_linear_schedule_with_warmup,
 )
 from transformers import glue_compute_metrics as compute_metrics
@@ -52,10 +54,10 @@ except ImportError:

 logger = logging.getLogger(__name__)

-MODEL_CONFIG_CLASSES = list(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING.keys())
-MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
-
-ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in MODEL_CONFIG_CLASSES), (),)
+MODEL_CLASSES = {
+    "bert": (BertConfig, BertForSequenceClassificationWithPabee, BertTokenizer),
+    "albert": (AlbertConfig, AlbertForSequenceClassificationWithPabee, AlbertTokenizer),
+}


 def set_seed(args):
@@ -140,18 +142,17 @@ def train(args, train_dataset, model, tokenizer):
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path):
-        # set global_step to global_step of last saved checkpoint from model path
-        try:
-            global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
-        except ValueError:
-            global_step = 0
+        # set global_step to global_step of last saved checkpoint from model path
+        global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
        epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
        steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)

        logger.info("  Continuing training from checkpoint, will skip to saved global_step")
        logger.info("  Continuing training from epoch %d", epochs_trained)
        logger.info("  Continuing training from global step %d", global_step)
-        logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
+        logger.info(
+            "  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch,
+        )

    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
@@ -170,11 +171,12 @@ def train(args, train_dataset, model, tokenizer):

            model.train()
            batch = tuple(t.to(args.device) for t in batch)
-            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
-            if args.model_type != "distilbert":
-                inputs["token_type_ids"] = (
-                    batch[2] if args.model_type in ["bert", "xlnet", "albert"] else None
-                )  # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids
+            inputs = {
+                "input_ids": batch[0],
+                "attention_mask": batch[1],
+                "labels": batch[3],
+            }
+            inputs["token_type_ids"] = batch[2]
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

@@ -190,11 +192,7 @@ def train(args, train_dataset, model, tokenizer):
                loss.backward()

            tr_loss += loss.item()
-            if (step + 1) % args.gradient_accumulation_steps == 0 or (
-                # last step in epoch but step is always smaller than gradient_accumulation_steps
-                len(epoch_iterator) <= args.gradient_accumulation_steps
-                and (step + 1) == len(epoch_iterator)
-            ):
+            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
@@ -228,8 +226,6 @@ def train(args, train_dataset, model, tokenizer):
                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
-                    if not os.path.exists(output_dir):
-                        os.makedirs(output_dir)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
@@ -256,7 +252,19 @@ def train(args, train_dataset, model, tokenizer):
    return global_step, tr_loss / global_step


-def evaluate(args, model, tokenizer, prefix=""):
+def evaluate(args, model, tokenizer, prefix="", patience=0):
+
+    if args.model_type == "albert":
+        model.albert.set_regression_threshold(args.regression_threshold)
+        model.albert.set_patience(patience)
+        model.albert.reset_stats()
+    elif args.model_type == "bert":
+        model.bert.set_regression_threshold(args.regression_threshold)
+        model.bert.set_patience(patience)
+        model.bert.reset_stats()
+    else:
+        raise NotImplementedError()
+
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
    eval_outputs_dirs = (args.output_dir, args.output_dir + "-MM") if args.task_name == "mnli" else (args.output_dir,)
@@ -290,11 +298,12 @@ def evaluate(args, model, tokenizer, prefix=""):
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
-                inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
-                if args.model_type != "distilbert":
-                    inputs["token_type_ids"] = (
-                        batch[2] if args.model_type in ["bert", "xlnet", "albert"] else None
-                    )  # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids
+                inputs = {
+                    "input_ids": batch[0],
+                    "attention_mask": batch[1],
+                    "labels": batch[3],
+                }
+                inputs["token_type_ids"] = batch[2]
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]

@@ -320,8 +329,17 @@ def evaluate(args, model, tokenizer, prefix=""):
            logger.info("***** Eval results {} *****".format(prefix))
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
+                print("  %s = %s" % (key, str(result[key])))
                writer.write("%s = %s\n" % (key, str(result[key])))

+    if args.eval_all_checkpoints and patience != 0:
+        if args.model_type == "albert":
+            model.albert.log_stats()
+        elif args.model_type == "bert":
+            model.bert.log_stats()
+        else:
+            raise NotImplementedError()
+
    return results


@@ -354,14 +372,7 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
            processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
        )
        features = convert_examples_to_features(
-            examples,
-            tokenizer,
-            label_list=label_list,
-            max_length=args.max_seq_length,
-            output_mode=output_mode,
-            pad_on_left=bool(args.model_type in ["xlnet"]),  # pad on the left for xlnet
-            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
-            pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
+            examples, tokenizer, label_list=label_list, max_length=args.max_seq_length, output_mode=output_mode,
        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
@@ -399,14 +410,14 @@ def main():
        default=None,
        type=str,
        required=True,
-        help="Model type selected in the list: " + ", ".join(MODEL_TYPES),
+        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
-        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
+        help="Path to pre-trained model or shortcut name.",
    )
    parser.add_argument(
        "--task_name",
@@ -422,6 +433,12 @@ def main():
        required=True,
        help="The output directory where the model predictions and checkpoints will be written.",
    )
+    parser.add_argument(
+        "--patience", default="0", type=str, required=False,
+    )
+    parser.add_argument(
+        "--regression_threshold", default=0, type=float, required=False,
+    )

    # Other parameters
    parser.add_argument(
@@ -459,7 +476,7 @@ def main():
        "--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.",
    )
    parser.add_argument(
-        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation.",
+        "--per_gpu_eval_batch_size", default=1, type=int, help="Batch size per GPU/CPU for evaluation.",
    )
    parser.add_argument(
        "--gradient_accumulation_steps",
@@ -467,7 +484,9 @@ def main():
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )
-    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
+    parser.add_argument(
+        "--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.",
+    )
    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
@@ -483,7 +502,9 @@ def main():
    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")

    parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.")
-    parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.")
+    parser.add_argument(
+        "--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.",
+    )
    parser.add_argument(
        "--eval_all_checkpoints",
        action="store_true",
@@ -510,7 +531,9 @@ def main():
        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html",
    )
-    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument(
+        "--local_rank", type=int, default=-1, help="For distributed training: local_rank",
+    )
    parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
    parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
    args = parser.parse_args()
@@ -539,7 +562,7 @@ def main():
    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
+        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
@@ -574,23 +597,27 @@ def main():
    label_list = processor.get_labels()
    num_labels = len(label_list)

+    if args.patience != "0" and args.per_gpu_eval_batch_size != 1:
+        raise ValueError("The eval batch size must be 1 with PABEE inference on.")
+
    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    args.model_type = args.model_type.lower()
-    config = AutoConfig.from_pretrained(
+    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
+    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=args.task_name,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
-    tokenizer = AutoTokenizer.from_pretrained(
+    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
-    model = AutoModelForSequenceClassification.from_pretrained(
+    model = model_class.from_pretrained(
        args.model_name_or_path,
        from_tf=bool(".ckpt" in args.model_name_or_path),
        config=config,
@@ -602,6 +629,14 @@ def main():

    model.to(args.device)

+    print("Total Model Parameters:", sum(param.numel() for param in model.parameters()))
+    output_layers_param_num = sum(param.numel() for param in model.classifiers.parameters())
+    print("Output Layers Parameters:", output_layers_param_num)
+    single_output_layer_param_num = sum(param.numel() for param in model.classifiers[0].parameters())
+    print(
+        "Added Output Layers Parameters:", output_layers_param_num - single_output_layer_param_num,
+    )
+
    logger.info("Training/evaluation parameters %s", args)

    # Training
@@ -612,10 +647,6 @@ def main():

    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
-        # Create output directory if needed
-        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
-            os.makedirs(args.output_dir)
-
        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
@@ -629,14 +660,15 @@ def main():
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
-        model = AutoModelForSequenceClassification.from_pretrained(args.output_dir)
-        tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
+        model = model_class.from_pretrained(args.output_dir)
+        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
        model.to(args.device)

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
-        tokenizer = AutoTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
+        patience_list = [int(x) for x in args.patience.split(",")]
+        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
@@ -644,16 +676,20 @@ def main():
            )
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
+
        for checkpoint in checkpoints:
+
            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""

-            model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
+            model = model_class.from_pretrained(checkpoint)
            model.to(args.device)
-            result = evaluate(args, model, tokenizer, prefix=prefix)
-            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
-            results.update(result)

+            print(f"Evaluation for checkpoint {prefix}")
+            for patience in patience_list:
+                result = evaluate(args, model, tokenizer, prefix=prefix, patience=patience)
+                result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
+                results.update(result)
    return results


--- a/examples/bert-loses-patience/test_run_glue_with_pabee.py
+++ b/examples/bert-loses-patience/test_run_glue_with_pabee.py
@@ -0,0 +1,48 @@
+import argparse
+import logging
+import sys
+import unittest
+from unittest.mock import patch
+
+import run_glue_with_pabee
+
+
+logging.basicConfig(level=logging.DEBUG)
+
+logger = logging.getLogger()
+
+
+def get_setup_file():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-f")
+    args = parser.parse_args()
+    return args.f
+
+
+class PabeeTests(unittest.TestCase):
+    def test_run_glue(self):
+        stream_handler = logging.StreamHandler(sys.stdout)
+        logger.addHandler(stream_handler)
+
+        testargs = """
+            run_glue_with_pabee.py
+            --model_type albert
+            --model_name_or_path albert-base-v2
+            --data_dir ./tests/fixtures/tests_samples/MRPC/
+            --task_name mrpc
+            --do_train
+            --do_eval
+            --output_dir ./tests/fixtures/tests_samples/temp_dir
+            --per_gpu_train_batch_size=2
+            --per_gpu_eval_batch_size=1
+            --learning_rate=2e-5
+            --max_steps=50
+            --warmup_steps=2
+            --overwrite_output_dir
+            --seed=42
+            --max_seq_length=128
+            """.split()
+        with patch.object(sys, "argv", testargs):
+            result = run_glue_with_pabee.main()
+            for value in result.values():
+                self.assertGreaterEqual(value, 0.75)
--- a/examples/bertology/run_bertology.py
+++ b/examples/bertology/run_bertology.py
@@ -30,10 +30,17 @@ from torch.utils.data import DataLoader, SequentialSampler, Subset
 from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm

-from run_glue import ALL_MODELS, MODEL_CLASSES, load_and_cache_examples, set_seed
-from transformers import glue_compute_metrics as compute_metrics
-from transformers import glue_output_modes as output_modes
-from transformers import glue_processors as processors
+from transformers import (
+    AutoConfig,
+    AutoModelForSequenceClassification,
+    AutoTokenizer,
+    GlueDataset,
+    default_data_collator,
+    glue_compute_metrics,
+    glue_output_modes,
+    glue_processors,
+    set_seed,
+)


 logger = logging.getLogger(__name__)
@@ -57,32 +64,35 @@ def print_2d_tensor(tensor):


 def compute_heads_importance(
-    args, model, eval_dataloader, compute_entropy=True, compute_importance=True, head_mask=None
+    args, model, eval_dataloader, compute_entropy=True, compute_importance=True, head_mask=None, actually_pruned=False
 ):
    """ This method shows how to compute:
        - head attention entropy
        - head importance scores according to http://arxiv.org/abs/1905.10650
    """
    # Prepare our tensors
-    n_layers, n_heads = model.bert.config.num_hidden_layers, model.bert.config.num_attention_heads
+    n_layers, n_heads = model.config.num_hidden_layers, model.config.num_attention_heads
    head_importance = torch.zeros(n_layers, n_heads).to(args.device)
    attn_entropy = torch.zeros(n_layers, n_heads).to(args.device)

    if head_mask is None:
        head_mask = torch.ones(n_layers, n_heads).to(args.device)
+
    head_mask.requires_grad_(requires_grad=True)
+    # If the attention heads have actually been pruned, set head_mask to None to avoid a shape mismatch
+    if actually_pruned:
+        head_mask = None
+
    preds = None
    labels = None
    tot_tokens = 0.0

-    for step, batch in enumerate(tqdm(eval_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
-        batch = tuple(t.to(args.device) for t in batch)
-        input_ids, input_mask, segment_ids, label_ids = batch
+    for step, inputs in enumerate(tqdm(eval_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
+        for k, v in inputs.items():
+            inputs[k] = v.to(args.device)

        # Do a forward pass (not with torch.no_grad() since we need gradients for importance score - see below)
-        outputs = model(
-            input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids, head_mask=head_mask
-        )
+        outputs = model(**inputs, head_mask=head_mask)
        loss, logits, all_attentions = (
            outputs[0],
            outputs[1],
@@ -92,7 +102,7 @@ def compute_heads_importance(

        if compute_entropy:
            for layer, attn in enumerate(all_attentions):
-                masked_entropy = entropy(attn.detach()) * input_mask.float().unsqueeze(1)
+                masked_entropy = entropy(attn.detach()) * inputs["attention_mask"].float().unsqueeze(1)
                attn_entropy[layer] += masked_entropy.sum(-1).sum(0).detach()

        if compute_importance:
@@ -101,12 +111,12 @@ def compute_heads_importance(
        # Also store our logits/labels if we want to compute metrics afterwards
        if preds is None:
            preds = logits.detach().cpu().numpy()
-            labels = label_ids.detach().cpu().numpy()
+            labels = inputs["labels"].detach().cpu().numpy()
        else:
            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
-            labels = np.append(labels, label_ids.detach().cpu().numpy(), axis=0)
+            labels = np.append(labels, inputs["labels"].detach().cpu().numpy(), axis=0)

-        tot_tokens += input_mask.float().detach().sum().data
+        tot_tokens += inputs["attention_mask"].float().detach().sum().data

    # Normalize
    attn_entropy /= tot_tokens
@@ -145,7 +155,7 @@ def mask_heads(args, model, eval_dataloader):
    """
    _, head_importance, preds, labels = compute_heads_importance(args, model, eval_dataloader, compute_entropy=False)
    preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
-    original_score = compute_metrics(args.task_name, preds, labels)[args.metric_name]
+    original_score = glue_compute_metrics(args.task_name, preds, labels)[args.metric_name]
    logger.info("Pruning: original score: %f, threshold: %f", original_score, original_score * args.masking_threshold)

    new_head_mask = torch.ones_like(head_importance)
@@ -167,6 +177,7 @@ def mask_heads(args, model, eval_dataloader):
        new_head_mask = new_head_mask.view(-1)
        new_head_mask[current_heads_to_mask] = 0.0
        new_head_mask = new_head_mask.view_as(head_mask)
+        new_head_mask = new_head_mask.clone().detach()
        print_2d_tensor(new_head_mask)

        # Compute metric and head importance again
@@ -174,9 +185,9 @@ def mask_heads(args, model, eval_dataloader):
            args, model, eval_dataloader, compute_entropy=False, head_mask=new_head_mask
        )
        preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
-        current_score = compute_metrics(args.task_name, preds, labels)[args.metric_name]
+        current_score = glue_compute_metrics(args.task_name, preds, labels)[args.metric_name]
        logger.info(
-            "Masking: current score: %f, remaning heads %d (%.1f percents)",
+            "Masking: current score: %f, remaining heads %d (%.1f percents)",
            current_score,
            new_head_mask.sum(),
            new_head_mask.sum() / new_head_mask.numel() * 100,
@@ -200,21 +211,30 @@ def prune_heads(args, model, eval_dataloader, head_mask):
        args, model, eval_dataloader, compute_entropy=False, compute_importance=False, head_mask=head_mask
    )
    preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
-    score_masking = compute_metrics(args.task_name, preds, labels)[args.metric_name]
+    score_masking = glue_compute_metrics(args.task_name, preds, labels)[args.metric_name]
    original_time = datetime.now() - before_time

    original_num_params = sum(p.numel() for p in model.parameters())
-    heads_to_prune = dict((layer, (1 - head_mask[layer].long()).nonzero().tolist()) for layer in range(len(head_mask)))
+    heads_to_prune = dict(
+        (layer, (1 - head_mask[layer].long()).nonzero().squeeze().tolist()) for layer in range(len(head_mask))
+    )
+
    assert sum(len(h) for h in heads_to_prune.values()) == (1 - head_mask.long()).sum().item()
    model.prune_heads(heads_to_prune)
    pruned_num_params = sum(p.numel() for p in model.parameters())

    before_time = datetime.now()
    _, _, preds, labels = compute_heads_importance(
-        args, model, eval_dataloader, compute_entropy=False, compute_importance=False, head_mask=None
+        args,
+        model,
+        eval_dataloader,
+        compute_entropy=False,
+        compute_importance=False,
+        head_mask=None,
+        actually_pruned=True,
    )
    preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
-    score_pruning = compute_metrics(args.task_name, preds, labels)[args.metric_name]
+    score_pruning = glue_compute_metrics(args.task_name, preds, labels)[args.metric_name]
    new_time = datetime.now() - before_time

    logger.info(
@@ -242,14 +262,14 @@ def main():
        default=None,
        type=str,
        required=True,
-        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
+        help="Path to pretrained model or model identifier from huggingface.co/models",
    )
    parser.add_argument(
        "--task_name",
        default=None,
        type=str,
        required=True,
-        help="The name of the task to train selected in the list: " + ", ".join(processors.keys()),
+        help="The name of the task to train selected in the list: " + ", ".join(glue_processors.keys()),
    )
    parser.add_argument(
        "--output_dir",
@@ -274,7 +294,7 @@ def main():
    )
    parser.add_argument(
        "--cache_dir",
-        default="",
+        default=None,
        type=str,
        help="Where do you want to store the pre-trained models downloaded from s3",
    )
@@ -350,48 +370,40 @@ def main():
    logger.info("device: {} n_gpu: {}, distributed: {}".format(args.device, args.n_gpu, bool(args.local_rank != -1)))

    # Set seeds
-    set_seed(args)
+    set_seed(args.seed)

    # Prepare GLUE task
    args.task_name = args.task_name.lower()
-    if args.task_name not in processors:
+    if args.task_name not in glue_processors:
        raise ValueError("Task not found: %s" % (args.task_name))
-    processor = processors[args.task_name]()
-    args.output_mode = output_modes[args.task_name]
+    processor = glue_processors[args.task_name]()
+    args.output_mode = glue_output_modes[args.task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
-    if args.local_rank not in [-1, 0]:
-        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
+    #
+    # Distributed training:
+    # The .from_pretrained methods guarantee that only one local process can concurrently
+    # download model & vocab.

-    args.model_type = ""
-    for key in MODEL_CLASSES:
-        if key in args.model_name_or_path.lower():
-            args.model_type = key  # take the first match in model types
-            break
-    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
-    config = config_class.from_pretrained(
+    config = AutoConfig.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=args.task_name,
        output_attentions=True,
-        cache_dir=args.cache_dir if args.cache_dir else None,
+        cache_dir=args.cache_dir,
    )
-    tokenizer = tokenizer_class.from_pretrained(
-        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
-        cache_dir=args.cache_dir if args.cache_dir else None,
+    tokenizer = AutoTokenizer.from_pretrained(
+        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, cache_dir=args.cache_dir,
    )
-    model = model_class.from_pretrained(
+    model = AutoModelForSequenceClassification.from_pretrained(
        args.model_name_or_path,
        from_tf=bool(".ckpt" in args.model_name_or_path),
        config=config,
-        cache_dir=args.cache_dir if args.cache_dir else None,
+        cache_dir=args.cache_dir,
    )

-    if args.local_rank == 0:
-        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
-
    # Distributed and parallel training
    model.to(args.device)
    if args.local_rank != -1:
@@ -402,15 +414,18 @@ def main():
        model = torch.nn.DataParallel(model)

    # Print/save training arguments
+    os.makedirs(args.output_dir, exist_ok=True)
    torch.save(args, os.path.join(args.output_dir, "run_args.bin"))
    logger.info("Training/evaluation parameters %s", args)

    # Prepare dataset for the GLUE task
-    eval_data = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=True)
+    eval_dataset = GlueDataset(args, tokenizer=tokenizer, mode="dev")
    if args.data_subset > 0:
-        eval_data = Subset(eval_data, list(range(min(args.data_subset, len(eval_data)))))
-    eval_sampler = SequentialSampler(eval_data) if args.local_rank == -1 else DistributedSampler(eval_data)
-    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size)
+        eval_dataset = Subset(eval_dataset, list(range(min(args.data_subset, len(eval_dataset)))))
+    eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
+    eval_dataloader = DataLoader(
+        eval_dataset, sampler=eval_sampler, batch_size=args.batch_size, collate_fn=default_data_collator
+    )

    # Compute head entropy and importance score
    compute_heads_importance(args, model, eval_dataloader)
--- a/examples/contrib/mm-imdb/README.md
+++ b/examples/contrib/mm-imdb/README.md
@@ -0,0 +1,23 @@
+## MM-IMDb
+
+Based on the script [`run_mmimdb.py`](https://github.com/huggingface/transformers/blob/master/examples/contrib/mm-imdb/run_mmimdb.py).
+
+[MM-IMDb](http://lisi1.unal.edu.co/mmimdb/) is a multimodal dataset of around 26,000 movies, including images, plots and other metadata.
+
+### Training on MM-IMDb
+
+```
+python run_mmimdb.py \
+    --data_dir /path/to/mmimdb/dataset/ \
+    --model_type bert \
+    --model_name_or_path bert-base-uncased \
+    --output_dir /path/to/save/dir/ \
+    --do_train \
+    --do_eval \
+    --max_seq_len 512 \
+    --gradient_accumulation_steps 20 \
+    --num_image_embeds 3 \
+    --num_train_epochs 100 \
+    --patience 5
+```
+
--- a/examples/contrib/mm-imdb/run_mmimdb.py
+++ b/examples/contrib/mm-imdb/run_mmimdb.py
@@ -34,26 +34,11 @@ from tqdm import tqdm, trange
 from transformers import (
    WEIGHTS_NAME,
    AdamW,
-    AlbertConfig,
-    AlbertModel,
-    AlbertTokenizer,
-    BertConfig,
-    BertModel,
-    BertTokenizer,
-    DistilBertConfig,
-    DistilBertModel,
-    DistilBertTokenizer,
+    AutoConfig,
+    AutoModel,
+    AutoTokenizer,
    MMBTConfig,
    MMBTForClassification,
-    RobertaConfig,
-    RobertaModel,
-    RobertaTokenizer,
-    XLMConfig,
-    XLMModel,
-    XLMTokenizer,
-    XLNetConfig,
-    XLNetModel,
-    XLNetTokenizer,
    get_linear_schedule_with_warmup,
 )
 from utils_mmimdb import ImageEncoder, JsonlDataset, collate_fn, get_image_transforms, get_mmimdb_labels
@@ -67,23 +52,6 @@ except ImportError:

 logger = logging.getLogger(__name__)

-ALL_MODELS = sum(
-    (
-        tuple(conf.pretrained_config_archive_map.keys())
-        for conf in (BertConfig, XLNetConfig, XLMConfig, RobertaConfig, DistilBertConfig)
-    ),
-    (),
-)
-
-MODEL_CLASSES = {
-    "bert": (BertConfig, BertModel, BertTokenizer),
-    "xlnet": (XLNetConfig, XLNetModel, XLNetTokenizer),
-    "xlm": (XLMConfig, XLMModel, XLMTokenizer),
-    "roberta": (RobertaConfig, RobertaModel, RobertaTokenizer),
-    "distilbert": (DistilBertConfig, DistilBertModel, DistilBertTokenizer),
-    "albert": (AlbertConfig, AlbertModel, AlbertTokenizer),
-}
-

 def set_seed(args):
    random.seed(args.seed)
@@ -278,7 +246,7 @@ def evaluate(args, model, tokenizer, criterion, prefix=""):
    )

    # multi-gpu eval
-    if args.n_gpu > 1:
+    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
@@ -351,19 +319,12 @@ def main():
        required=True,
        help="The input data dir. Should contain the .jsonl files for MMIMDB.",
    )
-    parser.add_argument(
-        "--model_type",
-        default=None,
-        type=str,
-        required=True,
-        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
-    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
-        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
+        help="Path to pretrained model or model identifier from huggingface.co/models",
    )
    parser.add_argument(
        "--output_dir",
@@ -385,7 +346,7 @@ def main():
    )
    parser.add_argument(
        "--cache_dir",
-        default="",
+        default=None,
        type=str,
        help="Where do you want to store the pre-trained models downloaded from s3",
    )
@@ -526,18 +487,14 @@ def main():
    # Setup model
    labels = get_mmimdb_labels()
    num_labels = len(labels)
-    args.model_type = args.model_type.lower()
-    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
-    transformer_config = config_class.from_pretrained(
-        args.config_name if args.config_name else args.model_name_or_path
-    )
-    tokenizer = tokenizer_class.from_pretrained(
+    transformer_config = AutoConfig.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
+    tokenizer = AutoTokenizer.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
-        cache_dir=args.cache_dir if args.cache_dir else None,
+        cache_dir=args.cache_dir,
    )
-    transformer = model_class.from_pretrained(
-        args.model_name_or_path, config=transformer_config, cache_dir=args.cache_dir if args.cache_dir else None
+    transformer = AutoModel.from_pretrained(
+        args.model_name_or_path, config=transformer_config, cache_dir=args.cache_dir
    )
    img_encoder = ImageEncoder(args)
    config = MMBTConfig(transformer_config, num_labels=num_labels)
@@ -564,10 +521,6 @@ def main():

    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
-        # Create output directory if needed
-        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
-            os.makedirs(args.output_dir)
-
        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
@@ -583,13 +536,12 @@ def main():
        # Load a trained model and vocabulary that you have fine-tuned
        model = MMBTForClassification(config, transformer, img_encoder)
        model.load_state_dict(torch.load(os.path.join(args.output_dir, WEIGHTS_NAME)))
-        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
+        tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
        model.to(args.device)

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
-        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
--- a/examples/contrib/mm-imdb/utils_mmimdb.py
+++ b/examples/contrib/mm-imdb/utils_mmimdb.py
--- a/examples/contrib/run_swag.py
+++ b/examples/contrib/run_swag.py
@@ -31,14 +31,8 @@ from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, Tenso
 from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm, trange

-from transformers import (
-    WEIGHTS_NAME,
-    AdamW,
-    BertConfig,
-    BertForMultipleChoice,
-    BertTokenizer,
-    get_linear_schedule_with_warmup,
-)
+from transformers import WEIGHTS_NAME, AdamW, AutoConfig, AutoTokenizer, get_linear_schedule_with_warmup
+from transformers.modeling_auto import AutoModelForMultipleChoice


 try:
@@ -49,12 +43,6 @@ except ImportError:

 logger = logging.getLogger(__name__)

-ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in [BertConfig]), ())
-
-MODEL_CLASSES = {
-    "bert": (BertConfig, BertForMultipleChoice, BertTokenizer),
-}
-

 class SwagExample(object):
    """A single training/test example for the SWAG dataset."""
@@ -395,8 +383,6 @@ def train(args, train_dataset, model, tokenizer):
                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
-                    if not os.path.exists(output_dir):
-                        os.makedirs(output_dir)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
@@ -492,19 +478,12 @@ def main():
        required=True,
        help="SWAG csv for predictions. E.g., val.csv or test.csv",
    )
-    parser.add_argument(
-        "--model_type",
-        default=None,
-        type=str,
-        required=True,
-        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
-    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
-        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
+        help="Path to pretrained model or model identifier from huggingface.co/models",
    )
    parser.add_argument(
        "--output_dir",
@@ -536,9 +515,6 @@ def main():
    parser.add_argument(
        "--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step."
    )
-    parser.add_argument(
-        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
-    )

    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
    parser.add_argument(
@@ -652,13 +628,9 @@ def main():
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

-    args.model_type = args.model_type.lower()
-    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
-    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
-    tokenizer = tokenizer_class.from_pretrained(
-        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case
-    )
-    model = model_class.from_pretrained(
+    config = AutoConfig.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
+    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,)
+    model = AutoModelForMultipleChoice.from_pretrained(
        args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config
    )

@@ -677,10 +649,6 @@ def main():

    # Save the trained model and the tokenizer
    if args.local_rank == -1 or torch.distributed.get_rank() == 0:
-        # Create output directory if needed
-        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
-            os.makedirs(args.output_dir)
-
        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
@@ -694,8 +662,8 @@ def main():
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
-        model = model_class.from_pretrained(args.output_dir)
-        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
+        model = AutoModelForMultipleChoice.from_pretrained(args.output_dir)
+        tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
        model.to(args.device)

    # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
@@ -718,8 +686,8 @@ def main():
        for checkpoint in checkpoints:
            # Reload the model
            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
-            model = model_class.from_pretrained(checkpoint)
-            tokenizer = tokenizer_class.from_pretrained(checkpoint)
+            model = AutoModelForMultipleChoice.from_pretrained(checkpoint)
+            tokenizer = AutoTokenizer.from_pretrained(checkpoint)
            model.to(args.device)

            # Evaluate
--- a/examples/contrib/run_transfo_xl.py
+++ b/examples/contrib/run_transfo_xl.py
@@ -80,7 +80,7 @@ def main():

    # Load a pre-trained model
    model = TransfoXLLMHeadModel.from_pretrained(args.model_name)
-    model = model.to(device)
+    model.to(device)

    logger.info(
        "Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}".format(
--- a/examples/distillation/distiller.py
+++ b/examples/distillation/distiller.py
@@ -80,7 +80,7 @@ class Distiller:

        self.mlm = params.mlm
        if self.mlm:
-            logger.info(f"Using MLM loss for LM step.")
+            logger.info("Using MLM loss for LM step.")
            self.mlm_mask_prop = params.mlm_mask_prop
            assert 0.0 <= self.mlm_mask_prop <= 1.0
            assert params.word_mask + params.word_keep + params.word_rand == 1.0
@@ -91,7 +91,7 @@ class Distiller:
                self.pred_probs = self.pred_probs.half()
                self.token_probs = self.token_probs.half()
        else:
-            logger.info(f"Using CLM loss for LM step.")
+            logger.info("Using CLM loss for LM step.")

        self.epoch = 0
        self.n_iter = 0
@@ -365,8 +365,8 @@ class Distiller:
            self.end_epoch()

        if self.is_master:
-            logger.info(f"Save very last checkpoint as `pytorch_model.bin`.")
-            self.save_checkpoint(checkpoint_name=f"pytorch_model.bin")
+            logger.info("Save very last checkpoint as `pytorch_model.bin`.")
+            self.save_checkpoint(checkpoint_name="pytorch_model.bin")
            logger.info("Training is finished")

    def step(self, input_ids: torch.tensor, attention_mask: torch.tensor, lm_labels: torch.tensor):
--- a/examples/distillation/run_squad_w_distillation.py
+++ b/examples/distillation/run_squad_w_distillation.py
@@ -13,7 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" This is the exact same script as `examples/run_squad.py` (as of 2020, January 8th) with an additional and optional step of distillation."""
+""" This is the exact same script as `examples/question-answering/run_squad.py` (as of 2020, January 8th) with an additional and optional step of distillation."""

 import argparse
 import glob
@@ -67,9 +67,6 @@ except ImportError:

 logger = logging.getLogger(__name__)

-ALL_MODELS = sum(
-    (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, XLMConfig)), ()
-)

 MODEL_CLASSES = {
    "bert": (BertConfig, BertForQuestionAnswering, BertTokenizer),
@@ -505,7 +502,7 @@ def main():
        default=None,
        type=str,
        required=True,
-        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
+        help="Path to pretrained model or model identifier from huggingface.co/models",
    )
    parser.add_argument(
        "--output_dir",
@@ -812,10 +809,6 @@ def main():

    # Save the trained model and the tokenizer
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
-        # Create output directory if needed
-        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
-            os.makedirs(args.output_dir)
-
        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
--- a/examples/distillation/scripts/binarized_data.py
+++ b/examples/distillation/scripts/binarized_data.py
@@ -60,7 +60,7 @@ def main():
    with open(args.file_path, "r", encoding="utf8") as fp:
        data = fp.readlines()

-    logger.info(f"Start encoding")
+    logger.info("Start encoding")
    logger.info(f"{len(data)} examples to process.")

    rslt = []
--- a/examples/distillation/scripts/extract.py
+++ b/examples/distillation/scripts/extract.py
@@ -93,7 +93,7 @@ if __name__ == "__main__":
    elif args.model_type == "gpt2":
        for w in ["weight", "bias"]:
            compressed_sd[f"{prefix}.ln_f.{w}"] = state_dict[f"{prefix}.ln_f.{w}"]
-        compressed_sd[f"lm_head.weight"] = state_dict[f"lm_head.weight"]
+        compressed_sd["lm_head.weight"] = state_dict["lm_head.weight"]

    print(f"N layers selected for distillation: {std_idx}")
    print(f"Number of params transfered for distillation: {len(compressed_sd.keys())}")
--- a/examples/distillation/scripts/extract_distilbert.py
+++ b/examples/distillation/scripts/extract_distilbert.py
@@ -37,7 +37,7 @@ if __name__ == "__main__":
        model = BertForMaskedLM.from_pretrained(args.model_name)
        prefix = "bert"
    else:
-        raise ValueError(f'args.model_type should be "bert".')
+        raise ValueError('args.model_type should be "bert".')

    state_dict = model.state_dict()
    compressed_sd = {}
@@ -78,8 +78,8 @@ if __name__ == "__main__":
            ]
        std_idx += 1

-    compressed_sd[f"vocab_projector.weight"] = state_dict[f"cls.predictions.decoder.weight"]
-    compressed_sd[f"vocab_projector.bias"] = state_dict[f"cls.predictions.bias"]
+    compressed_sd["vocab_projector.weight"] = state_dict["cls.predictions.decoder.weight"]
+    compressed_sd["vocab_projector.bias"] = state_dict["cls.predictions.bias"]
    if args.vocab_transform:
        for w in ["weight", "bias"]:
            compressed_sd[f"vocab_transform.{w}"] = state_dict[f"cls.predictions.transform.dense.{w}"]
--- a/examples/distillation/train.py
+++ b/examples/distillation/train.py
@@ -273,7 +273,7 @@ def main():
        token_probs = None

    train_lm_seq_dataset = LmSeqsDataset(params=args, data=data)
-    logger.info(f"Data loader created.")
+    logger.info("Data loader created.")

    # STUDENT #
    logger.info(f"Loading student config from {args.student_config}")
@@ -288,7 +288,7 @@ def main():

    if args.n_gpu > 0:
        student.to(f"cuda:{args.local_rank}")
-    logger.info(f"Student loaded.")
+    logger.info("Student loaded.")

    # TEACHER #
    teacher = teacher_model_class.from_pretrained(args.teacher_name, output_hidden_states=True)
--- a/Show More
+++ b/Show More