Fix CI after killing archive maps (#4724)

* 🐛 Fix model ids for BART and Flaubert
Release: v2.11.0
2020-06-02 10:21:09 -04:00 · 2020-06-02 09:49:09 -04:00 · 2020-06-02 09:39:33 -04:00 · 2020-06-02 11:03:46 +02:00 · 2020-06-02 11:02:27 +02:00 · 2020-06-02 04:29:28 -04:00
671 changed files with 66230 additions and 20241 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -3,31 +3,18 @@ jobs:
    run_tests_torch_and_tf:
        working_directory: ~/transformers
        docker:
-            - image: circleci/python:3.5
+            - image: circleci/python:3.6
        environment:
            OMP_NUM_THREADS: 1
        resource_class: xlarge
        parallelism: 1
        steps:
            - checkout
-            - run: sudo pip install .[sklearn,tf,torch,testing]
+            - run: sudo pip install .[sklearn,tf-cpu,torch,testing]
            - run: sudo pip install codecov pytest-cov
            - run: python -m pytest -n 8 --dist=loadfile -s -v ./tests/ --cov
            - run: codecov
-    run_all_tests_torch_and_tf:
-        working_directory: ~/transformers
-        docker:
-            - image: circleci/python:3.5
-        environment:
-            OMP_NUM_THREADS: 1
-            RUN_SLOW: yes
-            RUN_CUSTOM_TOKENIZERS: yes
-        resource_class: xlarge
-        parallelism: 1
-        steps:
-            - checkout
-            - run: sudo pip install .[mecab,sklearn,tf,torch,testing]
-            - run: python -m pytest -n 8 --dist=loadfile -s -v ./tests/
+
    run_tests_torch:
        working_directory: ~/transformers
        docker:
@@ -52,14 +39,14 @@ jobs:
        parallelism: 1
        steps:
            - checkout
-            - run: sudo pip install .[sklearn,tf,testing]
+            - run: sudo pip install .[sklearn,tf-cpu,testing]
            - run: sudo pip install codecov pytest-cov
            - run: python -m pytest -n 8 --dist=loadfile -s -v ./tests/ --cov
            - run: codecov
    run_tests_custom_tokenizers:
        working_directory: ~/transformers
        docker:
-            - image: circleci/python:3.5
+            - image: circleci/python:3.6
        environment:
            RUN_CUSTOM_TOKENIZERS: yes
        steps:
@@ -69,7 +56,7 @@ jobs:
    run_examples_torch:
        working_directory: ~/transformers
        docker:
-            - image: circleci/python:3.5
+            - image: circleci/python:3.6
        environment:
            OMP_NUM_THREADS: 1
        resource_class: xlarge
@@ -79,10 +66,20 @@ jobs:
            - run: sudo pip install .[sklearn,torch,testing]
            - run: sudo pip install -r examples/requirements.txt
            - run: python -m pytest -n 8 --dist=loadfile -s -v ./examples/
+    build_doc:
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.6
+        steps:
+            - checkout
+            - run: sudo pip install .[tf,torch,docs]
+            - run: cd docs && make html
+            - store_artifacts:
+                path: ./docs/_build
    deploy_doc:
        working_directory: ~/transformers
        docker:
-            - image: circleci/python:3.5
+            - image: circleci/python:3.6
        steps:
            - add_ssh_keys:
                fingerprints:
@@ -107,7 +104,7 @@ jobs:
    check_repository_consistency:
        working_directory: ~/transformers
        docker:
-            - image: circleci/python:3.5
+            - image: circleci/python:3.6
        resource_class: small
        parallelism: 1
        steps:
@@ -130,14 +127,5 @@ workflows:
            - run_tests_torch_and_tf
            - run_tests_torch
            - run_tests_tf
+            - build_doc
            - deploy_doc: *workflow_filters
-    run_slow_tests:
-        triggers:
-            - schedule:
-                cron: "0 4 * * 1"
-                filters:
-                    branches:
-                        only:
-                            - master
-        jobs:
-            - run_all_tests_torch_and_tf
--- a/.circleci/deploy.sh
+++ b/.circleci/deploy.sh
@@ -25,4 +25,5 @@ deploy_doc "fc9faa8" v2.0.0
 deploy_doc "3ddce1d" v2.1.1
 deploy_doc "3616209" v2.2.0
 deploy_doc "d0f8b9a" v2.3.0
-deploy_doc "6664ea9" v2.4.0
+deploy_doc "6664ea9" v2.4.0
+deploy_doc "fb560dc" v2.5.0
--- a/.github/ISSUE_TEMPLATE/--new-model-addition.md
+++ b/.github/ISSUE_TEMPLATE/--new-model-addition.md
@@ -2,7 +2,7 @@
 name: "\U0001F31F New model addition"
 about: Submit a proposal/request to implement a new Transformer-based model
 title: ''
-labels: ''
+labels: New model
 assignees: ''

 ---
--- a/.github/ISSUE_TEMPLATE/bug-report.md
+++ b/.github/ISSUE_TEMPLATE/bug-report.md
@@ -39,12 +39,14 @@ Steps to reproduce the behavior:

 <!-- A clear and concise description of what you would expect to happen. -->

-## Environment
-
-* OS:
-* Python version:
-* PyTorch version:
-* `transformers` version (or branch):
-* Using GPU ?
-* Distributed or parallel setup ?
-* Any other relevant information:
+## Environment info
+<!-- You can run the command `transformers-cli env` and copy-and-paste its output below.
+     Don't forget to fill out the missing fields in that output! -->
+     
+- `transformers` version:
+- Platform:
+- Python version:
+- PyTorch version (GPU?):
+- Tensorflow version (GPU?):
+- Using GPU in script?:
+- Using distributed or parallel set-up in script?:
--- a/.github/ISSUE_TEMPLATE/migration.md
+++ b/.github/ISSUE_TEMPLATE/migration.md
@@ -1,8 +1,9 @@
 ---
 name: "\U0001F4DA Migration from pytorch-pretrained-bert or pytorch-transformers"
-about: Report a problem when migrating from pytorch-pretrained-bert or pytorch-transformers to transformers
+about: Report a problem when migrating from pytorch-pretrained-bert or pytorch-transformers
+  to transformers
 title: ''
-labels: ''
+labels: Migration
 assignees: ''

 ---
@@ -33,16 +34,21 @@ The tasks I am working on is:
    Do not use screenshots, as they are hard to read and (more importantly) don't allow others to copy-and-paste your code.
    -->

-## Environment
+## Environment info
+<!-- You can run the command `transformers-cli env` and copy-and-paste its output below.
+     Don't forget to fill out the missing fields in that output! -->
+ 
+- `transformers` version:
+- Platform:
+- Python version:
+- PyTorch version (GPU?):
+- Tensorflow version (GPU?):
+- Using GPU in script?:
+- Using distributed or parallel set-up in script?:

-* OS:
-* Python version:
-* PyTorch version:
+<!-- IMPORTANT: which version of the former library do you use? -->
 * `pytorch-transformers` or `pytorch-pretrained-bert` version (or branch):
-* `transformers` version (or branch):
-* Using GPU?
-* Distributed or parallel setup?
-* Any other relevant information:
+

 ## Checklist

--- a/.github/ISSUE_TEMPLATE/question-help.md
+++ b/.github/ISSUE_TEMPLATE/question-help.md
@@ -26,4 +26,4 @@ assignees: ''

 <!-- You should first ask your question on SO, and only if
     you didn't get an answer ask it here on GitHub. -->
-**A link to original question on Stack Overflow**: 
+**A link to original question on Stack Overflow**:
--- a/.github/workflows/github-push.yml
+++ b/.github/workflows/github-push.yml
@@ -0,0 +1,19 @@
+name: GitHub-hosted runner
+
+on: push
+
+jobs:
+  check_code_quality:
+    runs-on: ubuntu-18.04
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python
+      uses: actions/setup-python@v1
+      with:
+        python-version: 3.7
+    # - name: Install dependencies
+    #   run: |
+    #     pip install .[tf,torch,quality]
+
+
+
--- a/.github/workflows/github-torch-hub.yml
+++ b/.github/workflows/github-torch-hub.yml
@@ -0,0 +1,32 @@
+name: Torch hub integration
+
+on: 
+  push:
+    branches:
+      - "*"
+
+jobs:
+  torch_hub_integration:
+    runs-on: ubuntu-latest
+    steps:
+    # no checkout necessary here.
+    - name: Extract branch name
+      run: echo "::set-env name=BRANCH::${GITHUB_REF#refs/heads/}"
+    - name: Check branch name
+      run: echo $BRANCH
+    - name: Set up Python
+      uses: actions/setup-python@v1
+      with:
+        python-version: 3.7
+    - name: Install dependencies
+      run: |
+        pip install torch
+        pip install numpy tokenizers filelock requests tqdm regex sentencepiece sacremoses packaging
+
+    - name: Torch hub list
+      run: |
+        python -c "import torch; print(torch.hub.list('huggingface/transformers:$BRANCH'))"
+
+    - name: Torch hub help
+      run: |
+        python -c "import torch; print(torch.hub.help('huggingface/transformers:$BRANCH', 'modelForSequenceClassification'))"
--- a/.github/workflows/self-push.yml
+++ b/.github/workflows/self-push.yml
@@ -0,0 +1,54 @@
+name: Self-hosted runner (push)
+
+on: 
+  push:
+    branches:
+      - master
+    paths: 
+      - "src/**"
+      - "tests/**"
+      - ".github/**"
+  # pull_request:
+  repository_dispatch:
+
+
+jobs:
+  run_tests_torch_and_tf_gpu:
+    runs-on: self-hosted
+    steps:
+    - uses: actions/checkout@v2
+    - name: Python version
+      run: |
+        which python
+        python --version
+        pip --version
+    - name: Current dir
+      run: pwd
+    - run: nvidia-smi
+    - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
+      run: |
+        python -m venv .env
+        source .env/bin/activate
+        which python
+        python --version
+        pip --version
+    - name: Install dependencies
+      run: |
+        source .env/bin/activate
+        pip install torch
+        pip install .[sklearn,testing]
+
+    - name: Are GPUs recognized by our DL frameworks
+      run: |
+        source .env/bin/activate
+        python -c "import torch; print(torch.cuda.is_available())"
+
+    - name: Run all non-slow tests on GPU
+      env:
+        TF_FORCE_GPU_ALLOW_GROWTH: "true"
+        # TF_GPU_MEMORY_LIMIT: 4096
+        OMP_NUM_THREADS: 1
+        USE_CUDA: yes
+      run: |
+        source .env/bin/activate
+        python -m pytest -n 2 --dist=loadfile -s -v ./tests/
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@@ -0,0 +1,50 @@
+name: Self-hosted runner (scheduled)
+
+on:
+  push:
+    branches:
+      - ci_*
+  repository_dispatch:
+  schedule:
+    - cron: "0 0 * * *"
+
+jobs:
+  run_all_tests_torch_and_tf_gpu:
+    runs-on: self-hosted
+    steps:
+    - uses: actions/checkout@v2
+    - name: Python version
+      run: |
+        which python
+        python --version
+        pip --version
+    - name: Current dir
+      run: pwd
+    - run: nvidia-smi
+    - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
+      run: |
+        python -m venv .env
+        source .env/bin/activate
+        which python
+        python --version
+        pip --version
+    - name: Install dependencies
+      run: |
+        source .env/bin/activate
+        pip install .[sklearn,torch,testing]
+
+    - name: Are GPUs recognized by our DL frameworks
+      run: |
+        source .env/bin/activate
+        python -c "import torch; print(torch.cuda.is_available())"
+
+    - name: Run all tests on GPU
+      env:
+        TF_FORCE_GPU_ALLOW_GROWTH: "true"
+        OMP_NUM_THREADS: 1
+        RUN_SLOW: yes
+        USE_CUDA: yes
+      run: |
+        source .env/bin/activate
+        python -m pytest -n 1 --dist=loadfile -s -v ./tests/
+        
--- a/.gitignore
+++ b/.gitignore
@@ -130,7 +130,10 @@ proc_data

 # examples
 runs
-examples/runs
+/runs_old
+/wandb
+/examples/runs
+/examples/**/*.args

 # data
 /data
@@ -139,3 +142,9 @@ serialization_dir
 # emacs
 *.*~
 debug.env
+
+# vim
+.*.swp
+
+#ctags
+tags
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -41,16 +41,19 @@ Did not find it? :( So we can act quickly on it, please follow these steps:
  less than 30s;
 * Provide the *full* traceback if an exception is raised.

-To get the OS and software versions, execute the following code and copy-paste
-the output:
+To get the OS and software versions automatically, you can run the following command:

+```bash
+transformers-cli env
 ```
-import platform; print("Platform", platform.platform())
-import sys; print("Python", sys.version)
-import torch; print("PyTorch", torch.__version__)
-import tensorflow; print("Tensorflow", tensorflow.__version__)
+
+or from the root of the repository the following command:
+
+```bash
+python src/transformers/commands/transformers_cli.py env
 ```

+
 ### Do you want to implement a new model?

 Awesome! Please provide the following information:
@@ -134,7 +137,6 @@ Follow these steps to start contributing:
   ```bash
   $ pip install -U git+git://github.com/timothycrosley/isort.git@e63ae06ec7d70b06df9e528357650281a3d3ec22#egg=isort
   ```
-
 5. Develop the features on your branch.

   As you work on the features, you should make sure that the test suite
@@ -202,10 +204,13 @@ Follow these steps to start contributing:
 3. To indicate a work in progress please prefix the title with `[WIP]`. These
   are useful to avoid duplicated work, and to differentiate it from PRs ready
   to be merged;
-4. Make sure pre-existing tests still pass;
-5. Add high-coverage tests. No quality test, no merge;
-6. All public methods must have informative docstrings;
-
+4. Make sure existing tests pass;
+5. Add high-coverage tests. No quality testing = no merge. 
+ - If you are adding a new model, make sure that you use `ModelTester.all_model_classes = (MyModel, MyModelWithLMHead,...)`, which triggers the common tests.
+ - If you are adding new `@slow` tests, make sure they pass using `RUN_SLOW=1 python -m pytest tests/test_my_new_model.py`. 
+ - If you are adding a new tokenizer, write tests, and make sure `RUN_SLOW=1 python -m pytest tests/test_tokenization_{your_model_name}.py` passes.
+CircleCI does not run the slow tests.
+6. All public methods must have informative docstrings that work nicely with sphinx. See `modeling_ctrl.py` for an example.

 ### Tests

--- a/README.md
+++ b/README.md
@@ -19,15 +19,14 @@
 </p>

 <h3 align="center">
-<p>State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch
+<p>State-of-the-art Natural Language Processing for PyTorch and TensorFlow 2.0
 </h3>

-🤗 Transformers (formerly known as `pytorch-transformers` and `pytorch-pretrained-bert`) provides state-of-the-art general-purpose architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet, CTRL...) for Natural Language Understanding (NLU) and Natural Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between TensorFlow 2.0 and PyTorch.
+🤗 Transformers (formerly known as `pytorch-transformers` and `pytorch-pretrained-bert`) provides state-of-the-art general-purpose architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet, T5, CTRL...) for Natural Language Understanding (NLU) and Natural Language Generation (NLG) with thousands of pretrained models in 100+ languages and deep interoperability between PyTorch & TensorFlow 2.0.
+
+[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/0)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/0)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/1)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/1)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/2)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/2)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/3)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/3)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/4)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/4)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/5)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/5)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/6)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/6)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/7)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/7)

 ### Features
-
- As easy to use as pytorch-transformers
- As powerful and concise as Keras
 - High performance on NLU and NLG tasks
 - Low barrier to entry for educators and practitioners

@@ -39,7 +38,7 @@ State-of-the-art NLP for everyone
 Lower compute costs, smaller carbon footprint
 - Researchers can share trained models instead of always retraining
 - Practitioners can reduce compute time and production costs
- 10 architectures with over 30 pretrained models, some in more than 100 languages
+- Dozens of architectures with over 1,000 pretrained models, some in more than 100 languages

 Choose the right framework for every part of a model's lifetime
 - Train state-of-the-art models in 3 lines of code
@@ -60,11 +59,11 @@ Choose the right framework for every part of a model's lifetime
 | [Quick tour: Share your models ](#Quick-tour-of-model-sharing) | Upload and share your fine-tuned models with the community |
 | [Migrating from pytorch-transformers to transformers](#Migrating-from-pytorch-transformers-to-transformers) | Migrating your code from pytorch-transformers to transformers |
 | [Migrating from pytorch-pretrained-bert to pytorch-transformers](#Migrating-from-pytorch-pretrained-bert-to-transformers) | Migrating your code from pytorch-pretrained-bert to transformers |
-| [Documentation][(v2.4.0)](https://huggingface.co/transformers/v2.4.0)[(v2.3.0)](https://huggingface.co/transformers/v2.3.0)[(v2.2.0/v2.2.1/v2.2.2)](https://huggingface.co/transformers/v2.2.0) [(v2.1.1)](https://huggingface.co/transformers/v2.1.1) [(v2.0.0)](https://huggingface.co/transformers/v2.0.0) [(v1.2.0)](https://huggingface.co/transformers/v1.2.0) [(v1.1.0)](https://huggingface.co/transformers/v1.1.0) [(v1.0.0)](https://huggingface.co/transformers/v1.0.0) [(master)](https://huggingface.co/transformers) | Full API documentation and more |
+| [Documentation][(v2.5.0)](https://huggingface.co/transformers/v2.5.0)[(v2.4.0/v2.4.1)](https://huggingface.co/transformers/v2.4.0)[(v2.3.0)](https://huggingface.co/transformers/v2.3.0)[(v2.2.0/v2.2.1/v2.2.2)](https://huggingface.co/transformers/v2.2.0) [(v2.1.1)](https://huggingface.co/transformers/v2.1.1) [(v2.0.0)](https://huggingface.co/transformers/v2.0.0) [(v1.2.0)](https://huggingface.co/transformers/v1.2.0) [(v1.1.0)](https://huggingface.co/transformers/v1.1.0) [(v1.0.0)](https://huggingface.co/transformers/v1.0.0) [(master)](https://huggingface.co/transformers) | Full API documentation and more |

 ## Installation

-This repo is tested on Python 3.5+, PyTorch 1.0.0+ and TensorFlow 2.0.0-rc1
+This repo is tested on Python 3.6+, PyTorch 1.0.0+ (PyTorch 1.3.1+ for examples) and TensorFlow 2.0.

 You should install 🤗 Transformers in a [virtual environment](https://docs.python.org/3/library/venv.html). If you're unfamiliar with Python virtual environments, check out the [user guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/).

@@ -146,23 +145,29 @@ At some point in the future, you'll be able to seamlessly move from pre-training

 🤗 Transformers currently provides the following NLU/NLG architectures:

-1. **[BERT](https://github.com/google-research/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
-2. **[GPT](https://github.com/openai/finetune-transformer-lm)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
-3. **[GPT-2](https://blog.openai.com/better-language-models/)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
-4. **[Transformer-XL](https://github.com/kimiyoung/transformer-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
-5. **[XLNet](https://github.com/zihangdai/xlnet/)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
-6. **[XLM](https://github.com/facebookresearch/XLM/)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
-7. **[RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/roberta)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
-8. **[DistilBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/master/examples/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation) and a German version of DistilBERT.
-9. **[CTRL](https://github.com/salesforce/ctrl/)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
-10. **[CamemBERT](https://camembert-model.fr)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
-11. **[ALBERT](https://github.com/google-research/ALBERT)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
-12. **[T5](https://github.com/google-research/text-to-text-transfer-transformer)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
-13. **[XLM-RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/xlmr)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
+1. **[BERT](https://huggingface.co/transformers/model_doc/bert.html)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
+2. **[GPT](https://huggingface.co/transformers/model_doc/gpt.html)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
+3. **[GPT-2](https://huggingface.co/transformers/model_doc/gpt2.html)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
+4. **[Transformer-XL](https://huggingface.co/transformers/model_doc/transformerxl.html)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
+5. **[XLNet](https://huggingface.co/transformers/model_doc/xlnet.html)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
+6. **[XLM](https://huggingface.co/transformers/model_doc/xlm.html)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
+7. **[RoBERTa](https://huggingface.co/transformers/model_doc/roberta.html)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
+8. **[DistilBERT](https://huggingface.co/transformers/model_doc/distilbert.html)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/master/examples/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation) and a German version of DistilBERT.
+9. **[CTRL](https://huggingface.co/transformers/model_doc/ctrl.html)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
+10. **[CamemBERT](https://huggingface.co/transformers/model_doc/camembert.html)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
+11. **[ALBERT](https://huggingface.co/transformers/model_doc/albert.html)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
+12. **[T5](https://huggingface.co/transformers/model_doc/t5.html)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
+13. **[XLM-RoBERTa](https://huggingface.co/transformers/model_doc/xlmroberta.html)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
 14. **[MMBT](https://github.com/facebookresearch/mmbt/)** (from Facebook), released together with the paper a [Supervised Multimodal Bitransformers for Classifying Images and Text](https://arxiv.org/pdf/1909.02950.pdf) by Douwe Kiela, Suvrat Bhooshan, Hamed Firooz, Davide Testuggine.
-15. **[FlauBERT](https://github.com/getalp/Flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
-16. **[Other community models](https://huggingface.co/models)**, contributed by the [community](https://huggingface.co/users).
-17. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedbacks before starting your PR.
+15. **[FlauBERT](https://huggingface.co/transformers/model_doc/flaubert.html)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
+16. **[BART](https://huggingface.co/transformers/model_doc/bart.html)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/pdf/1910.13461.pdf) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer.
+17. **[ELECTRA](https://huggingface.co/transformers/model_doc/electra.html)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
+18. **[DialoGPT](https://huggingface.co/transformers/model_doc/dialogpt.html)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
+19. **[Reformer](https://huggingface.co/transformers/model_doc/reformer.html)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
+20. **[MarianMT](https://huggingface.co/transformers/model_doc/marian.html)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
+21. **[Longformer](https://huggingface.co/transformers/model_doc/longformer.html)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
+22. **[Other community models](https://huggingface.co/models)**, contributed by the [community](https://huggingface.co/users).
+23. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedback before starting your PR.

 These implementations have been tested on several datasets (see the example scripts) and should match the performances of the original implementations (e.g. ~93 F1 on SQuAD for BERT Whole-Word-Masking, ~88 F1 on RocStories for OpenAI GPT, ~18.3 perplexity on WikiText 103 for Transformer-XL, ~0.916 Peason R coefficient on STS-B for XLNet). You can find more details on the performances in the Examples section of the [documentation](https://huggingface.co/transformers/examples.html).

@@ -193,7 +198,7 @@ MODELS = [(BertModel,       BertTokenizer,       'bert-base-uncased'),
          (TransfoXLModel,  TransfoXLTokenizer,  'transfo-xl-wt103'),
          (XLNetModel,      XLNetTokenizer,      'xlnet-base-cased'),
          (XLMModel,        XLMTokenizer,        'xlm-mlm-enfr-1024'),
-          (DistilBertModel, DistilBertTokenizer, 'distilbert-base-uncased'),
+          (DistilBertModel, DistilBertTokenizer, 'distilbert-base-cased'),
          (RobertaModel,    RobertaTokenizer,    'roberta-base'),
          (XLMRobertaModel, XLMRobertaTokenizer, 'xlm-roberta-base'),
         ]
@@ -301,8 +306,9 @@ setup your environment to run the examples.

 The library comprises several example scripts with SOTA performances for NLU and NLG tasks:

- `run_glue.py`: an example fine-tuning Bert, XLNet and XLM on nine different GLUE tasks (*sequence-level classification*)
- `run_squad.py`: an example fine-tuning Bert, XLNet and XLM on the question answering dataset SQuAD 2.0 (*token-level classification*)
+- `run_glue.py`: an example fine-tuning sequence classification models on nine different GLUE tasks (*sequence-level classification*)
+- `run_squad.py`: an example fine-tuning question answering models on the question answering dataset SQuAD 2.0 (*token-level classification*)
+- `run_ner.py`: an example fine-tuning token classification models on named entity recognition (*token-level classification*)
 - `run_generation.py`: an example using GPT, GPT-2, CTRL, Transformer-XL and XLNet for conditional language generation
 - other model-specific examples (see the documentation).

@@ -312,7 +318,7 @@ Here are three quick usage examples for these scripts:

 The [General Language Understanding Evaluation (GLUE) benchmark](https://gluebenchmark.com/) is a collection of nine sentence- or sentence-pair language understanding tasks for evaluating and analyzing natural language understanding systems.

-Before running anyone of these GLUE tasks you should download the
+Before running any of these GLUE tasks you should download the
 [GLUE data](https://gluebenchmark.com/tasks) by running
 [this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e)
 and unpack it to some directory `$GLUE_DIR`.
@@ -327,17 +333,15 @@ pip install -r ./examples/requirements.txt
 export GLUE_DIR=/path/to/glue
 export TASK_NAME=MRPC

-python ./examples/run_glue.py \
-    --model_type bert \
+python ./examples/text-classification/run_glue.py \
    --model_name_or_path bert-base-uncased \
    --task_name $TASK_NAME \
    --do_train \
    --do_eval \
-    --do_lower_case \
    --data_dir $GLUE_DIR/$TASK_NAME \
    --max_seq_length 128 \
-    --per_gpu_eval_batch_size=8   \
-    --per_gpu_train_batch_size=8   \
+    --per_device_eval_batch_size=8   \
+    --per_device_train_batch_size=8   \
    --learning_rate 2e-5 \
    --num_train_epochs 3.0 \
    --output_dir /tmp/$TASK_NAME/
@@ -355,8 +359,7 @@ Parallel training is a simple way to use several GPUs (but is slower and less fl
 ```shell
 export GLUE_DIR=/path/to/glue

-python ./examples/run_glue.py \
-    --model_type xlnet \
+python ./examples/text-classification/run_glue.py \
    --model_name_or_path xlnet-large-cased \
    --do_train  \
    --do_eval   \
@@ -364,8 +367,8 @@ python ./examples/run_glue.py \
    --data_dir=${GLUE_DIR}/STS-B  \
    --output_dir=./proc_data/sts-b-110   \
    --max_seq_length=128   \
-    --per_gpu_eval_batch_size=8   \
-    --per_gpu_train_batch_size=8   \
+    --per_device_eval_batch_size=8   \
+    --per_device_train_batch_size=8   \
    --gradient_accumulation_steps=1 \
    --max_steps=1200  \
    --model_name=xlnet-large-cased   \
@@ -381,17 +384,15 @@ On this machine we thus have a batch size of 32, please increase `gradient_accum
 This example code fine-tunes the Bert Whole Word Masking model on the Microsoft Research Paraphrase Corpus (MRPC) corpus using distributed training on 8 V100 GPUs to reach a F1 > 92.

 ```bash
-python -m torch.distributed.launch --nproc_per_node 8 ./examples/run_glue.py   \
-    --model_type bert \
+python -m torch.distributed.launch --nproc_per_node 8 ./examples/text-classification/run_glue.py   \
    --model_name_or_path bert-large-uncased-whole-word-masking \
    --task_name MRPC \
    --do_train   \
    --do_eval   \
-    --do_lower_case   \
    --data_dir $GLUE_DIR/MRPC/   \
    --max_seq_length 128   \
-    --per_gpu_eval_batch_size=8   \
-    --per_gpu_train_batch_size=8   \
+    --per_device_eval_batch_size=8   \
+    --per_device_train_batch_size=8   \
    --learning_rate 2e-5   \
    --num_train_epochs 3.0  \
    --output_dir /tmp/mrpc_output/ \
@@ -415,12 +416,11 @@ Training with these hyper-parameters gave us the following results:
 This example code fine-tunes BERT on the SQuAD dataset using distributed training on 8 V100 GPUs and Bert Whole Word Masking uncased model to reach a F1 > 93 on SQuAD:

 ```bash
-python -m torch.distributed.launch --nproc_per_node=8 ./examples/run_squad.py \
+python -m torch.distributed.launch --nproc_per_node=8 ./examples/question-answering/run_squad.py \
    --model_type bert \
    --model_name_or_path bert-large-uncased-whole-word-masking \
    --do_train \
    --do_eval \
-    --do_lower_case \
    --train_file $SQUAD_DIR/train-v1.1.json \
    --predict_file $SQUAD_DIR/dev-v1.1.json \
    --learning_rate 3e-5 \
@@ -428,8 +428,8 @@ python -m torch.distributed.launch --nproc_per_node=8 ./examples/run_squad.py \
    --max_seq_length 384 \
    --doc_stride 128 \
    --output_dir ../models/wwm_uncased_finetuned_squad/ \
-    --per_gpu_eval_batch_size=3   \
-    --per_gpu_train_batch_size=3   \
+    --per_device_eval_batch_size=3   \
+    --per_device_train_batch_size=3   \
 ```

 Training with these hyper-parameters gave us the following results:
@@ -449,7 +449,7 @@ The generation script includes the [tricks](https://github.com/rusiaaman/XLNet-g
 Here is how to run the script with the small version of OpenAI GPT-2 model:

 ```shell
-python ./examples/run_generation.py \
+python ./examples/text-generation/run_generation.py \
    --model_type=gpt2 \
    --length=20 \
    --model_name_or_path=gpt2 \
@@ -457,7 +457,7 @@ python ./examples/run_generation.py \

 and from the Salesforce CTRL model:
 ```shell
-python ./examples/run_generation.py \
+python ./examples/text-generation/run_generation.py \
    --model_type=ctrl \
    --length=20 \
    --model_name_or_path=ctrl \
@@ -469,7 +469,7 @@ python ./examples/run_generation.py \

 Starting with `v2.2.2`, you can now upload and share your fine-tuned models with the community, using the <abbr title="Command-line interface">CLI</abbr> that's built-in to the library.

-**First, create an account on [https://huggingface.co/join](https://huggingface.co/join)**. Then:
+**First, create an account on [https://huggingface.co/join](https://huggingface.co/join)**. Optionally, join an existing organization or create a new one. Then:

 ```shell
 transformers-cli login
@@ -488,24 +488,34 @@ transformers-cli upload ./config.json [--filename folder/foobar.json]
 # (you can optionally override its filename, which can be nested inside a folder)
 ```

-Your model will then be accessible through its identifier, a concatenation of your username and the folder name above:
+If you want your model to be namespaced by your organization name rather than your username, add the following flag to any command:
+```shell
+--organization organization_name
+```
+
+Your model will then be accessible through its identifier, a concatenation of your username (or organization name) and the folder name above:
 ```python
 "username/pretrained_model"
+# or if an org:
+"organization_name/pretrained_model"
 ```

+**Please add a README.md model card** to the repo under `model_cards/` with: model description, training params (dataset, preprocessing, hardware used, hyperparameters), evaluation results, intended uses & limitations, etc.
+
+Your model now has a page on huggingface.co/models 🔥
+
 Anyone can load it from code:
 ```python
-tokenizer = AutoTokenizer.from_pretrained("username/pretrained_model")
-model = AutoModel.from_pretrained("username/pretrained_model")
+tokenizer = AutoTokenizer.from_pretrained("namespace/pretrained_model")
+model = AutoModel.from_pretrained("namespace/pretrained_model")
 ```

-Finally, list all your files on S3:
+List all your files on S3:
 ```shell
 transformers-cli s3 ls
-# List all your S3 objects.
 ```

-You can also delete files:
+You can also delete unneeded files:

 ```shell
 transformers-cli s3 rm …
@@ -524,6 +534,8 @@ You can create `Pipeline` objects for the following down-stream tasks:
 - `text-classification`: Initialize a `TextClassificationPipeline` directly, or see `sentiment-analysis` for an example.
 - `question-answering`: Provided some context and a question refering to the context, it will extract the answer to the question in the context.
 - `fill-mask`: Takes an input sequence containing a masked token (e.g. `<mask>`) and return list of most probable filled sequences, with their probabilities.
+- `summarization`
+- `translation_xx_to_yy`

 ```python
 from transformers import pipeline
@@ -673,7 +685,7 @@ for batch in train_data:
 ## Citation

 We now have a paper you can cite for the 🤗 Transformers library:
-```
+```bibtex
@article{Wolf2019HuggingFacesTS,
  title={HuggingFace's Transformers: State-of-the-art Natural Language Processing},
  author={Thomas Wolf and Lysandre Debut and Victor Sanh and Julien Chaumond and Clement Delangue and Anthony Moi and Pierric Cistac and Tim Rault and R'emi Louf and Morgan Funtowicz and Jamie Brew},
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -1,7 +0,0 @@
-FROM pytorch/pytorch:latest
-
-RUN git clone https://github.com/NVIDIA/apex.git && cd apex && python setup.py install --cuda_ext --cpp_ext
-
-RUN pip install transformers
-
-WORKDIR /workspace
--- a/docker/transformers-cpu/Dockerfile
+++ b/docker/transformers-cpu/Dockerfile
@@ -0,0 +1,26 @@
+FROM ubuntu:18.04
+LABEL maintainer="Hugging Face"
+LABEL repository="transformers"
+
+RUN apt update && \
+    apt install -y bash \
+                   build-essential \
+                   git \
+                   curl \
+                   ca-certificates \
+                   python3 \
+                   python3-pip && \
+    rm -rf /var/lib/apt/lists
+
+RUN python3 -m pip install --no-cache-dir --upgrade pip && \
+    python3 -m pip install --no-cache-dir \
+    jupyter \
+    tensorflow-cpu \
+    torch
+
+WORKDIR /workspace
+COPY . transformers/
+RUN cd transformers/ && \
+    python3 -m pip install --no-cache-dir .
+
+CMD ["/bin/bash"]
--- a/docker/transformers-gpu/Dockerfile
+++ b/docker/transformers-gpu/Dockerfile
@@ -0,0 +1,26 @@
+FROM nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04
+LABEL maintainer="Hugging Face"
+LABEL repository="transformers"
+
+RUN apt update && \
+    apt install -y bash \
+                   build-essential \
+                   git \
+                   curl \
+                   ca-certificates \
+                   python3 \
+                   python3-pip && \
+    rm -rf /var/lib/apt/lists
+
+RUN python3 -m pip install --no-cache-dir --upgrade pip && \
+    python3 -m pip install --no-cache-dir \
+    jupyter \
+    tensorflow \
+    torch
+
+WORKDIR /workspace
+COPY . transformers/
+RUN cd transformers/ && \
+    python3 -m pip install --no-cache-dir .
+
+CMD ["/bin/bash"]
--- a/docker/transformers-pytorch-cpu/Dockerfile
+++ b/docker/transformers-pytorch-cpu/Dockerfile
@@ -0,0 +1,25 @@
+FROM ubuntu:18.04
+LABEL maintainer="Hugging Face"
+LABEL repository="transformers"
+
+RUN apt update && \
+    apt install -y bash \
+                   build-essential \
+                   git \
+                   curl \
+                   ca-certificates \
+                   python3 \
+                   python3-pip && \
+    rm -rf /var/lib/apt/lists
+
+RUN python3 -m pip install --no-cache-dir --upgrade pip && \
+    python3 -m pip install --no-cache-dir \
+    jupyter \
+    torch
+
+WORKDIR /workspace
+COPY . transformers/
+RUN cd transformers/ && \
+    python3 -m pip install --no-cache-dir .
+
+CMD ["/bin/bash"]
--- a/docker/transformers-pytorch-gpu/Dockerfile
+++ b/docker/transformers-pytorch-gpu/Dockerfile
@@ -0,0 +1,25 @@
+FROM nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04
+LABEL maintainer="Hugging Face"
+LABEL repository="transformers"
+
+RUN apt update && \
+    apt install -y bash \
+                   build-essential \
+                   git \
+                   curl \
+                   ca-certificates \
+                   python3 \
+                   python3-pip && \
+    rm -rf /var/lib/apt/lists
+
+RUN python3 -m pip install --no-cache-dir --upgrade pip && \
+    python3 -m pip install --no-cache-dir \
+    mkl \
+    torch
+
+WORKDIR /workspace
+COPY . transformers/
+RUN cd transformers/ && \
+    python3 -m pip install --no-cache-dir .
+
+CMD ["/bin/bash"]
--- a/docker/transformers-tensorflow-cpu/Dockerfile
+++ b/docker/transformers-tensorflow-cpu/Dockerfile
@@ -0,0 +1,25 @@
+FROM ubuntu:18.04
+LABEL maintainer="Hugging Face"
+LABEL repository="transformers"
+
+RUN apt update && \
+    apt install -y bash \
+                   build-essential \
+                   git \
+                   curl \
+                   ca-certificates \
+                   python3 \
+                   python3-pip && \
+    rm -rf /var/lib/apt/lists
+
+RUN python3 -m pip install --no-cache-dir --upgrade pip && \
+    python3 -m pip install --no-cache-dir \
+    mkl \
+    tensorflow-cpu
+
+WORKDIR /workspace
+COPY . transformers/
+RUN cd transformers/ && \
+    python3 -m pip install --no-cache-dir .
+
+CMD ["/bin/bash"]
--- a/docker/transformers-tensorflow-gpu/Dockerfile
+++ b/docker/transformers-tensorflow-gpu/Dockerfile
@@ -0,0 +1,25 @@
+FROM nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04
+LABEL maintainer="Hugging Face"
+LABEL repository="transformers"
+
+RUN apt update && \
+    apt install -y bash \
+                   build-essential \
+                   git \
+                   curl \
+                   ca-certificates \
+                   python3 \
+                   python3-pip && \
+    rm -rf /var/lib/apt/lists
+
+RUN python3 -m pip install --no-cache-dir --upgrade pip && \
+    python3 -m pip install --no-cache-dir \
+    mkl \
+    tensorflow
+
+WORKDIR /workspace
+COPY . transformers/
+RUN cd transformers/ && \
+    python3 -m pip install --no-cache-dir .
+
+CMD ["/bin/bash"]
--- a/docs/README.md
+++ b/docs/README.md
@@ -47,6 +47,8 @@ Once you have setup `sphinx`, you can build the documentation by running the fol
 make html
 ```

+A folder called ``_build/html`` should have been created. You can now open the file ``_build/html/index.html`` in your browser. 
+
 ---
 **NOTE**

@@ -65,3 +67,131 @@ It should build the static app that will be available under `/docs/_build/html`

 Accepted files are reStructuredText (.rst) and Markdown (.md). Create a file with its extension and put it
 in the source directory. You can then link it to the toc-tree by putting the filename without the extension.
+
+## Writing Documentation - Specification
+
+The `huggingface/transformers` documentation follows the
+[Google documentation](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) style. It is
+mostly written in ReStructuredText 
+([Sphinx simple documentation](https://www.sphinx-doc.org/en/master/usage/restructuredtext/index.html), 
+[Sourceforge complete documentation](https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html)).
+
+### Adding a new section
+
+A section is a page held in the `Notes` toc-tree on the documentation. Adding a new section is done in two steps:
+
+- Add a new file under `./source`. This file can either be ReStructuredText (.rst) or Markdown (.md).
+- Link that file in `./source/index.rst` on the correct toc-tree.
+
+### Adding a new model
+
+When adding a new model:
+ 
+- Create a file `xxx.rst` under `./source/model_doc`. 
+- Link that file in `./source/index.rst` on the `model_doc` toc-tree.
+- Write a short overview of the model:
+    - Overview with paper & authors
+    - Paper abstract
+    - Tips and tricks and how to use it best
+- Add the classes that should be linked in the model. This generally includes the configuration, the tokenizer, and
+  every model of that class (the base model, alongside models with additional heads), both in PyTorch and TensorFlow.
+  The order is generally: 
+    - Configuration, 
+    - Tokenizer
+    - PyTorch base model
+    - PyTorch head models
+    - TensorFlow base model
+    - TensorFlow head models
+
+These classes should be added using the RST syntax. Usually as follows:
+```
+XXXConfig
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.XXXConfig
+    :members:
+```
+
+This will include every public method of the configuration. If for some reason you wish for a method not to be displayed
+in the documentation, you can do so by specifying which methods should be in the docs:
+
+```
+XXXTokenizer
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.XXXTokenizer
+    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences, save_vocabulary
+
+```
+
+### Writing source documentation
+
+Values that should be put in `code` should either be surrounded by double backticks: \`\`like so\`\` or be written as an object
+using the :obj: syntax: :obj:\`like so\`.
+
+When mentioning a class, it is recommended to use the :class: syntax as the mentioned class will be automatically
+linked by Sphinx: :class:\`transformers.XXXClass\`
+
+When mentioning a function, it is recommended to use the :func: syntax as the mentioned method will be automatically
+linked by Sphinx: :func:\`transformers.XXXClass.method\`
+
+Links should be done as so (note the double underscore at the end): \`text for the link <./local-link-or-global-link#loc>\`__
+
+#### Defining arguments in a method
+
+Arguments should be defined with the `Args:` prefix, followed by a line return and an indentation. 
+The argument should be followed by its type, with its shape if it is a tensor, and a line return.
+Another indentation is necessary before writing the description of the argument.
+
+Here's an example showcasing everything so far:
+
+```
+    Args:
+        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using :class:`transformers.AlbertTokenizer`.
+            See :func:`transformers.PreTrainedTokenizer.encode` and
+            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
+
+            `What are input IDs? <../glossary.html#input-ids>`__
+```
+
+#### Writing a multi-line code block 
+
+Multi-line code blocks can be useful for displaying examples. They are done like so:
+
+```
+Example::
+
+    # first line of code
+    # second line
+    # etc
+```
+
+The `Example` string at the beginning can be replaced by anything as long as there are two colons (`::`) following it.
+
+#### Writing a return block
+
+Return values should be defined with the `Returns:` prefix, followed by a line return and an indentation.
+The first line should be the type of the return, followed by a line return. No need to indent further for the elements
+building the return.
+
+Here's an example for tuple return, comprising several objects:
+
+```
+    Returns:
+        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
+        loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss.
+        prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+```
+
+Here's an example for a single value return:
+
+```
+    Returns:
+        A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+```
--- a/docs/source/_static/css/huggingface.css
+++ b/docs/source/_static/css/huggingface.css
@@ -1,3 +1,25 @@
+/* Our DOM objects */
+
+.framework-selector {
+    display: flex;
+    flex-direction: row;
+    justify-content: flex-end;
+}
+
+.framework-selector > button {
+    background-color: white;
+    color: #6670FF;
+    border: 1px solid #6670FF;
+    padding: 5px;
+}
+
+.framework-selector > button.selected{
+    background-color: #6670FF;
+    color: white;
+    border: 1px solid #6670FF;
+    padding: 5px;
+}
+
 /* The literal code blocks */
 .rst-content tt.literal, .rst-content tt.literal, .rst-content code.literal {
    color: #6670FF;
@@ -194,3 +216,41 @@ h2, .rst-content .toctree-wrapper p.caption, h3, h4, h5, h6, legend{
    src: url(./Calibre-Thin.otf);
    font-weight:400;
 }
+
+
+/**
+ * Nav Links to other parts of huggingface.co
+ */
+ div.menu {
+    position: absolute;
+    top: 0;
+    right: 0;
+    padding-top: 20px;
+    padding-right: 20px;
+    z-index: 1000;
+}
+div.menu a {
+    font-size: 14px;
+    letter-spacing: 0.3px;
+    text-transform: uppercase;
+    color: white;
+    -webkit-font-smoothing: antialiased;
+    background: linear-gradient(0deg, #6671ffb8, #9a66ffb8 50%);
+    padding: 10px 16px 6px 16px;
+    border-radius: 3px;
+    margin-left: 12px;
+    position: relative;
+}
+div.menu a:active {
+    top: 1px;
+}
+@media (min-width: 768px) and (max-width: 1750px) {
+    .wy-breadcrumbs {
+        margin-top: 32px;
+    }
+}
+@media (max-width: 768px) {
+    div.menu {
+        display: none;
+    }
+}
--- a/docs/source/_static/js/custom.js
+++ b/docs/source/_static/js/custom.js
@@ -58,6 +58,84 @@ function addGithubButton() {
    document.querySelector(".wy-side-nav-search .icon-home").insertAdjacentHTML('afterend', div);
 }

+function addHfMenu() { // Inserts a menu with "Sign in" and "Models" links as the first content of <body>.
+    const div = `
+    <div class="menu">
+        <a href="/welcome">🔥 Sign in</a>
+        <a href="/models">🚀 Models</a>
+    </div>
+    `;
+    document.body.insertAdjacentHTML('afterbegin', div); // 'afterbegin' puts the markup before any existing body content
+}
+
+function platformToggle() { // Adds PyTorch/TensorFlow toggle buttons to code samples that contain both marker comments.
+    const codeBlocks = Array.from(document.getElementsByClassName("highlight"));
+    const pytorchIdentifier = "## PYTORCH CODE";
+    const tensorflowIdentifier = "## TENSORFLOW CODE";
+    const pytorchSpanIdentifier = `<span class="c1">${pytorchIdentifier}</span>`; // marker wrapped in the span that is searched for in the block's innerHTML
+    const tensorflowSpanIdentifier = `<span class="c1">${tensorflowIdentifier}</span>`;
+    // Split one block's HTML into its PyTorch and TensorFlow parts, in whichever order the markers occur.
+    const getFrameworkSpans = filteredCodeBlock => {
+        const spans = filteredCodeBlock.element.innerHTML;
+        const pytorchSpanPosition = spans.indexOf(pytorchSpanIdentifier);
+        const tensorflowSpanPosition = spans.indexOf(tensorflowSpanIdentifier);
+        // NOTE(review): indexOf yields -1 when a marker span is absent from the HTML; that case is not handled here.
+        let pytorchSpans;
+        let tensorflowSpans;
+        // The "+ 1" skips one extra character after the marker span (presumably the newline — TODO confirm).
+        if(pytorchSpanPosition < tensorflowSpanPosition){
+            pytorchSpans = spans.slice(pytorchSpanPosition + pytorchSpanIdentifier.length + 1, tensorflowSpanPosition);
+            tensorflowSpans = spans.slice(tensorflowSpanPosition + tensorflowSpanIdentifier.length + 1, spans.length);
+        }else{
+            tensorflowSpans = spans.slice(tensorflowSpanPosition + tensorflowSpanIdentifier.length + 1, pytorchSpanPosition);
+            pytorchSpans = spans.slice(pytorchSpanPosition + pytorchSpanIdentifier.length + 1, spans.length);
+        }
+
+        return {
+            ...filteredCodeBlock,
+            pytorchSample: pytorchSpans ,
+            tensorflowSample: tensorflowSpans
+        }
+    };
+    // Add the two toggle buttons next to a sample and wire them to swap the displayed HTML.
+    const createFrameworkButtons = sample => {
+            const pytorchButton = document.createElement("button");
+            pytorchButton.innerText = "PyTorch";
+
+            const tensorflowButton = document.createElement("button");
+            tensorflowButton.innerText = "TensorFlow";
+
+            const selectorDiv = document.createElement("div");
+            selectorDiv.classList.add("framework-selector");
+            selectorDiv.appendChild(pytorchButton);
+            selectorDiv.appendChild(tensorflowButton);
+            sample.element.parentElement.prepend(selectorDiv); // selector becomes the first child of the sample element's parent
+
+            // Init on PyTorch
+            sample.element.innerHTML = sample.pytorchSample;
+            pytorchButton.classList.add("selected");
+            tensorflowButton.classList.remove("selected");
+
+            pytorchButton.addEventListener("click", () => {
+                sample.element.innerHTML = sample.pytorchSample;
+                pytorchButton.classList.add("selected");
+                tensorflowButton.classList.remove("selected");
+            });
+            tensorflowButton.addEventListener("click", () => {
+               sample.element.innerHTML = sample.tensorflowSample;
+                tensorflowButton.classList.add("selected");
+                pytorchButton.classList.remove("selected");
+            });
+        };
+    // Keep only blocks whose text contains both markers, split them, then attach the buttons.
+    codeBlocks
+        .map(element => {return {element: element.firstChild, innerText: element.innerText}}) // operate on the first child; keep the wrapper's text for filtering
+        .filter(codeBlock => codeBlock.innerText.includes(pytorchIdentifier) && codeBlock.innerText.includes(tensorflowIdentifier))
+        .map(getFrameworkSpans)
+        .forEach(createFrameworkButtons);
+}
+
+
 /*!
 * github-buttons v2.2.10
 * (c) 2019 なつき
@@ -74,6 +152,8 @@ function onLoad() {
    addCustomFooter();
    addGithubButton();
    parseGithubButtons();
+    addHfMenu();
+    platformToggle();
 }

 window.addEventListener("load", onLoad);
--- a/docs/source/_static/js/huggingface_logo.svg
+++ b/docs/source/_static/js/huggingface_logo.svg
--- a/docs/source/bertology.rst
+++ b/docs/source/bertology.rst
@@ -8,11 +8,11 @@ There is a growing field of study concerned with investigating the inner working
 * Are Sixteen Heads Really Better than One? by Paul Michel, Omer Levy, Graham Neubig: https://arxiv.org/abs/1905.10650
 * What Does BERT Look At? An Analysis of BERT's Attention by Kevin Clark, Urvashi Khandelwal, Omer Levy, Christopher D. Manning: https://arxiv.org/abs/1906.04341

-In order to help this new field develop, we have included a few additional features in the BERT/GPT/GPT-2 models to help people access the inner representations, mainly adapted  from the great work of Paul Michel (https://arxiv.org/abs/1905.10650):
+In order to help this new field develop, we have included a few additional features in the BERT/GPT/GPT-2 models to help people access the inner representations, mainly adapted from the great work of Paul Michel (https://arxiv.org/abs/1905.10650):


 * accessing all the hidden-states of BERT/GPT/GPT-2,
 * accessing all the attention weights for each head of BERT/GPT/GPT-2,
 * retrieving heads output values and gradients to be able to compute head importance score and prune head as explained in https://arxiv.org/abs/1905.10650.

-To help you understand and use these features, we have added a specific example script: `bertology.py <https://github.com/huggingface/transformers/blob/master/examples/run_bertology.py>`_ while extract information and prune a model pre-trained on GLUE.
+To help you understand and use these features, we have added a specific example script: `run_bertology.py <https://github.com/huggingface/transformers/blob/master/examples/bertology/run_bertology.py>`_ which extracts information from, and prunes, a model pre-trained on GLUE.
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -20,13 +20,13 @@ sys.path.insert(0, os.path.abspath('../../src'))
 # -- Project information -----------------------------------------------------

 project = u'transformers'
-copyright = u'2019, huggingface'
+copyright = u'2020, huggingface'
 author = u'huggingface'

 # The short X.Y version
 version = u''
 # The full version, including alpha/beta/rc tags
-release = u'2.4.1'
+release = u'2.11.0'


 # -- General configuration ---------------------------------------------------
@@ -105,6 +105,12 @@ html_static_path = ['_static']
 #
 # html_sidebars = {}

+# This must be the name of an image file (path relative to the configuration 
+# directory) that is the favicon of the docs. Modern browsers use this as 
+# the icon for tabs, windows and bookmarks. It should be a Windows-style 
+# icon file (.ico).
+html_favicon = 'favicon.ico'
+

 # -- Options for HTMLHelp output ---------------------------------------------

--- a/docs/source/converting_tensorflow_models.rst
+++ b/docs/source/converting_tensorflow_models.rst
@@ -12,7 +12,7 @@ A command-line interface is provided to convert original Bert/GPT/GPT-2/Transfor
 BERT
 ^^^^

-You can convert any TensorFlow checkpoint for BERT (in particular `the pre-trained models released by Google <https://github.com/google-research/bert#pre-trained-models>`_\ ) in a PyTorch save file by using the `convert_tf_checkpoint_to_pytorch.py <https://github.com/huggingface/transformers/blob/master/transformers/convert_tf_checkpoint_to_pytorch.py>`_ script.
+You can convert any TensorFlow checkpoint for BERT (in particular `the pre-trained models released by Google <https://github.com/google-research/bert#pre-trained-models>`_\ ) in a PyTorch save file by using the `convert_bert_original_tf_checkpoint_to_pytorch.py <https://github.com/huggingface/transformers/blob/master/src/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py>`_ script.

 This CLI takes as input a TensorFlow checkpoint (three files starting with ``bert_model.ckpt``\ ) and the associated configuration file (\ ``bert_config.json``\ ), and creates a PyTorch model for this configuration, loads the weights from the TensorFlow checkpoint in the PyTorch model and saves the resulting model in a standard PyTorch save file that can be imported using ``torch.load()`` (see examples in `run_bert_extract_features.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_extract_features.py>`_\ , `run_bert_classifier.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_classifier.py>`_ and `run_bert_squad.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_squad.py>`_\ ).

@@ -33,6 +33,26 @@ Here is an example of the conversion process for a pre-trained ``BERT-Base Uncas

 You can download Google's pre-trained models for the conversion `here <https://github.com/google-research/bert#pre-trained-models>`__.

+ALBERT
+^^^^^^
+
+Convert TensorFlow model checkpoints of ALBERT to PyTorch using the `convert_albert_original_tf_checkpoint_to_pytorch.py <https://github.com/huggingface/transformers/blob/master/src/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py>`_ script.
+
+The CLI takes as input a TensorFlow checkpoint (three files starting with ``model.ckpt-best``\ ) and the accompanying configuration file (\ ``albert_config.json``\ ), then creates and saves a PyTorch model. To run this conversion you will need to have TensorFlow and PyTorch installed.
+
+Here is an example of the conversion process for the pre-trained ``ALBERT Base`` model:
+
+.. code-block:: shell
+
+   export ALBERT_BASE_DIR=/path/to/albert/albert_base
+
+   transformers-cli convert --model_type albert \
+     --tf_checkpoint $ALBERT_BASE_DIR/model.ckpt-best \
+     --config $ALBERT_BASE_DIR/albert_config.json \
+     --pytorch_dump_output $ALBERT_BASE_DIR/pytorch_model.bin
+
+You can download Google's pre-trained models for the conversion `here <https://github.com/google-research/albert#pre-trained-models>`__.
+
 OpenAI GPT
 ^^^^^^^^^^

--- a/docs/source/favicon.ico
+++ b/docs/source/favicon.ico
--- a/docs/source/glossary.rst
+++ b/docs/source/glossary.rst
@@ -143,3 +143,14 @@ positional embeddings.

 Absolute positional embeddings are selected in the range ``[0, config.max_position_embeddings - 1]``. Some models
 use other types of positional embeddings, such as sinusoidal position embeddings or relative position embeddings.
+
+
+Feed Forward Chunking
+--------------------------
+
+In transformers, two feed forward layers usually follow the self attention layer in each residual attention block. The intermediate embedding size of the feed forward layers is often bigger than the hidden size of the model (*e.g.* for ``bert-base-uncased``).
+
+For an input of size ``[batch_size, sequence_length]``, the memory required to store the intermediate feed forward embeddings ``[batch_size, sequence_length, config.intermediate_size]`` can account for a large fraction of the memory use. The authors of `Reformer: The Efficient Transformer <https://arxiv.org/abs/2001.04451>`_ noticed that since the computation is independent of the ``sequence_length`` dimension, it is mathematically equivalent to compute the output embeddings of both feed forward layers ``[batch_size, config.hidden_size]_0, ..., [batch_size, config.hidden_size]_n``  individually and concat them afterward to ``[batch_size, sequence_length, config.hidden_size]`` with ``n = sequence_length``, which trades increased computation time against reduced memory use, but yields a mathematically **equivalent** result.
+
+For models employing the function :func:`~.transformers.apply_chunking_to_forward`, the ``chunk_size`` defines the number of output embeddings that are computed in parallel and thus defines the trade-off between memory and time complexity. 
+If ``chunk_size`` is set to 0, no feed forward chunking is done.
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -61,6 +61,7 @@ The library currently contains PyTorch and Tensorflow implementations, pre-train
    quickstart
    glossary
    pretrained_models
+    usage
    model_sharing
    examples
    notebooks
@@ -79,6 +80,7 @@ The library currently contains PyTorch and Tensorflow implementations, pre-train
    main_classes/configuration
    main_classes/model
    main_classes/tokenizer
+    main_classes/pipelines
    main_classes/optimizer_schedules
    main_classes/processors

@@ -87,6 +89,7 @@ The library currently contains PyTorch and Tensorflow implementations, pre-train
    :caption: Package Reference

    model_doc/auto
+    model_doc/encoderdecoder
    model_doc/bert
    model_doc/gpt
    model_doc/transformerxl
@@ -99,4 +102,11 @@ The library currently contains PyTorch and Tensorflow implementations, pre-train
    model_doc/camembert
    model_doc/albert
    model_doc/xlmroberta
-    model_doc/flaubert
+    model_doc/flaubert
+    model_doc/bart
+    model_doc/t5
+    model_doc/electra
+    model_doc/dialogpt
+    model_doc/reformer
+    model_doc/marian
+    model_doc/longformer
--- a/docs/source/installation.md
+++ b/docs/source/installation.md
@@ -1,6 +1,6 @@
 # Installation

-Transformers is tested on Python 3.5+ and PyTorch 1.1.0
+Transformers is tested on Python 3.6+ and PyTorch 1.1.0

 ## With pip

--- a/docs/source/main_classes/model.rst
+++ b/docs/source/main_classes/model.rst
@@ -14,6 +14,12 @@ The base class ``PreTrainedModel`` implements the common methods for loading/sav
 .. autoclass:: transformers.PreTrainedModel
    :members:

+``Helper Functions``
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autofunction:: transformers.apply_chunking_to_forward
+
+
 ``TFPreTrainedModel``
 ~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/main_classes/pipelines.rst
+++ b/docs/source/main_classes/pipelines.rst
@@ -0,0 +1,74 @@
+Pipelines
+----------------------------------------------------
+
+The pipelines are a great and easy way to use models for inference. These pipelines are objects that abstract most
+of the complex code from the library, offering a simple API dedicated to several tasks, including Named Entity
+Recognition, Masked Language Modeling, Sentiment Analysis, Feature Extraction and Question Answering.
+
+There are two categories of pipeline abstractions to be aware of:
+
+- The :class:`~transformers.pipeline` which is the most powerful object encapsulating all other pipelines
+- The other task-specific pipelines, such as :class:`~transformers.NerPipeline`
+  or :class:`~transformers.QuestionAnsweringPipeline`
+
+The pipeline abstraction
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The `pipeline` abstraction is a wrapper around all the other available pipelines. It is instantiated as any
+other pipeline but requires an additional argument which is the `task`.
+
+.. autoclass:: transformers.pipeline
+    :members:
+
+
+The task specific pipelines
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Parent class: Pipeline
+=========================================
+
+.. autoclass:: transformers.Pipeline
+    :members: predict, transform, save_pretrained
+
+NerPipeline
+==========================================
+
+.. autoclass:: transformers.NerPipeline
+
+TokenClassificationPipeline
+==========================================
+
+This class is an alias of the :class:`~transformers.NerPipeline` defined above. Please refer to that pipeline for
+documentation and usage examples.
+
+FillMaskPipeline
+==========================================
+
+.. autoclass:: transformers.FillMaskPipeline
+
+FeatureExtractionPipeline
+==========================================
+
+.. autoclass:: transformers.FeatureExtractionPipeline
+
+TextClassificationPipeline
+==========================================
+
+.. autoclass:: transformers.TextClassificationPipeline
+
+QuestionAnsweringPipeline
+==========================================
+
+.. autoclass:: transformers.QuestionAnsweringPipeline
+
+
+SummarizationPipeline
+==========================================
+
+.. autoclass:: transformers.SummarizationPipeline
+
+
+TextGenerationPipeline
+==========================================
+
+.. autoclass:: transformers.TextGenerationPipeline
--- a/docs/source/main_classes/processors.rst
+++ b/docs/source/main_classes/processors.rst
@@ -54,7 +54,7 @@ Additionally, the following method  can be used to load values from a data file
 Example usage
 ^^^^^^^^^^^^^^^^^^^^^^^^^

-An example using these processors is given in the `run_glue.py <https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_glue.py>`__ script.
+An example using these processors is given in the `run_glue.py <https://github.com/huggingface/pytorch-transformers/blob/master/examples/text-classification/run_glue.py>`__ script.


 XNLI
@@ -63,7 +63,7 @@ XNLI
 `The Cross-Lingual NLI Corpus (XNLI) <https://www.nyu.edu/projects/bowman/xnli/>`__ is a benchmark that evaluates
 the quality of cross-lingual text representations. 
 XNLI is crowd-sourced dataset based on `MultiNLI <http://www.nyu.edu/projects/bowman/multinli/>`: pairs of text are labeled with textual entailment 
-annotations for 15 different languages (including both high-ressource language such as English and low-ressource languages such as Swahili).
+annotations for 15 different languages (including both high-resource language such as English and low-resource languages such as Swahili).

 It was released together with the paper
 `XNLI: Evaluating Cross-lingual Sentence Representations <https://arxiv.org/abs/1809.05053>`__
@@ -74,7 +74,7 @@ This library hosts the processor to load the XNLI data:
 Please note that since the gold labels are available on the test set, evaluation is performed on the test set.

 An example using these processors is given in the
-`run_xnli.py <https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_xnli.py>`__ script.
+`run_xnli.py <https://github.com/huggingface/pytorch-transformers/blob/master/examples/text-classification/run_xnli.py>`__ script.


 SQuAD
@@ -150,4 +150,4 @@ Example::


 Another example using these processors is given in the
-`run_squad.py <https://github.com/huggingface/transformers/blob/master/examples/run_squad.py>`__ script.
+`run_squad.py <https://github.com/huggingface/transformers/blob/master/examples/question-answering/run_squad.py>`__ script.
--- a/docs/source/main_classes/tokenizer.rst
+++ b/docs/source/main_classes/tokenizer.rst
@@ -1,16 +1,38 @@
 Tokenizer
 ----------------------------------------------------

-The base class ``PreTrainedTokenizer`` implements the common methods for loading/saving a tokenizer either from a local file or directory, or from a pretrained tokenizer provided by the library (downloaded from HuggingFace's AWS S3 repository).
+A tokenizer is in charge of preparing the inputs for a model. The library comprises tokenizers for all the models. Most of the tokenizers are available in two flavors: a full python implementation and a "Fast" implementation based on the Rust library `tokenizers`. The "Fast" implementations allow (1) a significant speed-up in particular when doing batched tokenization and (2) additional methods to map between the original string (character and words) and the token space (e.g. getting the index of the token comprising a given character or the span of characters corresponding to a given token). Currently no "Fast" implementation is available for the SentencePiece-based tokenizers (for T5, ALBERT, CamemBERT, XLMRoBERTa and XLNet models).

-``PreTrainedTokenizer`` is the main entry point into tokenizers as it also implements the main methods for using all the tokenizers:
+The base classes ``PreTrainedTokenizer`` and ``PreTrainedTokenizerFast`` implement the common methods for encoding string inputs into model inputs (see below) and instantiating/saving python and "Fast" tokenizers either from a local file or directory or from a pretrained tokenizer provided by the library (downloaded from HuggingFace's AWS S3 repository).

- tokenizing, converting tokens to ids and back and encoding/decoding,
+``PreTrainedTokenizer`` and ``PreTrainedTokenizerFast`` thus implement the main methods for using all the tokenizers:
+
+- tokenizing (splitting strings into sub-word token strings), converting token strings to ids and back, and encoding/decoding (i.e. tokenizing + converting to integers),
 - adding new tokens to the vocabulary in a way that is independant of the underlying structure (BPE, SentencePiece...),
- managing special tokens (adding them, assigning them to roles, making sure they are not split during tokenization)
+- managing special tokens like mask, beginning-of-sentence, etc tokens (adding them, assigning them to attributes in the tokenizer for easy access and making sure they are not split during tokenization)
+
+``BatchEncoding`` holds the output of the tokenizer's encoding methods (``encode_plus`` and ``batch_encode_plus``) and is derived from a Python dictionary. When the tokenizer is a pure python tokenizer, this class behaves just like a standard python dictionary and holds the various model inputs computed by these methods (``input_ids``, ``attention_mask``...). When the tokenizer is a "Fast" tokenizer (i.e. backed by HuggingFace tokenizers library), this class provides in addition several advanced alignment methods which can be used to map between the original string (character and words) and the token space (e.g. getting the index of the token comprising a given character or the span of characters corresponding to a given token).

 ``PreTrainedTokenizer``
 ~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.PreTrainedTokenizer
    :members:
+
+``PreTrainedTokenizerFast``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.PreTrainedTokenizerFast
+    :members:
+
+``BatchEncoding``
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BatchEncoding
+    :members:
+
+``SpecialTokensMixin``
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.SpecialTokensMixin
+    :members:
--- a/docs/source/migration.md
+++ b/docs/source/migration.md
@@ -1,5 +1,18 @@
-# Migrating from pytorch-pretrained-bert
+# Migrating from previous packages

+## Migrating from pytorch-transformers to transformers
+
+Here is a quick summary of what you should take care of when migrating from `pytorch-transformers` to `transformers`.
+
+### Positional order of some models' keywords inputs (`attention_mask`, `token_type_ids`...) changed
+
+To be able to use TorchScript (see #1010, #1204 and #1195), the specific order of some models' **keyword inputs** (`attention_mask`, `token_type_ids`...) has been changed.
+
+If you used to call the models with keyword names for keyword arguments, e.g. `model(inputs_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)`, this should not cause any change.
+
+If you used to call the models with positional inputs for keyword arguments, e.g. `model(inputs_ids, attention_mask, token_type_ids)`, you may have to double check the exact order of input arguments.
+
+## Migrating from pytorch-pretrained-bert

 Here is a quick summary of what you should take care of when migrating from `pytorch-pretrained-bert` to `transformers`

@@ -27,7 +40,7 @@ loss = outputs[0]
 # In transformers you can also have access to the logits:
 loss, logits = outputs[:2]

-# And even the attention weigths if you configure the model to output them (and other outputs too, see the docstrings and documentation)
+# And even the attention weights if you configure the model to output them (and other outputs too, see the docstrings and documentation)
 model = BertForSequenceClassification.from_pretrained('bert-base-uncased', output_attentions=True)
 outputs = model(input_ids, labels=labels)
 loss, logits, attentions = outputs
--- a/docs/source/model_doc/albert.rst
+++ b/docs/source/model_doc/albert.rst
@@ -6,7 +6,7 @@ Overview

 The ALBERT model was proposed in `ALBERT: A Lite BERT for Self-supervised Learning of Language Representations <https://arxiv.org/abs/1909.11942>`_
 by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. It presents
-two parameter-reduction techniques to lower memory consumption and increase the trainig speed of BERT:
+two parameter-reduction techniques to lower memory consumption and increase the training speed of BERT:

 - Splitting the embedding matrix into two smaller matrices
 - Using repeating layers split among groups
@@ -30,6 +30,8 @@ Tips:
  similar to a BERT-like architecture with the same number of hidden layers as it has to iterate through the same
  number of (repeating) layers.

+The original code can be found `here <https://github.com/google-research/ALBERT>`_.
+
 AlbertConfig
 ~~~~~~~~~~~~~~~~~~~~~

@@ -41,7 +43,8 @@ AlbertTokenizer
 ~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.AlbertTokenizer
-    :members:
+    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences, save_vocabulary


 AlbertModel
@@ -91,3 +94,17 @@ TFAlbertForSequenceClassification

 .. autoclass:: transformers.TFAlbertForSequenceClassification
    :members:
+
+
+TFAlbertForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFAlbertForMultipleChoice
+    :members:
+
+
+TFAlbertForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFAlbertForQuestionAnswering
+    :members:
--- a/docs/source/model_doc/bart.rst
+++ b/docs/source/model_doc/bart.rst
@@ -0,0 +1,56 @@
+Bart
+----------------------------------------------------
+**DISCLAIMER:** If you see something strange,
+file a `Github Issue <https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title>`__ and assign
+@sshleifer
+
+Paper
+~~~~~
+The Bart model was `proposed <https://arxiv.org/abs/1910.13461>`_ by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer on 29 Oct, 2019.
+According to the abstract,
+
+- Bart uses a standard seq2seq/machine translation architecture with a bidirectional encoder (like BERT) and a left-to-right decoder (like GPT).
+- The pretraining task involves randomly shuffling the order of the original sentences and a novel in-filling scheme, where spans of text are replaced with a single mask token.
+- BART is particularly effective when fine tuned for text generation but also works well for comprehension tasks. It matches the performance of RoBERTa with comparable training resources on GLUE and SQuAD, achieves new state-of-the-art results on a range of abstractive dialogue, question answering, and summarization tasks, with gains of up to 6 ROUGE.
+
+The Authors' code can be found `here <https://github.com/pytorch/fairseq/tree/master/examples/bart>`_
+
+
+Implementation Notes
+~~~~~~~~~~~~~~~~~~~~
+- Bart doesn't use :obj:`token_type_ids` for sequence classification. Use BartTokenizer.encode to get the proper splitting.
+- The forward pass of ``BartModel`` will create decoder inputs (using the helper function ``transformers.modeling_bart._prepare_bart_decoder_inputs``)  if they are not passed. This is different than some other modeling APIs.
+- Model predictions are intended to be identical to the original implementation. This only works, however, if the string you pass to ``fairseq.encode`` starts with a space.
+- ``BartForConditionalGeneration.generate`` should be used for conditional generation tasks like summarization; see the example in its docstring.
+- Models that load the ``"facebook/bart-large-cnn"`` weights will not have a ``mask_token_id``, or be able to perform mask filling tasks.
+
+
+
+BartModel
+~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BartModel
+    :members: forward
+
+.. autofunction:: transformers.modeling_bart._prepare_bart_decoder_inputs
+
+
+BartForConditionalGeneration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BartForConditionalGeneration
+    :members: generate, forward
+
+
+BartForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BartForSequenceClassification
+    :members: forward
+
+BartConfig
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BartConfig
+    :members:
+
--- a/docs/source/model_doc/bert.rst
+++ b/docs/source/model_doc/bert.rst
@@ -35,6 +35,8 @@ Tips:
  prediction rather than a token prediction. However, averaging over the sequence may yield better results than using
  the [CLS] token.

+The original code can be found `here <https://github.com/google-research/bert>`_.
+
 BertConfig
 ~~~~~~~~~~~~~~~~~~~~~

@@ -46,6 +48,14 @@ BertTokenizer
 ~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.BertTokenizer
+    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences, save_vocabulary
+
+
+BertTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BertTokenizerFast
    :members:


--- a/docs/source/model_doc/camembert.rst
+++ b/docs/source/model_doc/camembert.rst
@@ -22,6 +22,8 @@ Tips:
 - This implementation is the same as RoBERTa. Refer to the `documentation of RoBERTa <./roberta.html>`__ for usage
  examples as well as the information relative to the inputs and outputs.

+The original code can be found `here <https://camembert-model.fr/>`_.
+
 CamembertConfig
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -33,7 +35,8 @@ CamembertTokenizer
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.CamembertTokenizer
-    :members:
+    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences, save_vocabulary


 CamembertModel
--- a/docs/source/model_doc/ctrl.rst
+++ b/docs/source/model_doc/ctrl.rst
@@ -31,6 +31,8 @@ Tips:
  See `reusing the past in generative models <../quickstart.html#using-the-past>`_ for more information on the usage
  of this argument.

+The original code can be found `here <https://github.com/salesforce/ctrl>`_.
+

 CTRLConfig
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -43,7 +45,7 @@ CTRLTokenizer
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.CTRLTokenizer
-    :members:
+    :members: save_vocabulary


 CTRLModel
--- a/docs/source/model_doc/dialogpt.rst
+++ b/docs/source/model_doc/dialogpt.rst
@@ -0,0 +1,39 @@
+DialoGPT
+----------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~
+
+DialoGPT was proposed in
+`DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation <https://arxiv.org/abs/1911.00536>`_
+by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
+It's a GPT2 Model trained on 147M conversation-like exchanges extracted from Reddit.
+
+The abstract from the paper is the following:
+
+*We present a large, tunable neural conversational response generation model, DialoGPT (dialogue generative pre-trained transformer). 
+Trained on 147M conversation-like exchanges extracted from Reddit comment chains over a period spanning from 2005 through 2017, DialoGPT extends the Hugging Face PyTorch transformer to attain a performance close to human both in terms of automatic and human evaluation in single-turn dialogue settings.
+We show that conversational systems that leverage DialoGPT generate more relevant, contentful and context-consistent responses than strong baseline systems.
+The pre-trained model and training pipeline are publicly released to facilitate research into neural response generation and the development of more intelligent open-domain dialogue systems.*
+
+Tips:
+
+- DialoGPT is a model with absolute position embeddings so it's usually advised to pad the inputs on
+  the right rather than the left.
+- DialoGPT was trained with a causal language modeling (CLM) objective on conversational data and is therefore powerful at response generation in open-domain dialogue systems.
+- DialoGPT enables the user to create a chat bot in just 10 lines of code as shown on `DialoGPT's model card <https://huggingface.co/microsoft/DialoGPT-medium>`_.
+
+Training:
+
+In order to train or fine-tune DialoGPT, one can use causal language modeling training. 
+To cite the official paper: 
+*We follow the OpenAI GPT-2 to model a multiturn dialogue session 
+as a long text and frame the generation task as language modeling. We first
+concatenate all dialog turns within a dialogue session into a long text 
+x_1,..., x_N (N is the sequence length), ended by the end-of-text token.* 
+For more information please refer to the original paper.
+    
+
+DialoGPT's architecture is based on the GPT2 model, so one can refer to GPT2's `docstring <https://huggingface.co/transformers/model_doc/gpt2.html>`_.
+
+The original code can be found `here <https://github.com/microsoft/DialoGPT>`_.
--- a/docs/source/model_doc/distilbert.rst
+++ b/docs/source/model_doc/distilbert.rst
@@ -27,6 +27,8 @@ Tips:
 - DistilBert doesn't have `token_type_ids`, you don't need to indicate which token belongs to which segment. Just separate your segments with the separation token `tokenizer.sep_token` (or `[SEP]`)
 - DistilBert doesn't have options to select the input positions (`position_ids` input). This could be added if necessary though, just let's us know if you need this option.

+The original code can be found `here <https://github.com/huggingface/transformers/tree/master/examples/distillation>`_.
+

 DistilBertConfig
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -42,6 +44,13 @@ DistilBertTokenizer
    :members:


+DistilBertTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.DistilBertTokenizerFast
+    :members:
+
+
 DistilBertModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_doc/electra.rst
+++ b/docs/source/model_doc/electra.rst
@@ -0,0 +1,124 @@
+ELECTRA
+----------------------------------------------------
+
+The ELECTRA model was proposed in the paper
+`ELECTRA: Pre-training Text Encoders as Discriminators Rather Than Generators <https://openreview.net/pdf?id=r1xMH1BtvB>`__.
+ELECTRA is a new pre-training approach which trains two transformer models: the generator and the discriminator. The
+generator's role is to replace tokens in a sequence, and is therefore trained as a masked language model. The discriminator,
+which is the model we're interested in, tries to identify which tokens were replaced by the generator in the sequence.
+
+The abstract from the paper is the following:
+
+*Masked language modeling (MLM) pre-training methods such as BERT corrupt
+the input by replacing some tokens with [MASK] and then train a model to
+reconstruct the original tokens. While they produce good results when transferred
+to downstream NLP tasks, they generally require large amounts of compute to be
+effective. As an alternative, we propose a more sample-efficient pre-training task
+called replaced token detection. Instead of masking the input, our approach
+corrupts it by replacing some tokens with plausible alternatives sampled from a small
+generator network. Then, instead of training a model that predicts the original
+identities of the corrupted tokens, we train a discriminative model that predicts
+whether each token in the corrupted input was replaced by a generator sample
+or not. Thorough experiments demonstrate this new pre-training task is more
+efficient than MLM because the task is defined over all input tokens rather than
+just the small subset that was masked out. As a result, the contextual representations
+learned by our approach substantially outperform the ones learned by BERT
+given the same model size, data, and compute. The gains are particularly strong
+for small models; for example, we train a model on one GPU for 4 days that
+outperforms GPT (trained using 30x more compute) on the GLUE natural language
+understanding benchmark. Our approach also works well at scale, where it
+performs comparably to RoBERTa and XLNet while using less than 1/4 of their
+compute and outperforms them when using the same amount of compute.*
+
+Tips:
+
+- ELECTRA is the pre-training approach, therefore there are nearly no changes made to the underlying model: BERT. The
+  only change is the separation of the embedding size and the hidden size -> The embedding size is generally smaller,
+  while the hidden size is larger. An additional projection layer (linear) is used to project the embeddings from
+  their embedding size to the hidden size. In the case where the embedding size is the same as the hidden size, no
+  projection layer is used.
+- The ELECTRA checkpoints saved using `Google Research's implementation <https://github.com/google-research/electra>`__
+  contain both the generator and discriminator. The conversion script requires the user to name which model to export
+  into the correct architecture. Once converted to the HuggingFace format, these checkpoints may be loaded into all
+  available ELECTRA models, however. This means that the discriminator may be loaded in the `ElectraForMaskedLM` model,
+  and the generator may be loaded in the `ElectraForPreTraining` model (the classification head will be randomly
+  initialized as it doesn't exist in the generator).
+
+The original code can be found `here <https://github.com/google-research/electra>`_.
+
+
+ElectraConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ElectraConfig
+    :members:
+
+
+ElectraTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ElectraTokenizer
+    :members:
+
+
+ElectraTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ElectraTokenizerFast
+    :members:
+
+
+ElectraModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ElectraModel
+    :members:
+
+
+ElectraForPreTraining
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ElectraForPreTraining
+    :members:
+
+
+ElectraForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ElectraForMaskedLM
+    :members:
+
+
+ElectraForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ElectraForTokenClassification
+    :members:
+
+
+TFElectraModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFElectraModel
+    :members:
+
+
+TFElectraForPreTraining
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFElectraForPreTraining
+    :members:
+
+
+TFElectraForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFElectraForMaskedLM
+    :members:
+
+
+TFElectraForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFElectraForTokenClassification
+    :members:
--- a/docs/source/model_doc/encoderdecoder.rst
+++ b/docs/source/model_doc/encoderdecoder.rst
@@ -0,0 +1,23 @@
+Encoder Decoder Models
+----------------------------------------------------
+
+This class can wrap an encoder model, such as ``BertModel``, and a decoder model with a language modeling head, such as ``BertForMaskedLM``, into an encoder-decoder model.
+
+The ``EncoderDecoderModel`` class allows one to instantiate an encoder-decoder model using the ``from_encoder_decoder_pretrained`` class method, which takes a pretrained encoder and a pretrained decoder model as input.
+The ``EncoderDecoderModel`` is saved using the standard ``save_pretrained()`` method and can be loaded again using the standard ``from_pretrained()`` method.
+
+An application of this architecture could be *summarization* using two pretrained Bert models as is shown in the paper: `Text Summarization with Pretrained Encoders <https://arxiv.org/abs/1908.08345>`_ by Yang Liu and Mirella Lapata.
+
+
+``EncoderDecoderConfig``
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.EncoderDecoderConfig
+    :members:
+
+
+``EncoderDecoderModel``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.EncoderDecoderModel
+    :members:
--- a/docs/source/model_doc/flaubert.rst
+++ b/docs/source/model_doc/flaubert.rst
@@ -20,6 +20,8 @@ of the time they outperform other pre-training approaches. Different versions of
 evaluation protocol for the downstream tasks, called FLUE (French Language Understanding Evaluation), are shared
 to the research community for further reproducible experiments in French NLP.*

+The original code can be found `here <https://github.com/getalp/Flaubert>`_.
+

 FlaubertConfig
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--- a/docs/source/model_doc/gpt.rst
+++ b/docs/source/model_doc/gpt.rst
@@ -36,6 +36,9 @@ Tips:
 `Write With Transformer <https://transformer.huggingface.co/doc/gpt>`__ is a webapp created and hosted by
 Hugging Face showcasing the generative capabilities of several models. GPT is one of them.

+The original code can be found `here <https://github.com/openai/finetune-transformer-lm>`_.
+
+
 OpenAIGPTConfig
 ~~~~~~~~~~~~~~~~~~~~~

@@ -47,6 +50,13 @@ OpenAIGPTTokenizer
 ~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.OpenAIGPTTokenizer
+    :members: save_vocabulary
+
+
+OpenAIGPTTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.OpenAIGPTTokenizerFast
    :members:


--- a/docs/source/model_doc/gpt2.rst
+++ b/docs/source/model_doc/gpt2.rst
@@ -5,7 +5,7 @@ Overview
 ~~~~~~~~~~~~~~~~~~~~~

 OpenAI GPT-2 model was proposed in
-`Language Models are Unsupervised Multitask Learners`_
+`Language Models are Unsupervised Multitask Learners <https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf>`_
 by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
 It's a causal (unidirectional) transformer pre-trained using  language modeling on a very large
 corpus of ~40 GB of text data.
@@ -34,6 +34,8 @@ Tips:
 Hugging Face showcasing the generative capabilities of several models. GPT-2 is one of them and is available in five
 different sizes: small, medium, large, xl and a distilled version of the small checkpoint: distilgpt-2.

+The original code can be found `here <https://openai.com/blog/better-language-models/>`_.
+

 GPT2Config
 ~~~~~~~~~~~~~~~~~~~~~
@@ -46,6 +48,13 @@ GPT2Tokenizer
 ~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.GPT2Tokenizer
+    :members: save_vocabulary
+
+
+GPT2TokenizerFast
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.GPT2TokenizerFast
    :members:


--- a/docs/source/model_doc/longformer.rst
+++ b/docs/source/model_doc/longformer.rst
@@ -0,0 +1,91 @@
+Longformer
+----------------------------------------------------
+**DISCLAIMER:** This model is still a work in progress, if you see something strange,
+file a `Github Issue <https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title>`_
+
+Overview
+~~~~~~~~
+The Longformer model was presented in `Longformer: The Long-Document Transformer <https://arxiv.org/pdf/2004.05150.pdf>`_ by Iz Beltagy, Matthew E. Peters, Arman Cohan.
+Here is the abstract:
+
+*Transformer-based models are unable to process long sequences due to their self-attention operation, which scales quadratically with the sequence length. To address this limitation, we introduce the Longformer with an attention mechanism that scales linearly with sequence length, making it easy to process documents of thousands of tokens or longer. Longformer's attention mechanism is a drop-in replacement for the standard self-attention and combines a local windowed attention with a task motivated global attention. Following prior work on long-sequence transformers, we evaluate Longformer on character-level language modeling and achieve state-of-the-art results on text8 and enwik8. In contrast to most prior work, we also pretrain Longformer and finetune it on a variety of downstream tasks. Our pretrained Longformer consistently outperforms RoBERTa on long document tasks and sets new state-of-the-art results on WikiHop and TriviaQA.*
+
+The Authors' code can be found `here <https://github.com/allenai/longformer>`_ .
+
+Longformer Self Attention
+~~~~~~~~~~~~~~~~~~~~
+Longformer self attention employs self attention on both a "local" context and a "global" context.
+Most tokens only attend "locally" to each other meaning that each token attends to its :math:`\frac{1}{2} w` previous tokens and :math:`\frac{1}{2} w` succeeding tokens with :math:`w` being the window length as defined in `config.attention_window`. Note that `config.attention_window` can be of type ``list`` to define a different :math:`w` for each layer.
+A selected few tokens attend "globally" to all other tokens, as it is conventionally done for all tokens in *e.g.* `BertSelfAttention`.
+
+Note that "locally" and "globally" attending tokens are projected by different query, key and value matrices.
+Also note that every "locally" attending token not only attends to tokens within its window :math:`w`, but also to all "globally" attending tokens so that global attention is *symmetric*.
+
+The user can define which tokens attend "locally" and which tokens attend "globally" by setting the tensor `global_attention_mask` at run-time appropriately. `Longformer` employs the following logic for `global_attention_mask`: `0` - the token attends "locally", `1` - the token attends "globally". For more information please also refer to the :func:`~transformers.LongformerModel.forward` method.
+
+Using Longformer self attention, the memory and time complexity of the query-key matmul operation, which usually represents the memory and time bottleneck, can be reduced from :math:`\mathcal{O}(n_s \times n_s)` to :math:`\mathcal{O}(n_s \times w)`, with :math:`n_s` being the sequence length and :math:`w` being the average window size. It is assumed that the number of "globally" attending tokens is insignificant as compared to the number of "locally" attending tokens.
+
+For more information, please refer to the official `paper <https://arxiv.org/pdf/2004.05150.pdf>`_ .
+
+
+Training
+~~~~~~~~~~~~~~~~~~~~
+``LongformerForMaskedLM`` is trained the exact same way as ``RobertaForMaskedLM`` and
+should be used as follows:
+
+::
+
+  input_ids = tokenizer.encode('This is a sentence from [MASK] training data', return_tensors='pt')
+  mlm_labels = tokenizer.encode('This is a sentence from the training data', return_tensors='pt')
+
+  loss = model(input_ids, labels=input_ids, masked_lm_labels=mlm_labels)[0]
+
+
+LongformerConfig
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LongformerConfig
+    :members:
+
+
+LongformerTokenizer
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LongformerTokenizer
+    :members: 
+
+
+LongformerModel
+~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LongformerModel
+    :members:
+
+
+LongformerForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LongformerForMaskedLM
+    :members:
+
+
+LongformerForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LongformerForQuestionAnswering
+    :members:
+
+
+LongformerForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LongformerForMultipleChoice
+    :members:
+
+
+LongformerForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LongformerForTokenClassification
+    :members:
+
--- a/docs/source/model_doc/marian.rst
+++ b/docs/source/model_doc/marian.rst
@@ -0,0 +1,105 @@
+MarianMT
+----------------------------------------------------
+**DISCLAIMER:** If you see something strange,
+file a `Github Issue <https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title>`__ and assign
+@sshleifer. Translations should be similar, but not identical to, output in the test set linked to in each model card.
+
+Implementation Notes
+~~~~~~~~~~~~~~~~~~~~
+- each model is about 298 MB on disk, there are 1,000+ models.
+- The list of supported language pairs can be found `here <https://huggingface.co/Helsinki-NLP>`__.
+- The 1,000+ models were originally trained by `Jörg Tiedemann <https://researchportal.helsinki.fi/en/persons/j%C3%B6rg-tiedemann>`__ using the `Marian <https://marian-nmt.github.io/>`_ C++ library, which supports fast training and translation.
+- All models are transformer encoder-decoders with 6 layers in each component. Each model's performance is documented in a model card.
+- the 80 opus models that require BPE preprocessing are not supported.
+- The modeling code is the same as ``BartForConditionalGeneration`` with a few minor modifications:
+    - static (sinusoid) positional embeddings (``MarianConfig.static_position_embeddings=True``)
+    - a new final_logits_bias (``MarianConfig.add_bias_logits=True``)
+    - no layernorm_embedding (``MarianConfig.normalize_embedding=False``)
+    - the model starts generating with pad_token_id (which has 0 token_embedding) as the prefix. (Bart uses <s/>)
+- Code to bulk convert models can be found in ``convert_marian_to_pytorch.py``
+
+Naming
+~~~~~~
+- All  model names use the following format: ``Helsinki-NLP/opus-mt-{src}-{tgt}``
+- The language codes used to name models are inconsistent. Two digit codes can usually be found `here <https://developers.google.com/admin-sdk/directory/v1/languages>`_, three digit codes require googling "language code {code}".
+- Codes formatted like ``es_AR`` are usually ``code_{region}``. That one is Spanish documents from Argentina.
+
+
+Multilingual Models
+~~~~~~~~~~~~~~~~~~~~
+
+All  model names use the following format: ``Helsinki-NLP/opus-mt-{src}-{tgt}``:
+    - if ``src`` is in all caps, the model supports multiple input languages, you can figure out which ones by looking at the model card, or the Group Members `mapping <https://gist.github.com/sshleifer/6d20e7761931b08e73c3219027b97b8a>`_ .
+    - if ``tgt`` is in all caps, the model can output multiple languages, and you should specify a language code by prepending the desired output language to the src_text
+    - You can see a tokenizer's supported language codes in ``tokenizer.supported_language_codes``
+
+Example of translating english to many romance languages, using language codes:
+
+.. code-block:: python
+
+    from transformers import MarianMTModel, MarianTokenizer
+    src_text = [
+        '>>fr<< this is a sentence in english that we want to translate to french',
+        '>>pt<< This should go to portuguese',
+        '>>es<< And this to Spanish'
+    ]
+
+    model_name = 'Helsinki-NLP/opus-mt-en-ROMANCE'
+    tokenizer = MarianTokenizer.from_pretrained(model_name)
+    print(tokenizer.supported_language_codes)
+    model = MarianMTModel.from_pretrained(model_name)
+    translated = model.generate(**tokenizer.prepare_translation_batch(src_text))
+    tgt_text = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
+    # ["c'est une phrase en anglais que nous voulons traduire en français",
+    # 'Isto deve ir para o português.',
+    # 'Y esto al español']
+
+Sometimes, models were trained on collections of languages that do not resolve to a group. In this case, _ is used as a separator for src or tgt, as in ``'Helsinki-NLP/opus-mt-en_el_es_fi-en_el_es_fi'``. These still require language codes.
+There are many supported regional language codes, like ``>>es_ES<<`` (Spain) and ``>>es_AR<<`` (Argentina), that do not seem to change translations. I have not found these to provide different results than just using ``>>es<<``.
+
+For Example:
+    - ``Helsinki-NLP/opus-mt-NORTH_EU-NORTH_EU``: translates from all NORTH_EU languages (see `mapping <https://gist.github.com/sshleifer/6d20e7761931b08e73c3219027b97b8a>`_) to all NORTH_EU languages. Use a special language code like ``>>de<<`` to specify output language.
+    - ``Helsinki-NLP/opus-mt-ROMANCE-en``: translates from many romance languages to english, no codes needed since there is only 1 tgt language.
+
+
+
+.. code-block:: python
+
+    GROUP_MEMBERS = {
+     'ZH': ['cmn', 'cn', 'yue', 'ze_zh', 'zh_cn', 'zh_CN', 'zh_HK', 'zh_tw', 'zh_TW', 'zh_yue', 'zhs', 'zht', 'zh'],
+     'ROMANCE': ['fr', 'fr_BE', 'fr_CA', 'fr_FR', 'wa', 'frp', 'oc', 'ca', 'rm', 'lld', 'fur', 'lij', 'lmo', 'es', 'es_AR', 'es_CL', 'es_CO', 'es_CR', 'es_DO', 'es_EC', 'es_ES', 'es_GT', 'es_HN', 'es_MX', 'es_NI', 'es_PA', 'es_PE', 'es_PR', 'es_SV', 'es_UY', 'es_VE', 'pt', 'pt_br', 'pt_BR', 'pt_PT', 'gl', 'lad', 'an', 'mwl', 'it', 'it_IT', 'co', 'nap', 'scn', 'vec', 'sc', 'ro', 'la'],
+     'NORTH_EU': ['de', 'nl', 'fy', 'af', 'da', 'fo', 'is', 'no', 'nb', 'nn', 'sv'],
+     'SCANDINAVIA': ['da', 'fo', 'is', 'no', 'nb', 'nn', 'sv'],
+     'SAMI': ['se', 'sma', 'smj', 'smn', 'sms'],
+     'NORWAY': ['nb_NO', 'nb', 'nn_NO', 'nn', 'nog', 'no_nb', 'no'],
+     'CELTIC': ['ga', 'cy', 'br', 'gd', 'kw', 'gv']
+    }
+
+Code to see available pretrained models:
+
+.. code-block:: python
+
+    from transformers.hf_api import HfApi
+    model_list = HfApi().model_list()
+    org = "Helsinki-NLP"
+    model_ids = [x.modelId for x in model_list if x.modelId.startswith(org)]
+    suffix = [x.split('/')[1] for x in model_ids]
+    multi_models = [f'{org}/{s}' for s in suffix if s != s.lower()]
+
+MarianMTModel
+~~~~~~~~~~~~~
+
+Pytorch version of marian-nmt's transformer.h (c++). Designed for the OPUS-NMT translation checkpoints.
+Model API is identical to BartForConditionalGeneration.
+Available models are listed at `Model List <https://huggingface.co/models?search=Helsinki-NLP>`__
+This class inherits all functionality from ``BartForConditionalGeneration``, see that page for method signatures.
+
+.. autoclass:: transformers.MarianMTModel
+    :members:
+
+
+MarianTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MarianTokenizer
+    :members: prepare_translation_batch
--- a/docs/source/model_doc/reformer.rst
+++ b/docs/source/model_doc/reformer.rst
@@ -0,0 +1,114 @@
+Reformer
+----------------------------------------------------
+**DISCLAIMER:** This model is still a work in progress, if you see something strange,
+file a `Github Issue <https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title>`_
+
+Overview
+~~~~~~~~
+The Reformer model was presented in `Reformer: The Efficient Transformer <https://arxiv.org/abs/2001.04451>`_ by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
+Here is the abstract:
+
+*Large Transformer models routinely achieve state-of-the-art results on a number of tasks but training these models can be prohibitively costly, especially on long sequences. We introduce two techniques to improve the efficiency of Transformers. For one, we replace dot-product attention by one that uses locality-sensitive hashing, changing its complexity from O(L^2) to O(Llog(L)), where L is the length of the sequence. Furthermore, we use reversible residual layers instead of the standard residuals, which allows storing activations only once in the training process instead of N times, where N is the number of layers. The resulting model, the Reformer, performs on par with Transformer models while being much more memory-efficient and much faster on long sequences.*
+
+The Authors' code can be found `here <https://github.com/google/trax/tree/master/trax/models/reformer>`_ .
+
+Axial Positional Encodings
+~~~~~~~~~~~~~~~~~~~~
+Axial Positional Encodings were first implemented in Google's `trax library <https://github.com/google/trax/blob/4d99ad4965bab1deba227539758d59f0df0fef48/trax/layers/research/position_encodings.py#L29>`_ and developed by the authors of this model's paper. In models that are treating very long input sequences, the conventional position id encodings store an embeddings vector of size :math:`d` being the ``config.hidden_size`` for every position :math:`1, \ldots, n_s`, with :math:`n_s` being ``config.max_embedding_size``. *E.g.*, having a sequence length of :math:`n_s = 2^{19} \approx 0.5M` and a ``config.hidden_size`` of :math:`d = 2^{10} \approx 1000` would result in a position encoding matrix:
+
+.. math::
+    X_{i,j}, \text{ with } i \in \left[1,\ldots, d\right] \text{ and } j \in \left[1,\ldots, n_s\right] 
+
+which alone has over 500M parameters to store. Axial positional encodings factorize :math:`X_{i,j}` into two matrices: 
+
+.. math::
+    X^{1}_{i,j}, \text{ with } i \in \left[1,\ldots, d^1\right] \text{ and } j \in \left[1,\ldots, n_s^1\right] 
+
+and 
+
+.. math::
+    X^{2}_{i,j}, \text{ with } i \in \left[1,\ldots, d^2\right] \text{ and } j \in \left[1,\ldots, n_s^2\right] 
+
+with:
+
+.. math::
+    d = d^1 + d^2 \text{ and } n_s = n_s^1 \times n_s^2 .
+
+Therefore the following holds:
+
+.. math::
+    X_{i,j} = \begin{cases}
+                X^{1}_{i, k}, & \text{if }\ i < d^1 \text{ with } k = j \mod n_s^1 \\
+                X^{2}_{i - d^1, l}, & \text{if } i \ge d^1 \text{ with } l = \lfloor\frac{j}{n_s^1}\rfloor
+              \end{cases}
+
+Intuitively, this means that a position embedding vector :math:`x_j \in \mathbb{R}^{d}` is now the composition of two factorized embedding vectors: :math:`x^1_{k, l} + x^2_{l, k}`, where the ``config.max_embedding_size`` dimension :math:`j` is factorized into :math:`k \text{ and } l`.
+This design ensures that each position embedding vector :math:`x_j` is unique.
+
+Using the above example again, axial position encoding with :math:`d^1 = 2^5, d^2 = 2^5, n_s^1 = 2^9, n_s^2 = 2^{10}` can drastically reduce the number of parameters to :math:`2^{14} + 2^{15} \approx 49000` parameters.
+
+In practice, the parameter ``config.axial_pos_embds_dim`` is set to ``list``:math:`(d^1, d^2)` whose sum has to be equal to ``config.hidden_size`` and ``config.axial_pos_shape`` is set to ``list``:math:`(n_s^1, n_s^2)` whose product has to be equal to ``config.max_embedding_size`` which during training has to be equal to the ``sequence length`` of the ``input_ids``.
+
+
+
+LSH Self Attention
+~~~~~~~~~~~~~~~~~~~~
+In Locality sensitive hashing (LSH) self attention the key and query projection weights are tied. Therefore, the key query embedding vectors are also tied.
+LSH self attention uses the locality sensitive 
+hashing mechanism proposed in `Practical and Optimal LSH for Angular Distance <https://arxiv.org/abs/1509.02897>`_ to assign each of the tied key query embedding vectors to one of ``config.num_buckets`` possible buckets. The premise is that the more "similar" key query embedding vectors (in terms of *cosine similarity*) are to each other, the more likely they are assigned to the same bucket. 
+The accuracy of the LSH mechanism can be improved by increasing ``config.num_hashes`` or directly the argument ``num_hashes`` of the forward function so that the output of the LSH self attention better approximates the output of the "normal" full self attention.
+The buckets are then sorted and chunked into query key embedding vector chunks each of length ``config.lsh_chunk_length``. For each chunk, the query embedding vectors attend to its key vectors (which are tied to themselves) and to the key embedding vectors of ``config.lsh_num_chunks_before`` previous neighboring chunks and ``config.lsh_num_chunks_after`` following neighboring chunks.
+For more information, see the `original Paper <https://arxiv.org/abs/2001.04451>`_ or this great `blog post <https://www.pragmatic.ml/reformer-deep-dive/>`_.
+
+Note that ``config.num_buckets`` can also be factorized into a ``list``:math:`(n_{\text{buckets}}^1, n_{\text{buckets}}^2)`. This way instead of assigning the query key embedding vectors to one of :math:`(1,\ldots, n_{\text{buckets}})` they are assigned to one of :math:`(1-1,\ldots, n_{\text{buckets}}^1-1, \ldots, 1-n_{\text{buckets}}^2, \ldots, n_{\text{buckets}}^1-n_{\text{buckets}}^2)`. This is crucial for very long sequences to save memory.
+
+When training a model from scratch, it is recommended to leave ``config.num_buckets=None``, so that depending on the sequence length a good value for ``num_buckets`` is calculated on the fly. This value will then automatically be saved in the config and should be reused for inference.
+
+Using LSH self attention, the memory and time complexity of the query-key matmul operation can be reduced from :math:`\mathcal{O}(n_s \times n_s)` to :math:`\mathcal{O}(n_s \times \log(n_s))`, which usually represents the memory and time bottleneck in a transformer model, with :math:`n_s` being the sequence length.
+
+
+Local Self Attention
+~~~~~~~~~~~~~~~~~~~~
+Local self attention is essentially a "normal" self attention layer with 
+key, query and value projections, but is chunked so that in each chunk of length ``config.local_chunk_length`` the query embedding vectors only attend to the key embedding vectors in their chunk and to the key embedding vectors of ``config.local_num_chunks_before`` previous neighboring chunks and ``config.local_num_chunks_after`` following neighboring chunks.
+
+Using Local self attention, the memory and time complexity of the query-key matmul operation can be reduced from :math:`\mathcal{O}(n_s \times n_s)` to :math:`\mathcal{O}(n_s \times \log(n_s))`, which usually represents the memory and time bottleneck in a transformer model, with :math:`n_s` being the sequence length.
+
+
+Training
+~~~~~~~~~~~~~~~~~~~~
+During training, we must ensure that the sequence length is set to a value that can be divided by the least common multiple of ``config.lsh_chunk_length`` and ``config.local_chunk_length`` and that the parameters of the Axial Positional Encodings are correctly set as described above. Reformer is very memory efficient so that the model can easily be trained on sequences as long as 64000 tokens.
+For training, the ``ReformerModelWithLMHead`` should be used as follows: 
+
+::
+
+  input_ids = tokenizer.encode('This is a sentence from the training data', return_tensors='pt')
+  loss = model(input_ids, labels=input_ids)[0]
+
+
+ReformerConfig
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ReformerConfig
+    :members:
+
+
+ReformerTokenizer
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ReformerTokenizer
+    :members: 
+
+
+ReformerModel
+~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ReformerModel
+    :members:
+
+
+ReformerModelWithLMHead
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ReformerModelWithLMHead
+    :members:
--- a/docs/source/model_doc/roberta.rst
+++ b/docs/source/model_doc/roberta.rst
@@ -23,8 +23,14 @@ Tips:

 - This implementation is the same as :class:`~transformers.BertModel` with a tiny embeddings tweak as well as a
  setup for Roberta pretrained models.
+- RoBERTa has the same architecture as BERT, but uses a byte-level BPE as a tokenizer (same as GPT-2) and uses a
+  different pre-training scheme.
+- RoBERTa doesn't have `token_type_ids`, you don't need to indicate which token belongs to which segment. Just separate your segments with the separation token `tokenizer.sep_token` (or `</s>`)
 - `Camembert <./camembert.html>`__ is a wrapper around RoBERTa. Refer to this page for usage examples.

+The original code can be found `here <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`_.
+
+
 RobertaConfig
 ~~~~~~~~~~~~~~~~~~~~~

@@ -36,7 +42,15 @@ RobertaTokenizer
 ~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.RobertaTokenizer
-    :members:
+    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences, save_vocabulary
+
+
+RobertaTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.RobertaTokenizerFast
+    :members: build_inputs_with_special_tokens


 RobertaModel
@@ -60,6 +74,13 @@ RobertaForSequenceClassification
    :members:


+RobertaForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.RobertaForMultipleChoice
+    :members:
+
+
 RobertaForTokenClassification
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_doc/t5.rst
+++ b/docs/source/model_doc/t5.rst
@@ -0,0 +1,105 @@
+T5
+----------------------------------------------------
+**DISCLAIMER:** This model is still a work in progress, if you see something strange,
+file a `Github Issue <https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title>`_
+
+Overview
+~~~~~~~~
+The T5 model was presented in `Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer <https://arxiv.org/pdf/1910.10683.pdf>`_ by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu.
+Here is the abstract:
+
+*Transfer learning, where a model is first pre-trained on a data-rich task before being fine-tuned on a downstream task, has emerged as a powerful technique in natural language processing (NLP). The effectiveness of transfer learning has given rise to a diversity of approaches, methodology, and practice. 
+In this paper, we explore the landscape of transfer learning techniques for NLP by introducing a unified framework that converts every language problem into a text-to-text format. 
+Our systematic study compares pre-training objectives, architectures, unlabeled datasets, transfer approaches, and other factors on dozens of language understanding tasks. 
+By combining the insights from our exploration with scale and our new "Colossal Clean Crawled Corpus", we achieve state-of-the-art results on many benchmarks covering summarization, question answering, text classification, and more. 
+To facilitate future work on transfer learning for NLP, we release our dataset, pre-trained models, and code.*
+
+The Authors' code can be found `here <https://github.com/google-research/text-to-text-transfer-transformer>`_ .
+
+Training
+~~~~~~~~~~~~~~~~~~~~
+T5 is an encoder-decoder model and converts all NLP problems into a text-to-text format. It is trained using teacher forcing.
+This means that for training we always need an input sequence and a target sequence. 
+The input sequence is fed to the model using ``input_ids``. The target sequence is shifted to the right, *i.e.* prepended by a start-sequence token and fed to the decoder using the `decoder_input_ids`. In teacher-forcing style, the target sequence is then appended by the EOS token and corresponds to the ``lm_labels``. The PAD token is hereby used as the start-sequence token.
+T5 can be trained / fine-tuned both in a supervised and unsupervised fashion.
+
+- Unsupervised denoising training
+
+  In this setup spans of the input sequence are masked by so-called sentinel tokens (*a.k.a* unique mask tokens) 
+  and the output sequence is formed as a concatenation of the same sentinel tokens and the *real* masked tokens. 
+  Each sentinel token represents a unique mask token for this sentence and should start with ``<extra_id_1>``, ``<extra_id_2>``, ... up to ``<extra_id_100>``. As a default 100 sentinel tokens are available in ``T5Tokenizer``.
+  *E.g.* the sentence "The cute dog walks in the park" with the masks put on "cute dog" and "the" should be processed as follows: 
+
+::
+
+  input_ids = tokenizer.encode('The <extra_id_1> walks in <extra_id_2> park', return_tensors='pt')
+  lm_labels = tokenizer.encode('<extra_id_1> cute dog <extra_id_2> the <extra_id_3> </s>', return_tensors='pt')
+  # the forward function automatically creates the correct decoder_input_ids
+  model(input_ids=input_ids, lm_labels=lm_labels)
+
+- Supervised training
+
+  In this setup the input sequence and output sequence are standard sequence to sequence input output mapping.
+  In translation, *e.g.* the input sequence "The house is wonderful." and output sequence "Das Haus ist wunderbar." should 
+  be processed as follows:
+  
+::
+
+  input_ids = tokenizer.encode('translate English to German: The house is wonderful. </s>', return_tensors='pt')
+  lm_labels = tokenizer.encode('Das Haus ist wunderbar. </s>', return_tensors='pt')
+  # the forward function automatically creates the correct decoder_input_ids
+  model(input_ids=input_ids, lm_labels=lm_labels)
+
+Tips
+~~~~~~~~~~~~~~~~~~~~
+- T5 is an encoder-decoder model pre-trained on a multi-task mixture of unsupervised 
+  and supervised tasks and for which each task is converted into a text-to-text format.
+  T5 works well on a variety of tasks out-of-the-box by prepending a different prefix to the input corresponding to each task, e.g.: for translation: *translate English to German: ..., summarize: ...*.
+  For more information about which prefix to use, it is easiest to look into Appendix D of the `paper <https://arxiv.org/pdf/1910.10683.pdf>`_ .
+- For sequence to sequence generation, it is recommended to use ``T5ForConditionalGeneration.generate()``. The method takes care of feeding the encoded input via cross-attention layers to the decoder and auto-regressively generates the decoder output.
+- T5 uses relative scalar embeddings. Encoder input padding can be done on the left and on the right.
+
+The original code can be found `here <https://github.com/google-research/text-to-text-transfer-transformer>`_.
+
+
+T5Config
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.T5Config
+    :members:
+
+
+T5Tokenizer
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.T5Tokenizer
+    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences, save_vocabulary
+
+
+T5Model
+~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.T5Model
+    :members:
+
+
+T5ForConditionalGeneration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.T5ForConditionalGeneration
+    :members:
+
+
+TFT5Model
+~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFT5Model
+    :members:
+
+
+TFT5ForConditionalGeneration
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFT5ForConditionalGeneration
+    :members:
--- a/docs/source/model_doc/transformerxl.rst
+++ b/docs/source/model_doc/transformerxl.rst
@@ -30,6 +30,8 @@ Tips:
  The original implementation trains on SQuAD with padding on the left, therefore the padding defaults are set to left.
 - Transformer-XL is one of the few models that has no sequence length limit.

+The original code can be found `here <https://github.com/kimiyoung/transformer-xl>`_.
+

 TransfoXLConfig
 ~~~~~~~~~~~~~~~~~~~~~
@@ -42,6 +44,13 @@ TransfoXLTokenizer
 ~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.TransfoXLTokenizer
+    :members: save_vocabulary
+
+
+TransfoXLTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TransfoXLTokenizerFast
    :members:


--- a/docs/source/model_doc/xlm.rst
+++ b/docs/source/model_doc/xlm.rst
@@ -30,6 +30,8 @@ Tips:
 - XLM has multilingual checkpoints which leverage a specific `lang` parameter. Check out the
  `multi-lingual <../multilingual.html>`__ page for more information.

+The original code can be found `here <https://github.com/facebookresearch/XLM/>`_.
+

 XLMConfig
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -41,7 +43,8 @@ XLMTokenizer
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.XLMTokenizer
-    :members:
+    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences, save_vocabulary

 XLMModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--- a/docs/source/model_doc/xlmroberta.rst
+++ b/docs/source/model_doc/xlmroberta.rst
@@ -22,9 +22,15 @@ and XNLI benchmarks. We will make XLM-R code, data, and models publicly availabl

 Tips:

+- XLM-R is a multilingual model trained on 100 different languages. Unlike some XLM multilingual models, it does
+  not require `lang` tensors to understand which language is used, and should be able to determine the correct
+  language from the input ids.
 - This implementation is the same as RoBERTa. Refer to the `documentation of RoBERTa <./roberta.html>`__ for usage
  examples as well as the information relative to the inputs and outputs.

+The original code can be found `here <https://github.com/pytorch/fairseq/tree/master/examples/xlmr>`_.
+
+
 XLMRobertaConfig
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -36,7 +42,8 @@ XLMRobertaTokenizer
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.XLMRobertaTokenizer
-    :members:
+    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences, save_vocabulary


 XLMRobertaModel
--- a/docs/source/model_doc/xlnet.rst
+++ b/docs/source/model_doc/xlnet.rst
@@ -29,9 +29,11 @@ Tips:
  XLNet is pretrained using only a sub-set of the output tokens as target which are selected
  with the `target_mapping` input.
 - To use XLNet for sequential decoding (i.e. not in fully bi-directional setting), use the `perm_mask` and
-  `target_mapping` inputs to control the attention span and outputs (see examples in `examples/run_generation.py`)
+  `target_mapping` inputs to control the attention span and outputs (see examples in `examples/text-generation/run_generation.py`)
 - XLNet is one of the few models that has no sequence length limit.

+The original code can be found `here <https://github.com/zihangdai/xlnet/>`_.
+

 XLNetConfig
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -44,7 +46,8 @@ XLNetTokenizer
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.XLNetTokenizer
-    :members:
+    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences, save_vocabulary


 XLNetModel
--- a/docs/source/model_sharing.md
+++ b/docs/source/model_sharing.md
@@ -2,7 +2,7 @@

 Starting with `v2.2.2`, you can now upload and share your fine-tuned models with the community, using the <abbr title="Command-line interface">CLI</abbr> that's built-in to the library.

-**First, create an account on [https://huggingface.co/join](https://huggingface.co/join)**. Then:
+**First, create an account on [https://huggingface.co/join](https://huggingface.co/join)**. Optionally, join an existing organization or create a new one. Then:

 ```shell
 transformers-cli login
@@ -21,25 +21,35 @@ transformers-cli upload ./config.json [--filename folder/foobar.json]
 # (you can optionally override its filename, which can be nested inside a folder)
 ```

-Your model will then be accessible through its identifier, a concatenation of your username and the folder name above:
+If you want your model to be namespaced by your organization name rather than your username, add the following flag to any command:
+```shell
+--organization organization_name
+```
+
+Your model will then be accessible through its identifier, a concatenation of your username (or organization name) and the folder name above:
 ```python
 "username/pretrained_model"
+# or if an org:
+"organization_name/pretrained_model"
 ```

+**Please add a README.md model card** to the repo under `model_cards/` with: model description, training params (dataset, preprocessing, hardware used, hyperparameters), evaluation results, intended uses & limitations, etc.
+
+Your model now has a page on huggingface.co/models 🔥
+
 Anyone can load it from code:
 ```python
-tokenizer = AutoTokenizer.from_pretrained("username/pretrained_model")
-model = AutoModel.from_pretrained("username/pretrained_model")
+tokenizer = AutoTokenizer.from_pretrained("namespace/pretrained_model")
+model = AutoModel.from_pretrained("namespace/pretrained_model")
 ```

-Finally, list all your files on S3:
+List all your files on S3:
 ```shell
 transformers-cli s3 ls
-# List all your S3 objects.
 ```

-You can also delete files:
+You can also delete unneeded files:

 ```shell
 transformers-cli s3 rm …
-```
+```
--- a/docs/source/multilingual.rst
+++ b/docs/source/multilingual.rst
@@ -47,6 +47,7 @@ The different languages this model/tokenizer handles, as well as the ids of thes

 .. code-block::

+    # Continuation of the previous script
    print(tokenizer.lang2id)  # {'en': 0, 'fr': 1}


@@ -54,6 +55,7 @@ These ids should be used when passing a language parameter during a model pass.

 .. code-block::

+    # Continuation of the previous script
    input_ids = torch.tensor([tokenizer.encode("Wikipedia was used to")]) # batch size of 1


@@ -62,6 +64,7 @@ filled with the appropriate language ids, of the same size as input_ids. For eng

 .. code-block::

+    # Continuation of the previous script
    language_id = tokenizer.lang2id['en']  # 0
    langs = torch.tensor([language_id] * input_ids.shape[1])  # torch.tensor([0, 0, 0, ..., 0])

@@ -73,10 +76,11 @@ You can then feed it all as input to your model:

 .. code-block::

+    # Continuation of the previous script
    outputs = model(input_ids, langs=langs)


-The example `run_generation.py <https://github.com/huggingface/transformers/blob/master/examples/run_generation.py>`__
+The example `run_generation.py <https://github.com/huggingface/transformers/blob/master/examples/text-generation/run_generation.py>`__
 can generate text using the CLM checkpoints from XLM, using the language embeddings.

 XLM without Language Embeddings
@@ -100,4 +104,16 @@ BERT has two checkpoints that can be used for multi-lingual tasks:
 - ``bert-base-multilingual-cased`` (Masked language modeling + Next sentence prediction, 104 languages)

 These checkpoints do not require language embeddings at inference time. They should identify the language
-used in the context and infer accordingly.
+used in the context and infer accordingly.
+
+XLM-RoBERTa
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+XLM-RoBERTa was trained on 2.5TB of newly created clean CommonCrawl data in 100 languages. It provides strong
+gains over previously released multi-lingual models like mBERT or XLM on downstream tasks like classification,
+sequence labeling and question answering.
+
+Two XLM-RoBERTa checkpoints can be used for multi-lingual tasks:
+
+- ``xlm-roberta-base`` (Masked language modeling, 100 languages)
+- ``xlm-roberta-large`` (Masked language modeling, 100 languages)
--- a/docs/source/notebooks.md
+++ b/docs/source/notebooks.md
@@ -0,0 +1 @@
+../../notebooks/README.md
--- a/docs/source/notebooks.rst
+++ b/docs/source/notebooks.rst
@@ -1,16 +0,0 @@
-Notebooks
-================================================
-
-We include `three Jupyter Notebooks <https://github.com/huggingface/transformers/tree/master/notebooks>`_ that can be used to check that the predictions of the PyTorch model are identical to the predictions of the original TensorFlow model.
-
-
-*
-  The first NoteBook (\ `Comparing-TF-and-PT-models.ipynb <https://github.com/huggingface/transformers/blob/master/notebooks/Comparing-TF-and-PT-models.ipynb>`_\ ) extracts the hidden states of a full sequence on each layers of the TensorFlow and the PyTorch models and computes the standard deviation between them. In the given example, we get a standard deviation of 1.5e-7 to 9e-7 on the various hidden state of the models.
-
-*
-  The second NoteBook (\ `Comparing-TF-and-PT-models-SQuAD.ipynb <https://github.com/huggingface/transformers/blob/master/notebooks/Comparing-TF-and-PT-models-SQuAD.ipynb>`_\ ) compares the loss computed by the TensorFlow and the PyTorch models for identical initialization of the fine-tuning layer of the ``BertForQuestionAnswering`` and computes the standard deviation between them. In the given example, we get a standard deviation of 2.5e-7 between the models.
-
-*
-  The third NoteBook (\ `Comparing-TF-and-PT-models-MLM-NSP.ipynb <https://github.com/huggingface/transformers/blob/master/notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb>`_\ ) compares the predictions computed by the TensorFlow and the PyTorch models for masked token language modeling using the pre-trained masked language modeling model.
-
-Please follow the instructions given in the notebooks to run and modify them.
--- a/docs/source/pretrained_models.rst
+++ b/docs/source/pretrained_models.rst
@@ -63,33 +63,33 @@ For a list that includes community-uploaded models, refer to `https://huggingfac
 |                   |                                                            | | Trained on uncased German text by DBMDZ                                                                                             |
 |                   |                                                            | (see `details on dbmdz repository <https://github.com/dbmdz/german-bert>`__).                                                         |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-base-japanese``                                     | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                   | ``cl-tohoku/bert-base-japanese``                           | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
 |                   |                                                            | | Trained on Japanese text. Text is tokenized with MeCab and WordPiece.                                                               |
 |                   |                                                            | | `MeCab <https://taku910.github.io/mecab/>`__ is required for tokenization.                                                          |
 |                   |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                               |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-base-japanese-whole-word-masking``                  | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                   | ``cl-tohoku/bert-base-japanese-whole-word-masking``        | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
 |                   |                                                            | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized with MeCab and WordPiece.                                      |
 |                   |                                                            | | `MeCab <https://taku910.github.io/mecab/>`__ is required for tokenization.                                                          |
 |                   |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                               |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-base-japanese-char``                                | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                   | ``cl-tohoku/bert-base-japanese-char``                      | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
 |                   |                                                            | | Trained on Japanese text. Text is tokenized into characters.                                                                        |
 |                   |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                               |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-base-japanese-char-whole-word-masking``             | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                   | ``cl-tohoku/bert-base-japanese-char-whole-word-masking``   | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
 |                   |                                                            | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized into characters.                                               |
 |                   |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                               |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-base-finnish-cased-v1``                             | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                   | ``TurkuNLP/bert-base-finnish-cased-v1``                    | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
 |                   |                                                            | | Trained on cased Finnish text.                                                                                                      |
 |                   |                                                            | (see `details on turkunlp.org <http://turkunlp.org/FinBERT/>`__).                                                                     |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-base-finnish-uncased-v1``                           | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                   | ``TurkuNLP/bert-base-finnish-uncased-v1``                  | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
 |                   |                                                            | | Trained on uncased Finnish text.                                                                                                    |
 |                   |                                                            | (see `details on turkunlp.org <http://turkunlp.org/FinBERT/>`__).                                                                     |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-base-dutch-cased``                                  | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                   | ``wietsedv/bert-base-dutch-cased``                         | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
 |                   |                                                            | | Trained on cased Dutch text.                                                                                                        |
 |                   |                                                            | (see `details on wietsedv repository <https://github.com/wietsedv/bertje/>`__).                                                       |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
@@ -179,6 +179,14 @@ For a list that includes community-uploaded models, refer to `https://huggingfac
 |                   |                                                            | | The DistilBERT model distilled from the BERT model `bert-base-uncased` checkpoint, with an additional linear layer.                 |
 |                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``distilbert-base-cased``                                  | | 6-layer, 768-hidden, 12-heads, 65M parameters                                                                                       |
+|                   |                                                            | | The DistilBERT model distilled from the BERT model `bert-base-cased` checkpoint                                                     |
+|                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``distilbert-base-cased-distilled-squad``                  | | 6-layer, 768-hidden, 12-heads, 65M parameters                                                                                       |
+|                   |                                                            | | The DistilBERT model distilled from the BERT model `bert-base-cased` checkpoint, with an additional question answering layer.       |
+|                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``distilgpt2``                                             | | 6-layer, 768-hidden, 12-heads, 82M parameters                                                                                       |
 |                   |                                                            | | The DistilGPT2 model distilled from the GPT2 model `gpt2` checkpoint.                                                               |
 |                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
@@ -251,22 +259,55 @@ For a list that includes community-uploaded models, refer to `https://huggingfac
 |                   | ``xlm-roberta-large``                                      | | ~355M parameters with 24-layers, 1027-hidden-state, 4096 feed-forward hidden-state, 16-heads,                                       |
 |                   |                                                            | | Trained on 2.5 TB of newly created clean CommonCrawl data in 100 languages                                                          |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| FlauBERT          | ``flaubert-small-cased``                                   | | 6-layer, 512-hidden, 8-heads, 54M parameters                                                                                        |
+| FlauBERT          | ``flaubert/flaubert_small_cased``                          | | 6-layer, 512-hidden, 8-heads, 54M parameters                                                                                        |
 |                   |                                                            | | FlauBERT small architecture                                                                                                         |
 |                   |                                                            | (see `details <https://github.com/getalp/Flaubert>`__)                                                                                |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``flaubert-base-uncased``                                  | | 12-layer, 768-hidden, 12-heads, 137M parameters                                                                                     |
+|                   | ``flaubert/flaubert_base_uncased``                         | | 12-layer, 768-hidden, 12-heads, 137M parameters                                                                                     |
 |                   |                                                            | | FlauBERT base architecture with uncased vocabulary                                                                                  |
 |                   |                                                            | (see `details <https://github.com/getalp/Flaubert>`__)                                                                                |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``flaubert-base-cased``                                    | | 12-layer, 768-hidden, 12-heads, 138M parameters                                                                                     |
+|                   | ``flaubert/flaubert_base_cased``                           | | 12-layer, 768-hidden, 12-heads, 138M parameters                                                                                     |
 |                   |                                                            | | FlauBERT base architecture with cased vocabulary                                                                                    |
 |                   |                                                            | (see `details <https://github.com/getalp/Flaubert>`__)                                                                                |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``flaubert-large-cased``                                   | | 24-layer, 1024-hidden, 16-heads, 373M parameters                                                                                    |
+|                   | ``flaubert/flaubert_large_cased``                          | | 24-layer, 1024-hidden, 16-heads, 373M parameters                                                                                    |
 |                   |                                                            | | FlauBERT large architecture                                                                                                         |
 |                   |                                                            | (see `details <https://github.com/getalp/Flaubert>`__)                                                                                |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-
-
-.. <https://huggingface.co/transformers/examples.html>`__
+| Bart              | ``facebook/bart-large``                                    | | 24-layer, 1024-hidden, 16-heads, 406M parameters                                                                                    |
+|                   |                                                            | (see `details <https://github.com/pytorch/fairseq/tree/master/examples/bart>`__)                                                      |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``facebook/bart-large-mnli``                               | | Adds a 2 layer classification head with 1 million parameters                                                                        |
+|                   |                                                            | | bart-large base architecture with a classification head, finetuned on MNLI                                                          |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``facebook/bart-large-cnn``                                | | 24-layer, 1024-hidden, 16-heads, 406M parameters       (same as base)                                                               |
+|                   |                                                            | | bart-large base architecture finetuned on cnn summarization task                                                                    |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``facebook/mbart-large-en-ro``                             | | 12-layer, 1024-hidden, 16-heads, 880M parameters                                                                                    |
+|                   |                                                            | | bart-large architecture pretrained on cc25 multilingual data, finetuned on WMT English-Romanian translation.                        |
++-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+| DialoGPT          | ``DialoGPT-small``                                         | | 12-layer, 768-hidden, 12-heads, 124M parameters                                                                                     |
+|                   |                                                            | | Trained on English text: 147M conversation-like exchanges extracted from Reddit.                                                    |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``DialoGPT-medium``                                        | | 24-layer, 1024-hidden, 16-heads, 355M parameters                                                                                    |
+|                   |                                                            | | Trained on English text: 147M conversation-like exchanges extracted from Reddit.                                                    |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``DialoGPT-large``                                         | | 36-layer, 1280-hidden, 20-heads, 774M parameters                                                                                    |
+|                   |                                                            | | Trained on English text: 147M conversation-like exchanges extracted from Reddit.                                                    |
++-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+| Reformer          | ``reformer-enwik8``                                        | | 12-layer, 1024-hidden, 8-heads, 149M parameters                                                                                     |
+|                   |                                                            | | Trained on English Wikipedia data - enwik8.                                                                                         |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``reformer-crime-and-punishment``                          | | 6-layer, 256-hidden, 2-heads, 3M parameters                                                                                         |
+|                   |                                                            | | Trained on English text: Crime and Punishment novel by Fyodor Dostoyevsky.                                                          |
++-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+| MarianMT          | ``Helsinki-NLP/opus-mt-{src}-{tgt}``                       | | 12-layer, 512-hidden, 8-heads, ~74M parameter Machine translation models. Parameter counts vary depending on vocab size.            |
+|                   |                                                            | | (see `model list <https://huggingface.co/Helsinki-NLP>`_)                                                                           |
++-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+| Longformer        | ``allenai/longformer-base-4096``                           | | 12-layer, 768-hidden, 12-heads, ~149M parameters                                                                                    |
+|                   |                                                            | | Starting from RoBERTa-base checkpoint, trained on documents of max length 4,096                                                     |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``allenai/longformer-large-4096``                          | | 24-layer, 1024-hidden, 16-heads, ~435M parameters                                                                                   |
+|                   |                                                            | | Starting from RoBERTa-large checkpoint, trained on documents of max length 4,096                                                    |
++-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
--- a/docs/source/quickstart.md
+++ b/docs/source/quickstart.md
@@ -8,7 +8,7 @@ The library was designed with two strong goals in mind:

 - be as easy and fast to use as possible:

-  - we strongly limited the number of user-facing abstractions to learn, in fact there are almost no abstractions, just three standard classes required to use each model: configuration, models and tokenizer,
+  - we strongly limited the number of user-facing abstractions to learn, in fact, there are almost no abstractions, just three standard classes required to use each model: configuration, models and tokenizer,
  - all of these classes can be initialized in a simple and unified way from pretrained instances by using a common `from_pretrained()` instantiation method which will take care of downloading (if needed), caching and loading the related class from a pretrained instance supplied in the library or your own saved instance.
  - as a consequence, this library is NOT a modular toolbox of building blocks for neural nets. If you want to extend/build-upon the library, just use regular Python/PyTorch modules and inherit from the base classes of the library to reuse functionalities like model loading/saving.

@@ -31,27 +31,27 @@ A few other goals:

 ## Main concepts

-The library is build around three type of classes for each models:
+The library is built around three types of classes for each model:

- **model classes** which are PyTorch models (`torch.nn.Modules`) of the 8 models architectures currently provided in the library, e.g. `BertModel`
- **configuration classes** which store all the parameters required to build a model, e.g. `BertConfig`. You don't always need to instantiate these your-self, in particular if you are using a pretrained model without any modification, creating the model will automatically take care of instantiating the configuration (which is part of the model)
- **tokenizer classes** which store the vocabulary for each model and provide methods for encoding/decoding strings in list of token embeddings indices to be fed to a model, e.g. `BertTokenizer`
+- **model classes**  e.g., `BertModel` which are 20+ PyTorch models (`torch.nn.Modules`) that work with the pretrained weights provided in the library. In TF2, these are `tf.keras.Model`.
+- **configuration classes** which store all the parameters required to build a model, e.g., `BertConfig`. You don't always need to instantiate these yourself. In particular, if you are using a pretrained model without any modification, creating the model will automatically take care of instantiating the configuration (which is part of the model)
+- **tokenizer classes** which store the vocabulary for each model and provide methods for encoding/decoding strings in a list of token embeddings indices to be fed to a model, e.g., `BertTokenizer`

 All these classes can be instantiated from pretrained instances and saved locally using two methods:

 - `from_pretrained()` let you instantiate a model/configuration/tokenizer from a pretrained version either provided by the library itself (currently 27 models are provided as listed [here](https://huggingface.co/transformers/pretrained_models.html)) or stored locally (or on a server) by the user,
 - `save_pretrained()` let you save a model/configuration/tokenizer locally so that it can be reloaded using `from_pretrained()`.

-We'll finish this quickstart tour by going through a few simple quick-start examples to see how we can instantiate and use these classes. The rest of the documentation is organized in two parts:
+We'll finish this quickstart tour by going through a few simple quick-start examples to see how we can instantiate and use these classes. The rest of the documentation is organized into two parts:

 - the **MAIN CLASSES** section details the common functionalities/method/attributes of the three main type of classes (configuration, model, tokenizer) plus some optimization related classes provided as utilities for training,
- the **PACKAGE REFERENCE** section details all the variants of each class for each model architectures and in particular the input/output that you should expect when calling each of them.
+- the **PACKAGE REFERENCE** section details all the variants of each class for each model architecture and, in particular, the input/output that you should expect when calling each of them.

 ## Quick tour: Usage

 Here are two examples showcasing a few `Bert` and `GPT2` classes and pre-trained models.

-See full API reference for examples for each model class.
+See the full API reference for examples of each model class.

 ### BERT example

@@ -191,7 +191,7 @@ Examples for each model class of each model architecture (Bert, GPT, GPT-2, Tran

 #### Using the past

-GPT-2 as well as some other models (GPT, XLNet, Transfo-XL, CTRL) make use of a `past` or `mems` attribute which can be used to prevent re-computing the key/value pairs when using sequential decoding. It is useful when generating sequences as a big part of the attention mechanism benefits from previous computations.
+GPT-2, as well as some other models (GPT, XLNet, Transfo-XL, CTRL), make use of a `past` or `mems` attribute which can be used to prevent re-computing the key/value pairs when using sequential decoding. It is useful when generating sequences as a big part of the attention mechanism benefits from previous computations.

 Here is a fully-working example using the `past` with `GPT2LMHeadModel` and argmax decoding (which should only be used as an example, as argmax decoding introduces a lot of repetition):

@@ -209,7 +209,7 @@ past = None
 for i in range(100):
    print(i)
    output, past = model(context, past=past)
-    token = torch.argmax(output[0, :])
+    token = torch.argmax(output[..., -1, :])

    generated += [token.tolist()]
    context = token.unsqueeze(0)
@@ -220,96 +220,3 @@ print(sequence)
 ```

 The model only requires a single token as input as all the previous tokens' key/value pairs are contained in the `past`.
-
-### Model2Model example
-
-Encoder-decoder architectures require two tokenized inputs: one for the encoder and the other one for the decoder. Let's assume that we want to use `Model2Model` for generative question answering, and start by tokenizing the question and answer that will be fed to the model.
-
-```python
-import torch
-from transformers import BertTokenizer, Model2Model
-
-# OPTIONAL: if you want to have more information on what's happening under the hood, activate the logger as follows
-import logging
-logging.basicConfig(level=logging.INFO)
-
-# Load pre-trained model tokenizer (vocabulary)
-tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-
-# Encode the input to the encoder (the question)
-question = "Who was Jim Henson?"
-encoded_question = tokenizer.encode(question)
-
-# Encode the input to the decoder (the answer)
-answer = "Jim Henson was a puppeteer"
-encoded_answer = tokenizer.encode(answer)
-
-# Convert inputs to PyTorch tensors
-question_tensor = torch.tensor([encoded_question])
-answer_tensor = torch.tensor([encoded_answer])
-```
-
-Let's see how we can use `Model2Model` to get the value of the loss associated with this (question, answer) pair:
-
-```python
-# In order to compute the loss we need to provide language model
-# labels (the token ids that the model should have produced) to
-# the decoder.
-lm_labels =  encoded_answer
-labels_tensor = torch.tensor([lm_labels])
-
-# Load pre-trained model (weights)
-model = Model2Model.from_pretrained('bert-base-uncased')
-
-# Set the model in evaluation mode to deactivate the DropOut modules
-# This is IMPORTANT to have reproducible results during evaluation!
-model.eval()
-
-# If you have a GPU, put everything on cuda
-question_tensor = question_tensor.to('cuda')
-answer_tensor = answer_tensor.to('cuda')
-labels_tensor = labels_tensor.to('cuda')
-model.to('cuda')
-
-# Predict hidden states features for each layer
-with torch.no_grad():
-    # See the models docstrings for the detail of the inputs
-    outputs = model(question_tensor, answer_tensor, decoder_lm_labels=labels_tensor)
-    # Transformers models always output tuples.
-    # See the models docstrings for the detail of all the outputs
-    # In our case, the first element is the value of the LM loss 
-    lm_loss = outputs[0]
-```
-
-This loss can be used to fine-tune `Model2Model` on the question answering task. Assuming that we fine-tuned the model, let us now see how to generate an answer:
-
-```python
-# Let's re-use the previous question
-question = "Who was Jim Henson?"
-encoded_question = tokenizer.encode(question)
-question_tensor = torch.tensor([encoded_question])
-
-# This time we try to generate the answer, so we start with an empty sequence
-answer = "[CLS]"
-encoded_answer = tokenizer.encode(answer, add_special_tokens=False)
-answer_tensor = torch.tensor([encoded_answer])
-
-# Load pre-trained model (weights)
-model = Model2Model.from_pretrained('fine-tuned-weights')
-model.eval()
-
-# If you have a GPU, put everything on cuda
-question_tensor = encoded_question.to('cuda')
-answer_tensor = encoded_answer.to('cuda')
-model.to('cuda')
-
-# Predict all tokens
-with torch.no_grad():
-    outputs = model(question_tensor, answer_tensor)
-    predictions = outputs[0]
-
-# confirm we were able to predict 'jim'
-predicted_index = torch.argmax(predictions[0, -1]).item()
-predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
-assert predicted_token == 'jim'
-```
--- a/docs/source/serialization.rst
+++ b/docs/source/serialization.rst
@@ -58,14 +58,14 @@ where

 ``Uncased`` means that the text has been lowercased before WordPiece tokenization, e.g., ``John Smith`` becomes ``john smith``. The Uncased model also strips out any accent markers. ``Cased`` means that the true case and accent markers are preserved. Typically, the Uncased model is better unless you know that case information is important for your task (e.g., Named Entity Recognition or Part-of-Speech tagging). For information about the Multilingual and Chinese model, see the `Multilingual README <https://github.com/google-research/bert/blob/master/multilingual.md>`__ or the original TensorFlow repository.

-When using an ``uncased model``\ , make sure to pass ``--do_lower_case`` to the example training scripts (or pass ``do_lower_case=True`` to FullTokenizer if you're using your own script and loading the tokenizer your-self.).
+When using an ``uncased model``\ , make sure your tokenizer has ``do_lower_case=True`` (either in its configuration, or passed as an additional parameter).

 Examples:

 .. code-block:: python

   # BERT
-   tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True, do_basic_tokenize=True)
+   tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_basic_tokenize=True)
   model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

   # OpenAI GPT
@@ -140,13 +140,13 @@ Here is the recommended way of saving the model, configuration and vocabulary to

   torch.save(model_to_save.state_dict(), output_model_file)
   model_to_save.config.to_json_file(output_config_file)
-   tokenizer.save_vocabulary(output_dir)
+   tokenizer.save_pretrained(output_dir)

   # Step 2: Re-load the saved model and vocabulary

   # Example for a Bert model
   model = BertForQuestionAnswering.from_pretrained(output_dir)
-   tokenizer = BertTokenizer.from_pretrained(output_dir, do_lower_case=args.do_lower_case)  # Add specific options if needed
+   tokenizer = BertTokenizer.from_pretrained(output_dir)  # Add specific options if needed
   # Example for a GPT model
   model = OpenAIGPTDoubleHeadsModel.from_pretrained(output_dir)
   tokenizer = OpenAIGPTTokenizer.from_pretrained(output_dir)
--- a/docs/source/usage.rst
+++ b/docs/source/usage.rst
@@ -0,0 +1,829 @@
+Usage
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This page shows the most frequent use-cases when using the library. The models available allow for many different
+configurations and a great versatility in use-cases. The most simple ones are presented here, showcasing usage
+for tasks such as question answering, sequence classification, named entity recognition and others.
+
+These examples leverage auto-models, which are classes that will instantiate a model according to a given checkpoint,
+automatically selecting the correct model architecture. Please check the :class:`~transformers.AutoModel` documentation
+for more information.
+Feel free to modify the code to be more specific and adapt it to your specific use-case.
+
+In order for a model to perform well on a task, it must be loaded from a checkpoint corresponding to that task. These
+checkpoints are usually pre-trained on a large corpus of data and fine-tuned on a specific task. This means the
+following:
+
+- Not all models were fine-tuned on all tasks. If you want to fine-tune a model on a specific task, you can leverage
+  one of the `run_$TASK.py` scripts in the
+  `examples <https://github.com/huggingface/transformers/tree/master/examples>`_ directory.
+- Fine-tuned models were fine-tuned on a specific dataset. This dataset may or may not overlap with your use-case
+  and domain. As mentioned previously, you may leverage the
+  `examples <https://github.com/huggingface/transformers/tree/master/examples>`_ scripts to fine-tune your model, or you
+  may create your own training script.
+
+In order to do an inference on a task, several mechanisms are made available by the library:
+
+- Pipelines: very easy-to-use abstractions, which require as little as two lines of code.
+- Using a model directly with a tokenizer (PyTorch/TensorFlow): the full inference using the model. Less abstraction,
+  but much more powerful.
+
+Both approaches are showcased here.
+
+.. note::
+
+    All tasks presented here leverage pre-trained checkpoints that were fine-tuned on specific tasks. Loading a
+    checkpoint that was not fine-tuned on a specific task would load only the base transformer layers and not the
+    additional head that is used for the task, initializing the weights of that head randomly.
+
+    This would produce random output.
+
+Sequence Classification
+--------------------------
+
+Sequence classification is the task of classifying sequences according to a given number of classes. An example
+of sequence classification is the GLUE dataset, which is entirely based on that task. If you would like to fine-tune
+a model on a GLUE sequence classification task, you may leverage the
+`run_glue.py <https://github.com/huggingface/transformers/tree/master/examples/text-classification/run_glue.py>`_ or
+`run_tf_glue.py <https://github.com/huggingface/transformers/tree/master/examples/text-classification/run_tf_glue.py>`_ scripts.
+
+Here is an example using the pipelines to do sentiment analysis: identifying if a sequence is positive or negative.
+It leverages a fine-tuned model on sst2, which is a GLUE task.
+
+::
+
+    from transformers import pipeline
+
+    nlp = pipeline("sentiment-analysis")
+
+    print(nlp("I hate you"))
+    print(nlp("I love you"))
+
+This returns a label ("POSITIVE" or "NEGATIVE") alongside a score, as follows:
+
+::
+
+    [{'label': 'NEGATIVE', 'score': 0.9991129}]
+    [{'label': 'POSITIVE', 'score': 0.99986565}]
+
+
+Here is an example of doing a sequence classification using a model to determine if two sequences are paraphrases
+of each other. The process is the following:
+
+- Instantiate a tokenizer and a model from the checkpoint name. The model is identified as a BERT model and is loaded
+  with the weights stored in the checkpoint.
+- Build a sequence from the two sentences, with the correct model-specific separators, token type ids
+  and attention masks (:func:`~transformers.PreTrainedTokenizer.encode` and
+  :func:`~transformers.PreTrainedTokenizer.encode_plus` take care of this)
+- Pass this sequence through the model so that it is classified in one of the two available classes: 0
+  (not a paraphrase) and 1 (is a paraphrase)
+- Compute the softmax of the result to get probabilities over the classes
+- Print the results
+
+::
+
+    ## PYTORCH CODE
+    from transformers import AutoTokenizer, AutoModelForSequenceClassification
+    import torch
+
+    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
+    model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc")
+
+    classes = ["not paraphrase", "is paraphrase"]
+
+    sequence_0 = "The company HuggingFace is based in New York City"
+    sequence_1 = "Apples are especially bad for your health"
+    sequence_2 = "HuggingFace's headquarters are situated in Manhattan"
+
+    paraphrase = tokenizer.encode_plus(sequence_0, sequence_2, return_tensors="pt")
+    not_paraphrase = tokenizer.encode_plus(sequence_0, sequence_1, return_tensors="pt")
+
+    paraphrase_classification_logits = model(**paraphrase)[0]
+    not_paraphrase_classification_logits = model(**not_paraphrase)[0]
+
+    paraphrase_results = torch.softmax(paraphrase_classification_logits, dim=1).tolist()[0]
+    not_paraphrase_results = torch.softmax(not_paraphrase_classification_logits, dim=1).tolist()[0]
+
+    print("Should be paraphrase")
+    for i in range(len(classes)):
+        print(f"{classes[i]}: {round(paraphrase_results[i] * 100)}%")
+
+    print("\nShould not be paraphrase")
+    for i in range(len(classes)):
+        print(f"{classes[i]}: {round(not_paraphrase_results[i] * 100)}%")
+    ## TENSORFLOW CODE
+    from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
+    import tensorflow as tf
+
+    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
+    model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc")
+
+    classes = ["not paraphrase", "is paraphrase"]
+
+    sequence_0 = "The company HuggingFace is based in New York City"
+    sequence_1 = "Apples are especially bad for your health"
+    sequence_2 = "HuggingFace's headquarters are situated in Manhattan"
+
+    paraphrase = tokenizer.encode_plus(sequence_0, sequence_2, return_tensors="tf")
+    not_paraphrase = tokenizer.encode_plus(sequence_0, sequence_1, return_tensors="tf")
+
+    paraphrase_classification_logits = model(paraphrase)[0]
+    not_paraphrase_classification_logits = model(not_paraphrase)[0]
+
+    paraphrase_results = tf.nn.softmax(paraphrase_classification_logits, axis=1).numpy()[0]
+    not_paraphrase_results = tf.nn.softmax(not_paraphrase_classification_logits, axis=1).numpy()[0]
+
+    print("Should be paraphrase")
+    for i in range(len(classes)):
+        print(f"{classes[i]}: {round(paraphrase_results[i] * 100)}%")
+
+    print("\nShould not be paraphrase")
+    for i in range(len(classes)):
+        print(f"{classes[i]}: {round(not_paraphrase_results[i] * 100)}%")
+
+This outputs the following results:
+
+::
+
+    Should be paraphrase
+    not paraphrase: 10%
+    is paraphrase: 90%
+
+    Should not be paraphrase
+    not paraphrase: 94%
+    is paraphrase: 6%
+
+Extractive Question Answering
+----------------------------------------------------
+
+Extractive Question Answering is the task of extracting an answer from a text given a question. An example of a
+question answering dataset is the SQuAD dataset, which is entirely based on that task. If you would like to fine-tune
+a model on a SQuAD task, you may leverage the `run_squad.py`.
+
+Here is an example using the pipelines to do question answering: extracting an answer from a text given a question.
+It leverages a fine-tuned model on SQuAD.
+
+::
+
+    from transformers import pipeline
+
+    nlp = pipeline("question-answering")
+
+    context = r"""
+    Extractive Question Answering is the task of extracting an answer from a text given a question. An example of a
+    question answering dataset is the SQuAD dataset, which is entirely based on that task. If you would like to fine-tune
+    a model on a SQuAD task, you may leverage the `run_squad.py`.
+    """
+
+    print(nlp(question="What is extractive question answering?", context=context))
+    print(nlp(question="What is a good example of a question answering dataset?", context=context))
+
+This returns an answer extracted from the text, a confidence score, alongside "start" and "end" values which
+are the positions of the extracted answer in the text.
+
+::
+
+    {'score': 0.622232091629833, 'start': 34, 'end': 96, 'answer': 'the task of extracting an answer from a text given a question.'}
+    {'score': 0.5115299158662765, 'start': 147, 'end': 161, 'answer': 'SQuAD dataset,'}
+
+
+Here is an example of question answering using a model and a tokenizer. The process is the following:
+
+- Instantiate a tokenizer and a model from the checkpoint name. The model is identified as a BERT model and is loaded
+  with the weights stored in the checkpoint.
+- Define a text and a few questions.
+- Iterate over the questions and build a sequence from the text and the current question, with the correct
+  model-specific separators, token type ids and attention masks
+- Pass this sequence through the model. This outputs a range of scores across the entire sequence tokens (question and
+  text), for both the start and end positions.
+- Compute the softmax of the result to get probabilities over the tokens
+- Fetch the tokens from the identified start and stop values, convert those tokens to a string.
+- Print the results
+
+::
+
+    ## PYTORCH CODE
+    from transformers import AutoTokenizer, AutoModelForQuestionAnswering
+    import torch
+
+    tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
+    model = AutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
+
+    text = r"""
+    🤗 Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose
+    architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural
+    Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between
+    TensorFlow 2.0 and PyTorch.
+    """
+
+    questions = [
+        "How many pretrained models are available in Transformers?",
+        "What does Transformers provide?",
+        "Transformers provides interoperability between which frameworks?",
+    ]
+
+    for question in questions:
+        inputs = tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors="pt")
+        input_ids = inputs["input_ids"].tolist()[0]
+
+        text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
+        answer_start_scores, answer_end_scores = model(**inputs)
+
+        answer_start = torch.argmax(
+            answer_start_scores
+        )  # Get the most likely beginning of answer with the argmax of the score
+        answer_end = torch.argmax(answer_end_scores) + 1  # Get the most likely end of answer with the argmax of the score
+
+        answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
+
+        print(f"Question: {question}")
+        print(f"Answer: {answer}\n")
+    ## TENSORFLOW CODE
+    from transformers import AutoTokenizer, TFAutoModelForQuestionAnswering
+    import tensorflow as tf
+
+    tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
+    model = TFAutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
+
+    text = r"""
+    🤗 Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose
+    architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural
+    Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between
+    TensorFlow 2.0 and PyTorch.
+    """
+
+    questions = [
+        "How many pretrained models are available in Transformers?",
+        "What does Transformers provide?",
+        "Transformers provides interoperability between which frameworks?",
+    ]
+
+    for question in questions:
+        inputs = tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors="tf")
+        input_ids = inputs["input_ids"].numpy()[0]
+
+        text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
+        answer_start_scores, answer_end_scores = model(inputs)
+
+        answer_start = tf.argmax(
+            answer_start_scores, axis=1
+        ).numpy()[0]  # Get the most likely beginning of answer with the argmax of the score
+        answer_end = (
+            tf.argmax(answer_end_scores, axis=1) + 1
+        ).numpy()[0]  # Get the most likely end of answer with the argmax of the score
+        answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
+
+        print(f"Question: {question}")
+        print(f"Answer: {answer}\n")
+
+This outputs the questions followed by the predicted answers:
+
+::
+
+    Question: How many pretrained models are available in Transformers?
+    Answer: over 32 +
+
+    Question: What does Transformers provide?
+    Answer: general - purpose architectures
+
+    Question: Transformers provides interoperability between which frameworks?
+    Answer: tensorflow 2 . 0 and pytorch
+
+
+
+Language Modeling
+----------------------------------------------------
+
+Language modeling is the task of fitting a model to a corpus, which can be domain specific. All popular transformer
+based models are trained using a variant of language modeling, e.g. BERT with masked language modeling, GPT-2 with
+causal language modeling.
+
+Language modeling can be useful outside of pre-training as well, for example to shift the model distribution to be
+domain-specific: using a language model trained over a very large corpus, and then fine-tuning it to a news dataset
+or on scientific papers e.g. `lysandre/arxiv-nlp <https://huggingface.co/lysandre/arxiv-nlp>`__.
+
+Masked Language Modeling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Masked language modeling is the task of masking tokens in a sequence with a masking token, and prompting the model to
+fill that mask with an appropriate token. This allows the model to attend to both the right context (tokens on the
+right of the mask) and the left context (tokens on the left of the mask). Such a training creates a strong basis
+for downstream tasks requiring bi-directional context such as SQuAD (question answering,
+see `Lewis, Liu, Goyal et al. <https://arxiv.org/abs/1910.13461>`__, part 4.2).
+
+Here is an example of using pipelines to replace a mask from a sequence:
+
+::
+
+    from transformers import pipeline
+
+    nlp = pipeline("fill-mask")
+    print(nlp(f"HuggingFace is creating a {nlp.tokenizer.mask_token} that the community uses to solve NLP tasks."))
+
+This outputs the sequences with the mask filled, the confidence score as well as the token id in the tokenizer
+vocabulary:
+
+::
+
+    [
+        {'sequence': '<s> HuggingFace is creating a tool that the community uses to solve NLP tasks.</s>', 'score': 0.15627853572368622, 'token': 3944},
+        {'sequence': '<s> HuggingFace is creating a framework that the community uses to solve NLP tasks.</s>', 'score': 0.11690319329500198, 'token': 7208},
+        {'sequence': '<s> HuggingFace is creating a library that the community uses to solve NLP tasks.</s>', 'score': 0.058063216507434845, 'token': 5560},
+        {'sequence': '<s> HuggingFace is creating a database that the community uses to solve NLP tasks.</s>', 'score': 0.04211743175983429, 'token': 8503},
+        {'sequence': '<s> HuggingFace is creating a prototype that the community uses to solve NLP tasks.</s>', 'score': 0.024718601256608963, 'token': 17715}
+    ]
+
+Here is an example doing masked language modeling using a model and a tokenizer. The process is the following:
+
+- Instantiate a tokenizer and a model from the checkpoint name. The model is identified as a DistilBERT model and
+  is loaded with the weights stored in the checkpoint.
+- Define a sequence with a masked token, placing the :obj:`tokenizer.mask_token` instead of a word.
+- Encode that sequence into IDs and find the position of the masked token in that list of IDs.
+- Retrieve the predictions at the index of the mask token: this tensor has the same size as the vocabulary, and the
+  values are the scores attributed to each token. The model gives a higher score to tokens it deems probable in that
+  context.
+- Retrieve the top 5 tokens using the PyTorch :obj:`topk` or TensorFlow :obj:`top_k` methods.
+- Replace the mask token by the tokens and print the results
+
+::
+
+    ## PYTORCH CODE
+    from transformers import AutoModelWithLMHead, AutoTokenizer
+    import torch
+
+    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
+    model = AutoModelWithLMHead.from_pretrained("distilbert-base-cased")
+
+    sequence = f"Distilled models are smaller than the models they mimic. Using them instead of the large versions would help {tokenizer.mask_token} our carbon footprint."
+
+    input = tokenizer.encode(sequence, return_tensors="pt")
+    mask_token_index = torch.where(input == tokenizer.mask_token_id)[1]
+
+    token_logits = model(input)[0]
+    mask_token_logits = token_logits[0, mask_token_index, :]
+
+    top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
+
+    for token in top_5_tokens:
+        print(sequence.replace(tokenizer.mask_token, tokenizer.decode([token])))
+    ## TENSORFLOW CODE
+    from transformers import TFAutoModelWithLMHead, AutoTokenizer
+    import tensorflow as tf
+
+    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
+    model = TFAutoModelWithLMHead.from_pretrained("distilbert-base-cased")
+
+    sequence = f"Distilled models are smaller than the models they mimic. Using them instead of the large versions would help {tokenizer.mask_token} our carbon footprint."
+
+    input = tokenizer.encode(sequence, return_tensors="tf")
+    mask_token_index = tf.where(input == tokenizer.mask_token_id)[0, 1]
+
+    token_logits = model(input)[0]
+    mask_token_logits = token_logits[0, mask_token_index, :]
+
+    top_5_tokens = tf.math.top_k(mask_token_logits, 5).indices.numpy()
+
+    for token in top_5_tokens:
+        print(sequence.replace(tokenizer.mask_token, tokenizer.decode([token])))
+
+This prints five sequences, with the top 5 tokens predicted by the model:
+
+::
+
+    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help reduce our carbon footprint.
+    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help increase our carbon footprint.
+    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help decrease our carbon footprint.
+    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help offset our carbon footprint.
+    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help improve our carbon footprint.
+
+
+Causal Language Modeling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Causal language modeling is the task of predicting the token following a sequence of tokens. In this situation, the
+model only attends to the left context (tokens on the left of the mask). Such a training is particularly interesting
+for generation tasks.
+
+Usually, the next token is predicted by sampling from the logits of the last hidden state the model produces from the input sequence.
+
+Here is an example using the tokenizer and model and leveraging the :func:`~transformers.PreTrainedModel.top_k_top_p_filtering` method to sample the next token following an input sequence of tokens.
+
+::
+
+    ## PYTORCH CODE
+    from transformers import AutoModelWithLMHead, AutoTokenizer, top_k_top_p_filtering
+    import torch
+    from torch.nn import functional as F
+
+
+    tokenizer = AutoTokenizer.from_pretrained("gpt2")
+    model = AutoModelWithLMHead.from_pretrained("gpt2")
+
+    sequence = f"Hugging Face is based in DUMBO, New York City, and "
+
+    input_ids = tokenizer.encode(sequence, return_tensors="pt")
+
+    # get logits of last hidden state
+    next_token_logits = model(input_ids)[0][:, -1, :]
+
+    # filter
+    filtered_next_token_logits = top_k_top_p_filtering(next_token_logits, top_k=50, top_p=1.0)
+
+    # sample
+    probs = F.softmax(filtered_next_token_logits, dim=-1)
+    next_token = torch.multinomial(probs, num_samples=1)
+
+    generated = torch.cat([input_ids, next_token], dim=-1)
+
+    resulting_string = tokenizer.decode(generated.tolist()[0])
+    print(resulting_string)
+    ## TENSORFLOW CODE
+    from transformers import TFAutoModelWithLMHead, AutoTokenizer, tf_top_k_top_p_filtering
+    import tensorflow as tf
+
+    tokenizer = AutoTokenizer.from_pretrained("gpt2")
+    model = TFAutoModelWithLMHead.from_pretrained("gpt2")
+
+    sequence = f"Hugging Face is based in DUMBO, New York City, and "
+
+    input_ids = tokenizer.encode(sequence, return_tensors="tf")
+
+    # get logits of last hidden state
+    next_token_logits = model(input_ids)[0][:, -1, :]
+
+    # filter
+    filtered_next_token_logits = tf_top_k_top_p_filtering(next_token_logits, top_k=50, top_p=1.0)
+
+    # sample
+    next_token = tf.random.categorical(filtered_next_token_logits, dtype=tf.int32, num_samples=1)
+
+    generated = tf.concat([input_ids, next_token], axis=1)
+
+    resulting_string = tokenizer.decode(generated.numpy().tolist()[0])
+    print(resulting_string)
+
+
+This outputs a (hopefully) coherent next token following the original sequence, which in our case is the word *has*:
+
+::
+
+    Hugging Face is based in DUMBO, New York City, and has
+
+In the next section, we show how this functionality is leveraged in :func:`~transformers.PreTrainedModel.generate` to generate multiple tokens up to a user-defined length.
+
+Text Generation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In text generation (*a.k.a* *open-ended text generation*) the goal is to create a coherent portion of text that is a continuation from the given context. As an example, it is shown below how *GPT-2* can be used in pipelines to generate text. By default, all models apply *Top-K* sampling when used in pipelines as configured in their respective configurations (see `gpt-2 config <https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json>`_ for example).
+
+::
+
+    from transformers import pipeline
+
+    text_generator = pipeline("text-generation")
+    print(text_generator("As far as I am concerned, I will", max_length=50))
+
+
+Here the model generates a random text with a total maximal length of *50* tokens from context *"As far as I am concerned, I will"*.
+The default arguments of ``PreTrainedModel.generate()`` can directly be overridden in the pipeline as is shown above for the argument ``max_length``.
+
+Here is an example for text generation using XLNet and its tokenizer.
+
+::
+
+    ## PYTORCH CODE
+    from transformers import AutoModelWithLMHead, AutoTokenizer
+
+    model = AutoModelWithLMHead.from_pretrained("xlnet-base-cased")
+    tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")
+
+    # Padding text helps XLNet with short prompts - proposed by Aman Rusia in https://github.com/rusiaaman/XLNet-gen#methodology
+    PADDING_TEXT = """In 1991, the remains of Russian Tsar Nicholas II and his family
+    (except for Alexei and Maria) are discovered.
+    The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the
+    remainder of the story. 1883 Western Siberia,
+    a young Grigori Rasputin is asked by his father and a group of men to perform magic.
+    Rasputin has a vision and denounces one of the men as a horse thief. Although his
+    father initially slaps him for making such an accusation, Rasputin watches as the
+    man is chased outside and beaten. Twenty years later, Rasputin sees a vision of
+    the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous,
+    with people, even a bishop, begging for his blessing. <eod> </s> <eos>""" 
+
+    prompt = "Today the weather is really nice and I am planning on "
+    inputs = tokenizer.encode(PADDING_TEXT + prompt, add_special_tokens=False, return_tensors="pt")
+    
+    prompt_length = len(tokenizer.decode(inputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True))
+    outputs = model.generate(inputs, max_length=250, do_sample=True, top_p=0.95, top_k=60)
+    generated = prompt + tokenizer.decode(outputs[0])[prompt_length:]
+
+    print(generated)
+    ## TENSORFLOW CODE
+    from transformers import TFAutoModelWithLMHead, AutoTokenizer
+
+    model = TFAutoModelWithLMHead.from_pretrained("xlnet-base-cased")
+    tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")
+
+    # Padding text helps XLNet with short prompts - proposed by Aman Rusia in https://github.com/rusiaaman/XLNet-gen#methodology
+    PADDING_TEXT = """In 1991, the remains of Russian Tsar Nicholas II and his family
+    (except for Alexei and Maria) are discovered.
+    The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the
+    remainder of the story. 1883 Western Siberia,
+    a young Grigori Rasputin is asked by his father and a group of men to perform magic.
+    Rasputin has a vision and denounces one of the men as a horse thief. Although his
+    father initially slaps him for making such an accusation, Rasputin watches as the
+    man is chased outside and beaten. Twenty years later, Rasputin sees a vision of
+    the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous,
+    with people, even a bishop, begging for his blessing. <eod> </s> <eos>""" 
+
+    prompt = "Today the weather is really nice and I am planning on "
+    inputs = tokenizer.encode(PADDING_TEXT + prompt, add_special_tokens=False, return_tensors="tf")
+
+    prompt_length = len(tokenizer.decode(inputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True))
+    outputs = model.generate(inputs, max_length=250, do_sample=True, top_p=0.95, top_k=60)
+    generated = prompt + tokenizer.decode(outputs[0])[prompt_length:]
+
+    print(generated)
+
+Text generation is currently possible with *GPT-2*, *OpenAi-GPT*, *CTRL*, *XLNet*, *Transfo-XL* and *Reformer* in PyTorch and for most models in Tensorflow as well. As can be seen in the example above *XLNet* and *Transfo-xl* often need to be padded to work well.
+GPT-2 is usually a good choice for *open-ended text generation* because it was trained on millions of webpages with a causal language modeling objective.
+
+For more information on how to apply different decoding strategies for text generation, please also refer to our generation blog post `here <https://huggingface.co/blog/how-to-generate>`_.
+
+
+Named Entity Recognition
+----------------------------------------------------
+
+Named Entity Recognition (NER) is the task of classifying tokens according to a class, for example identifying a
+token as a person, an organisation or a location.
+An example of a named entity recognition dataset is the CoNLL-2003 dataset, which is entirely based on that task.
+If you would like to fine-tune a model on an NER task, you may leverage the `ner/run_ner.py` (PyTorch),
+`ner/run_pl_ner.py` (leveraging pytorch-lightning) or the `ner/run_tf_ner.py` (TensorFlow) scripts.
+
+Here is an example using the pipelines to do named entity recognition, trying to identify tokens as belonging to one
+of 9 classes:
+
+- O, Outside of a named entity
+- B-MISC, Beginning of a miscellaneous entity right after another miscellaneous entity
+- I-MISC, Miscellaneous entity
+- B-PER, Beginning of a person's name right after another person's name
+- I-PER, Person's name
+- B-ORG, Beginning of an organisation right after another organisation
+- I-ORG, Organisation
+- B-LOC, Beginning of a location right after another location
+- I-LOC, Location
+
+It leverages a fine-tuned model on CoNLL-2003, fine-tuned by `@stefan-it <https://github.com/stefan-it>`__ from
+`dbmdz <https://github.com/dbmdz>`__.
+
+::
+
+    from transformers import pipeline
+
+    nlp = pipeline("ner")
+
+    sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very" \
+               "close to the Manhattan Bridge which is visible from the window."
+
+    print(nlp(sequence))
+
+This outputs a list of all words that have been identified as an entity from the 9 classes defined above. Here are the
+expected results:
+
+::
+
+    [
+        {'word': 'Hu', 'score': 0.9995632767677307, 'entity': 'I-ORG'},
+        {'word': '##gging', 'score': 0.9915938973426819, 'entity': 'I-ORG'},
+        {'word': 'Face', 'score': 0.9982671737670898, 'entity': 'I-ORG'},
+        {'word': 'Inc', 'score': 0.9994403719902039, 'entity': 'I-ORG'},
+        {'word': 'New', 'score': 0.9994346499443054, 'entity': 'I-LOC'},
+        {'word': 'York', 'score': 0.9993270635604858, 'entity': 'I-LOC'},
+        {'word': 'City', 'score': 0.9993864893913269, 'entity': 'I-LOC'},
+        {'word': 'D', 'score': 0.9825621843338013, 'entity': 'I-LOC'},
+        {'word': '##UM', 'score': 0.936983048915863, 'entity': 'I-LOC'},
+        {'word': '##BO', 'score': 0.8987102508544922, 'entity': 'I-LOC'},
+        {'word': 'Manhattan', 'score': 0.9758241176605225, 'entity': 'I-LOC'},
+        {'word': 'Bridge', 'score': 0.990249514579773, 'entity': 'I-LOC'}
+    ]
+
+Note how the words "Hugging Face" have been identified as an organisation, and "New York City", "DUMBO" and
+"Manhattan Bridge" have been identified as locations.
+
+Here is an example doing named entity recognition using a model and a tokenizer. The process is the following:
+
+- Instantiate a tokenizer and a model from the checkpoint name. The model is identified as a BERT model and
+  loads it with the weights stored in the checkpoint.
+- Define the label list with which the model was trained.
+- Define a sequence with known entities, such as "Hugging Face" as an organisation and "New York City" as a location.
+- Split words into tokens so that they can be mapped to the predictions. We use a small hack by firstly completely
+  encoding and decoding the sequence, so that we're left with a string that contains the special tokens.
+- Encode that sequence into IDs (special tokens are added automatically).
+- Retrieve the predictions by passing the input to the model and getting the first output. This results in a
+  distribution over the 9 possible classes for each token. We take the argmax to retrieve the most likely class
+  for each token.
+- Zip together each token with its prediction and print it.
+
+::
+
+    ## PYTORCH CODE
+    from transformers import AutoModelForTokenClassification, AutoTokenizer
+    import torch
+
+    model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
+    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+
+    label_list = [
+        "O",       # Outside of a named entity
+        "B-MISC",  # Beginning of a miscellaneous entity right after another miscellaneous entity
+        "I-MISC",  # Miscellaneous entity
+        "B-PER",   # Beginning of a person's name right after another person's name
+        "I-PER",   # Person's name
+        "B-ORG",   # Beginning of an organisation right after another organisation
+        "I-ORG",   # Organisation
+        "B-LOC",   # Beginning of a location right after another location
+        "I-LOC"    # Location
+    ]
+
+    sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very" \
+               "close to the Manhattan Bridge."
+
+    # Bit of a hack to get the tokens with the special tokens
+    tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(sequence)))
+    inputs = tokenizer.encode(sequence, return_tensors="pt")
+
+    outputs = model(inputs)[0]
+    predictions = torch.argmax(outputs, dim=2)
+
+    print([(token, label_list[prediction]) for token, prediction in zip(tokens, predictions[0].tolist())])
+    ## TENSORFLOW CODE
+    from transformers import TFAutoModelForTokenClassification, AutoTokenizer
+    import tensorflow as tf
+
+    model = TFAutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
+    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+
+    label_list = [
+        "O",       # Outside of a named entity
+        "B-MISC",  # Beginning of a miscellaneous entity right after another miscellaneous entity
+        "I-MISC",  # Miscellaneous entity
+        "B-PER",   # Beginning of a person's name right after another person's name
+        "I-PER",   # Person's name
+        "B-ORG",   # Beginning of an organisation right after another organisation
+        "I-ORG",   # Organisation
+        "B-LOC",   # Beginning of a location right after another location
+        "I-LOC"    # Location
+    ]
+
+    sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very" \
+               "close to the Manhattan Bridge."
+
+    # Bit of a hack to get the tokens with the special tokens
+    tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(sequence)))
+    inputs = tokenizer.encode(sequence, return_tensors="tf")
+
+    outputs = model(inputs)[0]
+    predictions = tf.argmax(outputs, axis=2)
+
+    print([(token, label_list[prediction]) for token, prediction in zip(tokens, predictions[0].numpy())])
+
+This outputs a list of each token mapped to their prediction. Differently from the pipeline, here every token has
+a prediction as we didn't remove the "O" class which means that no particular entity was found on that token. The
+following array should be the output:
+
+::
+
+    [('[CLS]', 'O'), ('Hu', 'I-ORG'), ('##gging', 'I-ORG'), ('Face', 'I-ORG'), ('Inc', 'I-ORG'), ('.', 'O'), ('is', 'O'), ('a', 'O'), ('company', 'O'), ('based', 'O'), ('in', 'O'), ('New', 'I-LOC'), ('York', 'I-LOC'), ('City', 'I-LOC'), ('.', 'O'), ('Its', 'O'), ('headquarters', 'O'), ('are', 'O'), ('in', 'O'), ('D', 'I-LOC'), ('##UM', 'I-LOC'), ('##BO', 'I-LOC'), (',', 'O'), ('therefore', 'O'), ('very', 'O'), ('##c', 'O'), ('##lose', 'O'), ('to', 'O'), ('the', 'O'), ('Manhattan', 'I-LOC'), ('Bridge', 'I-LOC'), ('.', 'O'), ('[SEP]', 'O')]   
+
+Summarization
+----------------------------------------------------
+
+Summarization is the task of summarizing a text / an article into a shorter text.
+
+An example of a summarization dataset is the CNN / Daily Mail dataset, which consists of long news articles and was created for the task of summarization.
+If you would like to fine-tune a model on a summarization task, you may leverage the ``examples/summarization/bart/run_train.sh`` (leveraging pytorch-lightning) script.
+
+Here is an example using the pipelines to do summarization.
+It leverages a Bart model that was fine-tuned on the CNN / Daily Mail data set.
+
+::
+
+    from transformers import pipeline
+
+    summarizer = pipeline("summarization")
+
+    ARTICLE = """ New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York. 
+    A year later, she got married again in Westchester County, but to a different man and without divorcing her first husband. 
+    Only 18 days after that marriage, she got hitched yet again. Then, Barrientos declared "I do" five more times, sometimes only within two weeks of each other. 
+    In 2010, she married once more, this time in the Bronx. In an application for a marriage license, she stated it was her "first and only" marriage. 
+    Barrientos, now 39, is facing two criminal counts of "offering a false instrument for filing in the first degree," referring to her false statements on the 
+    2010 marriage license application, according to court documents. 
+    Prosecutors said the marriages were part of an immigration scam. 
+    On Friday, she pleaded not guilty at State Supreme Court in the Bronx, according to her attorney, Christopher Wright, who declined to comment further. 
+    After leaving court, Barrientos was arrested and charged with theft of service and criminal trespass for allegedly sneaking into the New York subway through an emergency exit, said Detective 
+    Annette Markowski, a police spokeswoman. In total, Barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002. 
+    All occurred either in Westchester County, Long Island, New Jersey or the Bronx. She is believed to still be married to four men, and at one time, she was married to eight men at once, prosecutors say. 
+    Prosecutors said the immigration scam involved some of her husbands, who filed for permanent residence status shortly after the marriages. 
+    Any divorces happened only after such filings were approved. It was unclear whether any of the men will be prosecuted. 
+    The case was referred to the Bronx District Attorney\'s Office by Immigration and Customs Enforcement and the Department of Homeland Security\'s 
+    Investigation Division. Seven of the men are from so-called "red-flagged" countries, including Egypt, Turkey, Georgia, Pakistan and Mali. 
+    Her eighth husband, Rashid Rajput, was deported in 2006 to his native Pakistan after an investigation by the Joint Terrorism Task Force. 
+    If convicted, Barrientos faces up to four years in prison.  Her next court appearance is scheduled for May 18.
+    """
+    
+    print(summarizer(ARTICLE, max_length=130, min_length=30))
+
+Because the summarization pipeline depends on the ``PreTrainedModel.generate()`` method, we can override the default arguments
+of ``PreTrainedModel.generate()`` directly in the pipeline as is shown for ``max_length`` and ``min_length`` above.
+This outputs the following summary:
+
+::
+
+  Liana Barrientos has been married 10 times, sometimes within two weeks of each other. Prosecutors say the marriages were part of an immigration scam. She pleaded not guilty at State Supreme Court in the Bronx on Friday.
+  
+Here is an example doing summarization using a model and a tokenizer. The process is the following:
+
+- Instantiate a tokenizer and a model from the checkpoint name. Summarization is usually done using an encoder-decoder model, such as ``Bart`` or ``T5``.
+- Define the article that should be summarized.
+- Leverage the ``PreTrainedModel.generate()`` method.
+- Add the T5 specific prefix "summarize: ".
+
+Here Google's T5 model is used, which was only pre-trained on a multi-task mixed data set (including CNN / Daily Mail), but nevertheless yields very good results.
+
+::
+
+    ## PYTORCH CODE
+    from transformers import AutoModelWithLMHead, AutoTokenizer
+
+    model = AutoModelWithLMHead.from_pretrained("t5-base")
+    tokenizer = AutoTokenizer.from_pretrained("t5-base")
+
+    # T5 uses a max_length of 512 so we cut the article to 512 tokens.
+    inputs = tokenizer.encode("summarize: " + ARTICLE, return_tensors="pt", max_length=512)
+    outputs = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
+    print(outputs)
+    
+    ## TENSORFLOW CODE
+    from transformers import TFAutoModelWithLMHead, AutoTokenizer
+
+    model = TFAutoModelWithLMHead.from_pretrained("t5-base")
+    tokenizer = AutoTokenizer.from_pretrained("t5-base")
+
+    # T5 uses a max_length of 512 so we cut the article to 512 tokens.
+    inputs = tokenizer.encode("summarize: " + ARTICLE, return_tensors="tf", max_length=512)
+    outputs = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
+    print(outputs)  
+
+Translation
+----------------------------------------------------
+
+Translation is the task of translating a text from one language to another.
+
+An example of a translation dataset is the WMT English to German dataset, which has English sentences as the input data 
+and German sentences as the target data.
+
+Here is an example using the pipelines to do translation.
+It leverages a T5 model that was only pre-trained on a multi-task mixture dataset (including WMT), but yields impressive 
+translation results nevertheless.
+
+::
+
+    from transformers import pipeline
+
+    translator = pipeline("translation_en_to_de")
+    print(translator("Hugging Face is a technology company based in New York and Paris", max_length=40))
+
+Because the translation pipeline depends on the ``PreTrainedModel.generate()`` method, we can override the default arguments
+of ``PreTrainedModel.generate()`` directly in the pipeline as is shown for ``max_length`` above.
+This outputs the following translation into German:
+
+::
+
+  Hugging Face ist ein Technologieunternehmen mit Sitz in New York und Paris.
+  
+Here is an example doing translation using a model and a tokenizer. The process is the following:
+
+- Instantiate a tokenizer and a model from the checkpoint name. Translation is usually done using an encoder-decoder model, such as ``Bart`` or ``T5``.
+- Define the sentence that should be translated.
+- Leverage the ``PreTrainedModel.generate()`` method.
+- Add the T5 specific prefix "translate English to German: "
+
+::
+
+    ## PYTORCH CODE
+    from transformers import AutoModelWithLMHead, AutoTokenizer
+
+    model = AutoModelWithLMHead.from_pretrained("t5-base")
+    tokenizer = AutoTokenizer.from_pretrained("t5-base")
+
+    inputs = tokenizer.encode("translate English to German: Hugging Face is a technology company based in New York and Paris", return_tensors="pt")
+    outputs = model.generate(inputs, max_length=40, num_beams=4, early_stopping=True)
+
+    print(outputs)
+    
+    ## TENSORFLOW CODE
+    from transformers import TFAutoModelWithLMHead, AutoTokenizer
+
+    model = TFAutoModelWithLMHead.from_pretrained("t5-base")
+    tokenizer = AutoTokenizer.from_pretrained("t5-base")
+
+    inputs = tokenizer.encode("translate English to German: Hugging Face is a technology company based in New York and Paris", return_tensors="tf")
+    outputs = model.generate(inputs, max_length=40, num_beams=4, early_stopping=True)
+
+    print(outputs)
--- a/examples/README.md
+++ b/examples/README.md
@@ -1,10 +1,41 @@
-# Examples
+## Examples

-In this section a few examples are put together. All of these examples work for several models, making use of the very
-similar API between the different models.
+Version 2.9 of `transformers` introduces a new [`Trainer`](https://github.com/huggingface/transformers/blob/master/src/transformers/trainer.py) class for PyTorch, and its equivalent [`TFTrainer`](https://github.com/huggingface/transformers/blob/master/src/transformers/trainer_tf.py) for TF 2.
+Running the examples requires PyTorch 1.3.1+ or TensorFlow 2.0+.

-**Important**  
-To run the latest versions of the examples, you have to install from source and install some specific requirements for the examples.
+Here is the list of all our examples:
+- **grouped by task** (all official examples work for multiple models)
+- with information on whether they are **built on top of `Trainer`/`TFTrainer`** (if not, they still work, they might just lack some features),
+- whether they also include examples for **`pytorch-lightning`**, which is a great fully-featured, general-purpose training library for PyTorch,
+- links to **Colab notebooks** to walk through the scripts and run them easily,
+- links to **Cloud deployments** to be able to deploy large-scale trainings in the Cloud with little to no setup.
+
+This is still a work-in-progress – in particular documentation is still sparse – so please **contribute improvements/pull requests.**
+
+
+# The Big Table of Tasks
+
+| Task | Example datasets | Trainer support | TFTrainer support | pytorch-lightning | Colab
+|---|---|:---:|:---:|:---:|:---:|
+| [**`language-modeling`**](https://github.com/huggingface/transformers/tree/master/examples/language-modeling)       | Raw text        | ✅ | -  | -  | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb)
+| [**`text-classification`**](https://github.com/huggingface/transformers/tree/master/examples/text-classification)   | GLUE, XNLI      | ✅ | ✅ | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/trainer/01_text_classification.ipynb)
+| [**`token-classification`**](https://github.com/huggingface/transformers/tree/master/examples/token-classification) | CoNLL NER       | ✅ | ✅ | ✅ | -
+| [**`multiple-choice`**](https://github.com/huggingface/transformers/tree/master/examples/multiple-choice)           | SWAG, RACE, ARC | ✅ | ✅ | -  | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ViktorAlm/notebooks/blob/master/MPC_GPU_Demo_for_TF_and_PT.ipynb)
+| [**`question-answering`**](https://github.com/huggingface/transformers/tree/master/examples/question-answering)     | SQuAD           | -  | ✅ | -  | -
+| [**`text-generation`**](https://github.com/huggingface/transformers/tree/master/examples/text-generation)     | -           | -  | - | -  | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/02_how_to_generate.ipynb)
+| [**`distillation`**](https://github.com/huggingface/transformers/tree/master/examples/distillation)       | All               | -  | -  | -  | -
+| [**`summarization`**](https://github.com/huggingface/transformers/tree/master/examples/summarization)     | CNN/Daily Mail    | -  | -  | -  | -
+| [**`translation`**](https://github.com/huggingface/transformers/tree/master/examples/translation)         | WMT               | -  | -  | -  | -
+| [**`bertology`**](https://github.com/huggingface/transformers/tree/master/examples/bertology)             | -                 | -  | -  | -  | -
+| [**`adversarial`**](https://github.com/huggingface/transformers/tree/master/examples/adversarial)         | HANS              | -  | -  | -  | -
+
+
+<br>
+
+## Important note
+
+**Important**
+To make sure you can successfully run the latest versions of the example scripts, you have to install the library from source and install some example-specific requirements.
 Execute the following steps in a new virtual environment:

 ```bash
@@ -14,788 +45,36 @@ pip install .
 pip install -r ./examples/requirements.txt
 ```

-| Section                    | Description                                                                                                                                                |
-|----------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| [TensorFlow 2.0 models on GLUE](#TensorFlow-2.0-Bert-models-on-GLUE) | Examples running BERT TensorFlow 2.0 model on the GLUE tasks. 
-| [Language Model fine-tuning](#language-model-fine-tuning) | Fine-tuning the library models for language modeling on a text dataset. Causal language modeling for GPT/GPT-2, masked language modeling for BERT/RoBERTa. |
-| [Language Generation](#language-generation) | Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, Transformer-XL and XLNet.                                         |
-| [GLUE](#glue) | Examples running BERT/XLM/XLNet/RoBERTa on the 9 GLUE tasks. Examples feature distributed training as well as half-precision.                              |
-| [SQuAD](#squad) | Using BERT/RoBERTa/XLNet/XLM for question answering, examples with distributed training.                                                                                  |
-| [Multiple Choice](#multiple-choice) | Examples running BERT/XLNet/RoBERTa on the SWAG/RACE/ARC tasks. 
-| [Named Entity Recognition](#named-entity-recognition) | Using BERT for Named Entity Recognition (NER) on the CoNLL 2003 dataset, examples with distributed training.                                                                                  |
-| [XNLI](#xnli) | Examples running BERT/XLM on the XNLI benchmark. |
-| [Adversarial evaluation of model performances](#adversarial-evaluation-of-model-performances) | Testing a model with adversarial evaluation of natural language
-inference on the Heuristic Analysis for NLI Systems (HANS) dataset (McCoy et al., 2019.) |
+## One-click Deploy to Cloud (wip)

-## TensorFlow 2.0 Bert models on GLUE
+#### Azure

-Based on the script [`run_tf_glue.py`](https://github.com/huggingface/transformers/blob/master/examples/run_tf_glue.py).
+[![Deploy to Azure](https://aka.ms/deploytoazurebutton)](https://portal.azure.com/#create/Microsoft.Template/uri/https%3A%2F%2Fraw.githubusercontent.com%2FAzure%2Fazure-quickstart-templates%2Fmaster%2F101-storage-account-create%2Fazuredeploy.json)

-Fine-tuning the library TensorFlow 2.0 Bert model for sequence classification on the  MRPC task of the GLUE benchmark: [General Language Understanding Evaluation](https://gluebenchmark.com/).
+## Running on TPUs

-This script has an option for mixed precision (Automatic Mixed Precision / AMP) to run models on Tensor Cores (NVIDIA Volta/Turing GPUs) and future hardware and an option for XLA, which uses the XLA compiler to reduce model runtime.
-Options are toggled using `USE_XLA` or `USE_AMP` variables in the script.
-These options and the below benchmark are provided by @tlkh.
+When using TensorFlow, TPUs are supported out of the box as a `tf.distribute.Strategy`.

-Quick benchmarks from the script (no other modifications):
+When using PyTorch, we support TPUs thanks to `pytorch/xla`. For more context and information on how to set up your TPU environment, refer to Google's documentation and to the
+very detailed [pytorch/xla README](https://github.com/pytorch/xla/blob/master/README.md).

-| GPU    | Mode | Time (2nd epoch) | Val Acc (3 runs) |
-| --------- | -------- | ----------------------- | ----------------------|
-| Titan V | FP32 | 41s | 0.8438/0.8281/0.8333 |
-| Titan V | AMP | 26s | 0.8281/0.8568/0.8411 |
-| V100    | FP32 | 35s | 0.8646/0.8359/0.8464 |
-| V100    | AMP | 22s | 0.8646/0.8385/0.8411 |
-| 1080 Ti | FP32 | 55s | - |
+In this repo, we provide a very simple launcher script named [xla_spawn.py](https://github.com/huggingface/transformers/tree/master/examples/xla_spawn.py) that lets you run our example scripts on multiple TPU cores without any boilerplate.
+Just pass a `--num_cores` flag to this script, then your regular training script with its arguments (this is similar to the `torch.distributed.launch` helper for torch.distributed).

-Mixed precision (AMP) reduces the training time considerably for the same hardware and hyper-parameters (same batch size was used).
-
-## Language model fine-tuning
-
-Based on the script [`run_lm_finetuning.py`](https://github.com/huggingface/transformers/blob/master/examples/run_lm_finetuning.py).
-
-Fine-tuning the library models for language modeling on a text dataset for GPT, GPT-2, BERT and RoBERTa (DistilBERT 
-to be added soon). GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa 
-are fine-tuned using a masked language modeling (MLM) loss.
-
-Before running the following example, you should get a file that contains text on which the language model will be
-fine-tuned. A good example of such text is the [WikiText-2 dataset](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/).
-
-We will refer to two different files: `$TRAIN_FILE`, which contains text for training, and `$TEST_FILE`, which contains
-text that will be used for evaluation.
-
-### GPT-2/GPT and causal language modeling
-
-The following example fine-tunes GPT-2 on WikiText-2. We're using the raw WikiText-2 (no tokens were replaced before
-the tokenization). The loss here is that of causal language modeling.
+For example for `run_glue`:

 ```bash
-export TRAIN_FILE=/path/to/dataset/wiki.train.raw
-export TEST_FILE=/path/to/dataset/wiki.test.raw
-
-python run_lm_finetuning.py \
-    --output_dir=output \
-    --model_type=gpt2 \
-    --model_name_or_path=gpt2 \
-    --do_train \
-    --train_data_file=$TRAIN_FILE \
-    --do_eval \
-    --eval_data_file=$TEST_FILE
+python examples/xla_spawn.py --num_cores 8 \
+	examples/text-classification/run_glue.py \
+	--model_name_or_path bert-base-cased \
+	--task_name mnli \
+	--data_dir ./data/glue_data/MNLI \
+	--output_dir ./models/tpu \
+	--overwrite_output_dir \
+	--do_train \
+	--do_eval \
+	--num_train_epochs 1 \
+	--save_steps 20000
 ```

-This takes about half an hour to train on a single K80 GPU and about one minute for the evaluation to run. It reaches
-a score of ~20 perplexity once fine-tuned on the dataset.
-
-### RoBERTa/BERT and masked language modeling
-
-The following example fine-tunes RoBERTa on WikiText-2. Here too, we're using the raw WikiText-2. The loss is different
-as BERT/RoBERTa have a bidirectional mechanism; we're therefore using the same loss that was used during their
-pre-training: masked language modeling. 
-
-In accordance to the RoBERTa paper, we use dynamic masking rather than static masking. The model may, therefore, converge
-slightly slower (over-fitting takes more epochs).
-
-We use the `--mlm` flag so that the script may change its loss function.
-
-```bash
-export TRAIN_FILE=/path/to/dataset/wiki.train.raw
-export TEST_FILE=/path/to/dataset/wiki.test.raw
-
-python run_lm_finetuning.py \
-    --output_dir=output \
-    --model_type=roberta \
-    --model_name_or_path=roberta-base \
-    --do_train \
-    --train_data_file=$TRAIN_FILE \
-    --do_eval \
-    --eval_data_file=$TEST_FILE \
-    --mlm
-```
-
-## Language generation
-
-Based on the script [`run_generation.py`](https://github.com/huggingface/transformers/blob/master/examples/run_generation.py).
-
-Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, Transformer-XL, XLNet, CTRL.
-A similar script is used for our official demo [Write With Transfomer](https://transformer.huggingface.co), where you
-can try out the different models available in the library.
-
-Example usage:
-
-```bash
-python run_generation.py \
-    --model_type=gpt2 \
-    --model_name_or_path=gpt2
-```
-
-## GLUE
-
-Based on the script [`run_glue.py`](https://github.com/huggingface/transformers/blob/master/examples/run_glue.py).
-
-Fine-tuning the library models for sequence classification on the GLUE benchmark: [General Language Understanding 
-Evaluation](https://gluebenchmark.com/). This script can fine-tune the following models: BERT, XLM, XLNet and RoBERTa. 
-
-GLUE is made up of a total of 9 different tasks. We get the following results on the dev set of the benchmark with an
-uncased  BERT base model (the checkpoint `bert-base-uncased`). All experiments ran single V100 GPUs with a total train
-batch sizes between 16 and 64. Some of these tasks have a small dataset and training can lead to high variance in the results
-between different runs. We report the median on 5 runs (with different seeds) for each of the metrics.
-
-| Task  | Metric                       | Result      |
-|-------|------------------------------|-------------|
-| CoLA  | Matthew's corr               | 49.23       |
-| SST-2 | Accuracy                     | 91.97       |
-| MRPC  | F1/Accuracy                  | 89.47/85.29 |
-| STS-B | Person/Spearman corr.        | 83.95/83.70 |
-| QQP   | Accuracy/F1                  | 88.40/84.31 |
-| MNLI  | Matched acc./Mismatched acc. | 80.61/81.08 |
-| QNLI  | Accuracy                     | 87.46       |
-| RTE   | Accuracy                     | 61.73       |
-| WNLI  | Accuracy                     | 45.07       |
-
-Some of these results are significantly different from the ones reported on the test set
-of GLUE benchmark on the website. For QQP and WNLI, please refer to [FAQ #12](https://gluebenchmark.com/faq) on the webite.
-
-Before running anyone of these GLUE tasks you should download the
-[GLUE data](https://gluebenchmark.com/tasks) by running
-[this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e)
-and unpack it to some directory `$GLUE_DIR`.
-
-```bash
-export GLUE_DIR=/path/to/glue
-export TASK_NAME=MRPC
-
-python run_glue.py \
-  --model_type bert \
-  --model_name_or_path bert-base-cased \
-  --task_name $TASK_NAME \
-  --do_train \
-  --do_eval \
-  --do_lower_case \
-  --data_dir $GLUE_DIR/$TASK_NAME \
-  --max_seq_length 128 \
-  --per_gpu_train_batch_size 32 \
-  --learning_rate 2e-5 \
-  --num_train_epochs 3.0 \
-  --output_dir /tmp/$TASK_NAME/
-```
-
-where task name can be one of CoLA, SST-2, MRPC, STS-B, QQP, MNLI, QNLI, RTE, WNLI.
-
-The dev set results will be present within the text file `eval_results.txt` in the specified output_dir. 
-In case of MNLI, since there are two separate dev sets (matched and mismatched), there will be a separate 
-output folder called `/tmp/MNLI-MM/` in addition to `/tmp/MNLI/`.
-
-The code has not been tested with half-precision training with apex on any GLUE task apart from MRPC, MNLI, 
-CoLA, SST-2. The following section provides details on how to run half-precision training with MRPC. With that being 
-said, there shouldn’t be any issues in running half-precision training with the remaining GLUE tasks as well, 
-since the data processor for each task inherits from the base class DataProcessor.
-
-### MRPC
-
-#### Fine-tuning example
-
-The following examples fine-tune BERT on the Microsoft Research Paraphrase Corpus (MRPC) corpus and runs in less 
-than 10 minutes on a single K-80 and in 27 seconds (!) on single tesla V100 16GB with apex installed.
-
-Before running anyone of these GLUE tasks you should download the
-[GLUE data](https://gluebenchmark.com/tasks) by running
-[this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e)
-and unpack it to some directory `$GLUE_DIR`.
-
-```bash
-export GLUE_DIR=/path/to/glue
-
-python run_glue.py \
-  --model_type bert \
-  --model_name_or_path bert-base-cased \
-  --task_name MRPC \
-  --do_train \
-  --do_eval \
-  --do_lower_case \
-  --data_dir $GLUE_DIR/MRPC/ \
-  --max_seq_length 128 \
-  --per_gpu_train_batch_size 32 \
-  --learning_rate 2e-5 \
-  --num_train_epochs 3.0 \
-  --output_dir /tmp/mrpc_output/
-```
-
-Our test ran on a few seeds with [the original implementation hyper-
-parameters](https://github.com/google-research/bert#sentence-and-sentence-pair-classification-tasks) gave evaluation 
-results between 84% and 88%.
-
-#### Using Apex and mixed-precision
-
-Using Apex and 16 bit precision, the fine-tuning on MRPC only takes 27 seconds. First install 
-[apex](https://github.com/NVIDIA/apex), then run the following example:
-
-```bash
-export GLUE_DIR=/path/to/glue
-
-python run_glue.py \
-  --model_type bert \
-  --model_name_or_path bert-base-cased \
-  --task_name MRPC \
-  --do_train \
-  --do_eval \
-  --do_lower_case \
-  --data_dir $GLUE_DIR/MRPC/ \
-  --max_seq_length 128 \
-  --per_gpu_train_batch_size 32 \
-  --learning_rate 2e-5 \
-  --num_train_epochs 3.0 \
-  --output_dir /tmp/mrpc_output/ \
-  --fp16
-```
-
-#### Distributed training
-
-Here is an example using distributed training on 8 V100 GPUs. The model used is the BERT whole-word-masking and it
-reaches F1 > 92 on MRPC.
-
-```bash
-export GLUE_DIR=/path/to/glue
-
-python -m torch.distributed.launch \
-    --nproc_per_node 8 run_glue.py \
-    --model_type bert \
-    --model_name_or_path bert-base-cased \
-    --task_name MRPC \
-    --do_train \
-    --do_eval \
-    --do_lower_case \
-    --data_dir $GLUE_DIR/MRPC/ \
-    --max_seq_length 128 \
-    --per_gpu_train_batch_size 8 \
-    --learning_rate 2e-5 \
-    --num_train_epochs 3.0 \
-    --output_dir /tmp/mrpc_output/
-```
-
-Training with these hyper-parameters gave us the following results:
-
-```bash
-acc = 0.8823529411764706
-acc_and_f1 = 0.901702786377709
-eval_loss = 0.3418912578906332
-f1 = 0.9210526315789473
-global_step = 174
-loss = 0.07231863956341798
-```
-
-### MNLI
-
-The following example uses the BERT-large, uncased, whole-word-masking model and fine-tunes it on the MNLI task.
-
-```bash
-export GLUE_DIR=/path/to/glue
-
-python -m torch.distributed.launch \
-    --nproc_per_node 8 run_glue.py \
-    --model_type bert \
-    --model_name_or_path bert-base-cased \
-    --task_name mnli \
-    --do_train \
-    --do_eval \
-    --do_lower_case \
-    --data_dir $GLUE_DIR/MNLI/ \
-    --max_seq_length 128 \
-    --per_gpu_train_batch_size 8 \
-    --learning_rate 2e-5 \
-    --num_train_epochs 3.0 \
-    --output_dir output_dir \
-```
-
-The results  are the following:
-
-```bash
-***** Eval results *****
-  acc = 0.8679706601466992
-  eval_loss = 0.4911287787382479
-  global_step = 18408
-  loss = 0.04755385363816904
-
-***** Eval results *****
-  acc = 0.8747965825874695
-  eval_loss = 0.45516540421714036
-  global_step = 18408
-  loss = 0.04755385363816904
-```
-
-## Multiple Choice
-
-Based on the script [`run_multiple_choice.py`]().
-
-#### Fine-tuning on SWAG
-Download [swag](https://github.com/rowanz/swagaf/tree/master/data) data
-
-```bash
-#training on 4 tesla V100(16GB) GPUS
-export SWAG_DIR=/path/to/swag_data_dir
-python ./examples/run_multiple_choice.py \
--model_type roberta \
--task_name swag \
--model_name_or_path roberta-base \
--do_train \
--do_eval \
--do_lower_case \
--data_dir $SWAG_DIR \
--learning_rate 5e-5 \
--num_train_epochs 3 \
--max_seq_length 80 \
--output_dir models_bert/swag_base \
--per_gpu_eval_batch_size=16 \
--per_gpu_train_batch_size=16 \
--gradient_accumulation_steps 2 \
--overwrite_output
-```
-Training with the defined hyper-parameters yields the following results:
-```
-***** Eval results *****
-eval_acc = 0.8338998300509847
-eval_loss = 0.44457291918821606
-```
-
-## SQuAD
-
-Based on the script [`run_squad.py`](https://github.com/huggingface/transformers/blob/master/examples/run_squad.py).
-
-#### Fine-tuning BERT on SQuAD1.0
-
-This example code fine-tunes BERT on the SQuAD1.0 dataset. It runs in 24 min (with BERT-base) or 68 min (with BERT-large) 
-on a single tesla V100 16GB. The data for SQuAD can be downloaded with the following links and should be saved in a 
-$SQUAD_DIR directory.
-
-* [train-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json)
-* [dev-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json)
-* [evaluate-v1.1.py](https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py)
-
-And for SQuAD2.0, you need to download:
-
- [train-v2.0.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json)
- [dev-v2.0.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json)
- [evaluate-v2.0.py](https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/)
-
-```bash
-export SQUAD_DIR=/path/to/SQUAD
-
-python run_squad.py \
-  --model_type bert \
-  --model_name_or_path bert-base-cased \
-  --do_train \
-  --do_eval \
-  --do_lower_case \
-  --train_file $SQUAD_DIR/train-v1.1.json \
-  --predict_file $SQUAD_DIR/dev-v1.1.json \
-  --per_gpu_train_batch_size 12 \
-  --learning_rate 3e-5 \
-  --num_train_epochs 2.0 \
-  --max_seq_length 384 \
-  --doc_stride 128 \
-  --output_dir /tmp/debug_squad/
-```
-
-Training with the previously defined hyper-parameters yields the following results:
-
-```bash
-f1 = 88.52
-exact_match = 81.22
-```
-
-#### Distributed training
-
-
-Here is an example using distributed training on 8 V100 GPUs and Bert Whole Word Masking uncased model to reach a F1 > 93 on SQuAD1.1:
-
-```bash
-python -m torch.distributed.launch --nproc_per_node=8 ./examples/run_squad.py \
-    --model_type bert \
-    --model_name_or_path bert-large-uncased-whole-word-masking \
-    --do_train \
-    --do_eval \
-    --do_lower_case \
-    --train_file $SQUAD_DIR/train-v1.1.json \
-    --predict_file $SQUAD_DIR/dev-v1.1.json \
-    --learning_rate 3e-5 \
-    --num_train_epochs 2 \
-    --max_seq_length 384 \
-    --doc_stride 128 \
-    --output_dir ./examples/models/wwm_uncased_finetuned_squad/ \
-    --per_gpu_eval_batch_size=3   \
-    --per_gpu_train_batch_size=3   \
-```
-
-Training with the previously defined hyper-parameters yields the following results:
-
-```bash
-f1 = 93.15
-exact_match = 86.91
-```
-
-This fine-tuned model is available as a checkpoint under the reference
-`bert-large-uncased-whole-word-masking-finetuned-squad`.
-
-#### Fine-tuning XLNet on SQuAD
-
-This example code fine-tunes XLNet on both SQuAD1.0 and SQuAD2.0 dataset. See above to download the data for SQuAD .
-
-##### Command for SQuAD1.0:
-
-```bash
-export SQUAD_DIR=/path/to/SQUAD
-
-python /data/home/hlu/transformers/examples/run_squad.py \
-    --model_type xlnet \
-    --model_name_or_path xlnet-large-cased \
-    --do_train \
-    --do_eval \
-    --do_lower_case \
-    --train_file /data/home/hlu/notebooks/NLP/examples/question_answering/train-v1.1.json \
-    --predict_file /data/home/hlu/notebooks/NLP/examples/question_answering/dev-v1.1.json \
-    --learning_rate 3e-5 \
-    --num_train_epochs 2 \
-    --max_seq_length 384 \
-    --doc_stride 128 \
-    --output_dir ./wwm_cased_finetuned_squad/ \
-    --per_gpu_eval_batch_size=4  \
-    --per_gpu_train_batch_size=4   \
-    --save_steps 5000
-```
-
-##### Command for SQuAD2.0:
-
-```bash
-export SQUAD_DIR=/path/to/SQUAD
-
-python run_squad.py \
-    --model_type xlnet \
-    --model_name_or_path xlnet-large-cased \
-    --do_train \
-    --do_eval \
-    --version_2_with_negative \
-    --train_file $SQUAD_DIR/train-v2.0.json \
-    --predict_file $SQUAD_DIR/dev-v2.0.json \
-    --learning_rate 3e-5 \
-    --num_train_epochs 4 \
-    --max_seq_length 384 \
-    --doc_stride 128 \
-    --output_dir ./wwm_cased_finetuned_squad/ \
-    --per_gpu_eval_batch_size=2  \
-    --per_gpu_train_batch_size=2   \
-    --save_steps 5000
-```
-
-Larger batch size may improve the performance while costing more memory.
-
-##### Results for SQuAD1.0 with the previously defined hyper-parameters:
-
-```python
-{
-"exact": 85.45884578997162,
-"f1": 92.5974600601065,
-"total": 10570,
-"HasAns_exact": 85.45884578997162,
-"HasAns_f1": 92.59746006010651,
-"HasAns_total": 10570
-}
-```
-
-##### Results for SQuAD2.0 with the previously defined hyper-parameters:
-
-```python
-{
-"exact": 80.4177545691906,
-"f1": 84.07154997729623,
-"total": 11873,
-"HasAns_exact": 76.73751686909581,
-"HasAns_f1": 84.05558584352873,
-"HasAns_total": 5928,
-"NoAns_exact": 84.0874684608915,
-"NoAns_f1": 84.0874684608915,
-"NoAns_total": 5945
-}
-```
-
-
-
-## Named Entity Recognition
-
-Based on the scripts [`run_ner.py`](https://github.com/huggingface/transformers/blob/master/examples/run_ner.py) for Pytorch and
-[`run_tf_ner.py`](https://github.com/huggingface/transformers/blob/master/examples/run_tf_ner.py) for Tensorflow 2.
-This example fine-tune Bert Multilingual on GermEval 2014 (German NER).
-Details and results for the fine-tuning provided by @stefan-it.
-
-### Data (Download and pre-processing steps)
-
-Data can be obtained from the [GermEval 2014](https://sites.google.com/site/germeval2014ner/data) shared task page.
-
-Here are the commands for downloading and pre-processing train, dev and test datasets. The original data format has four (tab-separated) columns, in a pre-processing step only the two relevant columns (token and outer span NER annotation) are extracted:
-
-```bash
-curl -L 'https://sites.google.com/site/germeval2014ner/data/NER-de-train.tsv?attredirects=0&d=1' \
-| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > train.txt.tmp
-curl -L 'https://sites.google.com/site/germeval2014ner/data/NER-de-dev.tsv?attredirects=0&d=1' \
-| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > dev.txt.tmp
-curl -L 'https://sites.google.com/site/germeval2014ner/data/NER-de-test.tsv?attredirects=0&d=1' \
-| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > test.txt.tmp
-```
-
-The GermEval 2014 dataset contains some strange "control character" tokens like `'\x96', '\u200e', '\x95', '\xad' or '\x80'`. One problem with these tokens is, that `BertTokenizer` returns an empty token for them, resulting in misaligned `InputExample`s. I wrote a script that a) filters these tokens and b) splits longer sentences into smaller ones (once the max. subtoken length is reached).
-
-```bash
-wget "https://raw.githubusercontent.com/stefan-it/fine-tuned-berts-seq/master/scripts/preprocess.py"
-```
-Let's define some variables that we need for further pre-processing steps and training the model:
-
-```bash
-export MAX_LENGTH=128
-export BERT_MODEL=bert-base-multilingual-cased
-```
-
-Run the pre-processing script on training, dev and test datasets:
-
-```bash
-python3 preprocess.py train.txt.tmp $BERT_MODEL $MAX_LENGTH > train.txt
-python3 preprocess.py dev.txt.tmp $BERT_MODEL $MAX_LENGTH > dev.txt
-python3 preprocess.py test.txt.tmp $BERT_MODEL $MAX_LENGTH > test.txt
-```
-
-The GermEval 2014 dataset has much more labels than CoNLL-2002/2003 datasets, so an own set of labels must be used:
-
-```bash
-cat train.txt dev.txt test.txt | cut -d " " -f 2 | grep -v "^$"| sort | uniq > labels.txt
-```
-
-### Prepare the run
-
-Additional environment variables must be set:
-
-```bash
-export OUTPUT_DIR=germeval-model
-export BATCH_SIZE=32
-export NUM_EPOCHS=3
-export SAVE_STEPS=750
-export SEED=1
-```
-
-### Run the Pytorch version
-
-To start training, just run:
-
-```bash
-python3 run_ner.py --data_dir ./ \
--model_type bert \
--labels ./labels.txt \
--model_name_or_path $BERT_MODEL \
--output_dir $OUTPUT_DIR \
--max_seq_length  $MAX_LENGTH \
--num_train_epochs $NUM_EPOCHS \
--per_gpu_train_batch_size $BATCH_SIZE \
--save_steps $SAVE_STEPS \
--seed $SEED \
--do_train \
--do_eval \
--do_predict
-```
-
-If your GPU supports half-precision training, just add the `--fp16` flag. After training, the model will be both evaluated on development and test datasets.
-
-#### Evaluation
-
-Evaluation on development dataset outputs the following for our example:
-
-```bash
-10/04/2019 00:42:06 - INFO - __main__ -   ***** Eval results  *****
-10/04/2019 00:42:06 - INFO - __main__ -     f1 = 0.8623348017621146
-10/04/2019 00:42:06 - INFO - __main__ -     loss = 0.07183869666975543
-10/04/2019 00:42:06 - INFO - __main__ -     precision = 0.8467916366258111
-10/04/2019 00:42:06 - INFO - __main__ -     recall = 0.8784592370979806
-```
-
-On the test dataset the following results could be achieved:
-
-```bash
-10/04/2019 00:42:42 - INFO - __main__ -   ***** Eval results  *****
-10/04/2019 00:42:42 - INFO - __main__ -     f1 = 0.8614389652384803
-10/04/2019 00:42:42 - INFO - __main__ -     loss = 0.07064602487454782
-10/04/2019 00:42:42 - INFO - __main__ -     precision = 0.8604651162790697
-10/04/2019 00:42:42 - INFO - __main__ -     recall = 0.8624150210424085
-```
-
-#### Comparing BERT (large, cased), RoBERTa (large, cased) and DistilBERT (base, uncased)
-
-Here is a small comparison between BERT (large, cased), RoBERTa (large, cased) and DistilBERT (base, uncased) with the same hyperparameters as specified in the [example documentation](https://huggingface.co/transformers/examples.html#named-entity-recognition) (one run):
-
-| Model | F-Score Dev | F-Score Test
-| --------------------------------- | ------- | --------
-| `bert-large-cased`            | 95.59 | 91.70
-| `roberta-large`                  | 95.96 | 91.87
-| `distilbert-base-uncased` | 94.34 | 90.32
-
-### Run the Tensorflow 2 version
-
-To start training, just run:
-
-```bash
-python3 run_tf_ner.py --data_dir ./ \
--model_type bert \
--labels ./labels.txt \
--model_name_or_path $BERT_MODEL \
--output_dir $OUTPUT_DIR \
--max_seq_length  $MAX_LENGTH \
--num_train_epochs $NUM_EPOCHS \
--per_device_train_batch_size $BATCH_SIZE \
--save_steps $SAVE_STEPS \
--seed $SEED \
--do_train \
--do_eval \
--do_predict
-```
-
-Such as the Pytorch version, if your GPU supports half-precision training, just add the `--fp16` flag. After training, the model will be both evaluated on development and test datasets.
-
-#### Evaluation
-
-Evaluation on development dataset outputs the following for our example:
-```bash
-           precision    recall  f1-score   support
-
- LOCderiv     0.7619    0.6154    0.6809        52
-  PERpart     0.8724    0.8997    0.8858      4057
-  OTHpart     0.9360    0.9466    0.9413       711
-  ORGpart     0.7015    0.6989    0.7002       269
-  LOCpart     0.7668    0.8488    0.8057       496
-      LOC     0.8745    0.9191    0.8963       235
- ORGderiv     0.7723    0.8571    0.8125        91
- OTHderiv     0.4800    0.6667    0.5581        18
-      OTH     0.5789    0.6875    0.6286        16
- PERderiv     0.5385    0.3889    0.4516        18
-      PER     0.5000    0.5000    0.5000         2
-      ORG     0.0000    0.0000    0.0000         3
-
-micro avg     0.8574    0.8862    0.8715      5968
-macro avg     0.8575    0.8862    0.8713      5968
-```
-
-On the test dataset the following results could be achieved:
-```bash
-           precision    recall  f1-score   support
-
-  PERpart     0.8847    0.8944    0.8896      9397
-  OTHpart     0.9376    0.9353    0.9365      1639
-  ORGpart     0.7307    0.7044    0.7173       697
-      LOC     0.9133    0.9394    0.9262       561
-  LOCpart     0.8058    0.8157    0.8107      1150
-      ORG     0.0000    0.0000    0.0000         8
- OTHderiv     0.5882    0.4762    0.5263        42
- PERderiv     0.6571    0.5227    0.5823        44
-      OTH     0.4906    0.6667    0.5652        39
- ORGderiv     0.7016    0.7791    0.7383       172
- LOCderiv     0.8256    0.6514    0.7282       109
-      PER     0.0000    0.0000    0.0000        11
-
-micro avg     0.8722    0.8774    0.8748     13869
-macro avg     0.8712    0.8774    0.8740     13869
-```
-
-## XNLI
-
-Based on the script [`run_xnli.py`](https://github.com/huggingface/transformers/blob/master/examples/run_xnli.py).
-
-[XNLI](https://www.nyu.edu/projects/bowman/xnli/) is crowd-sourced dataset based on [MultiNLI](http://www.nyu.edu/projects/bowman/multinli/). It is an evaluation benchmark for cross-lingual text representations. Pairs of text are labeled with textual entailment annotations for 15 different languages (including both high-ressource language such as English and low-ressource languages such as Swahili).
-
-#### Fine-tuning on XNLI
-
-This example code fine-tunes mBERT (multi-lingual BERT) on the XNLI dataset. It runs in 106 mins
-on a single tesla V100 16GB. The data for XNLI can be downloaded with the following links and should be both saved (and un-zipped) in a 
-`$XNLI_DIR` directory.
-
-* [XNLI 1.0](https://www.nyu.edu/projects/bowman/xnli/XNLI-1.0.zip)
-* [XNLI-MT 1.0](https://www.nyu.edu/projects/bowman/xnli/XNLI-MT-1.0.zip)
-
-```bash
-export XNLI_DIR=/path/to/XNLI
-
-python run_xnli.py \
-  --model_type bert \
-  --model_name_or_path bert-base-multilingual-cased \
-  --language de \
-  --train_language en \
-  --do_train \
-  --do_eval \
-  --data_dir $XNLI_DIR \
-  --per_gpu_train_batch_size 32 \
-  --learning_rate 5e-5 \
-  --num_train_epochs 2.0 \
-  --max_seq_length 128 \
-  --output_dir /tmp/debug_xnli/ \
-  --save_steps -1
-```
-
-Training with the previously defined hyper-parameters yields the following results on the **test** set:
-
-```bash
-acc = 0.7093812375249501
-```
-
-## MM-IMDb
-
-Based on the script [`run_mmimdb.py`](https://github.com/huggingface/transformers/blob/master/examples/mm-imdb/run_mmimdb.py).
-
-[MM-IMDb](http://lisi1.unal.edu.co/mmimdb/) is a Multimodal dataset with around 26,000 movies including images, plots and other metadata.
-
-### Training on MM-IMDb
-
-```
-python run_mmimdb.py \
-    --data_dir /path/to/mmimdb/dataset/ \
-    --model_type bert \
-    --model_name_or_path bert-base-uncased \
-    --output_dir /path/to/save/dir/ \
-    --do_train \
-    --do_eval \
-    --max_seq_len 512 \
-    --gradient_accumulation_steps 20 \
-    --num_image_embeds 3 \
-    --num_train_epochs 100 \
-    --patience 5
-```
-
-## Adversarial evaluation of model performances
-
-Here is an example on evaluating a model using adversarial evaluation of natural language inference with the Heuristic Analysis for NLI Systems (HANS) dataset [McCoy et al., 2019](https://arxiv.org/abs/1902.01007). The example was gracefully provided by [Nafise Sadat Moosavi](https://github.com/ns-moosavi).
-
-The HANS dataset can be downloaded from [this location](https://github.com/tommccoy1/hans).
-
-This is an example of using test_hans.py:
-
-```bash
-export HANS_DIR=path-to-hans
-export MODEL_TYPE=type-of-the-model-e.g.-bert-roberta-xlnet-etc
-export MODEL_PATH=path-to-the-model-directory-that-is-trained-on-NLI-e.g.-by-using-run_glue.py
-
-python examples/test_hans.py \
-        --task_name hans \
-        --model_type $MODEL_TYPE \
-        --do_eval \
-        --do_lower_case \
-        --data_dir $HANS_DIR \
-        --model_name_or_path $MODEL_PATH \
-        --max_seq_length 128 \
-        -output_dir $MODEL_PATH \
-```
-
-This will create the hans_predictions.txt file in MODEL_PATH, which can then be evaluated using hans/evaluate_heur_output.py from the HANS dataset.
-
-The results of the BERT-base model that is trained on MNLI using batch size 8 and the random seed 42 on the HANS dataset is as follows:
-
-```bash
-Heuristic entailed results:
-lexical_overlap: 0.9702
-subsequence: 0.9942
-constituent: 0.9962
-
-Heuristic non-entailed results:
-lexical_overlap: 0.199
-subsequence: 0.0396
-constituent: 0.118
-```
+Feedback and more use cases and benchmarks involving TPUs are welcome; please share them with the community.
--- a/examples/adversarial/README.md
+++ b/examples/adversarial/README.md
@@ -0,0 +1,38 @@
+## Adversarial evaluation of model performances
+
+Here is an example of evaluating a model using adversarial evaluation of natural language inference with the Heuristic Analysis for NLI Systems (HANS) dataset [McCoy et al., 2019](https://arxiv.org/abs/1902.01007). The example was graciously provided by [Nafise Sadat Moosavi](https://github.com/ns-moosavi).
+
+The HANS dataset can be downloaded from [this location](https://github.com/tommccoy1/hans).
+
+This is an example of using test_hans.py:
+
+```bash
+export HANS_DIR=path-to-hans
+export MODEL_TYPE=type-of-the-model-e.g.-bert-roberta-xlnet-etc
+export MODEL_PATH=path-to-the-model-directory-that-is-trained-on-NLI-e.g.-by-using-run_glue.py
+
+python examples/adversarial/test_hans.py \
+        --task_name hans \
+        --model_type $MODEL_TYPE \
+        --do_eval \
+        --data_dir $HANS_DIR \
+        --model_name_or_path $MODEL_PATH \
+        --max_seq_length 128 \
+        --output_dir $MODEL_PATH
+```
+
+This will create the hans_predictions.txt file in MODEL_PATH, which can then be evaluated using hans/evaluate_heur_output.py from the HANS dataset.
+
+The results of the BERT-base model that is trained on MNLI using batch size 8 and the random seed 42 on the HANS dataset are as follows:
+
+```bash
+Heuristic entailed results:
+lexical_overlap: 0.9702
+subsequence: 0.9942
+constituent: 0.9962
+
+Heuristic non-entailed results:
+lexical_overlap: 0.199
+subsequence: 0.0396
+constituent: 0.118
+```
--- a/examples/adversarial/hans_processors.py
+++ b/examples/adversarial/hans_processors.py
--- a/examples/adversarial/test_hans.py
+++ b/examples/adversarial/test_hans.py
@@ -65,13 +65,6 @@ except ImportError:

 logger = logging.getLogger(__name__)

-ALL_MODELS = sum(
-    (
-        tuple(conf.pretrained_config_archive_map.keys())
-        for conf in (BertConfig, XLNetConfig, XLMConfig, RobertaConfig, DistilBertConfig)
-    ),
-    (),
-)

 MODEL_CLASSES = {
    "bert": (BertConfig, BertForSequenceClassification, BertTokenizer),
@@ -255,7 +248,7 @@ def evaluate(args, model, tokenizer, prefix=""):
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

        # multi-gpu eval
-        if args.n_gpu > 1:
+        if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
            model = torch.nn.DataParallel(model)

        # Eval!
@@ -342,8 +335,8 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
            max_length=args.max_seq_length,
            output_mode=output_mode,
            pad_on_left=bool(args.model_type in ["xlnet"]),  # pad on the left for xlnet
-            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
-            pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
+            pad_token=tokenizer.pad_token_id,
+            pad_token_segment_id=tokenizer.pad_token_type_id,
        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
@@ -389,7 +382,7 @@ def main():
        default=None,
        type=str,
        required=True,
-        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
+        help="Path to pretrained model or model identifier from huggingface.co/models",
    )
    parser.add_argument(
        "--task_name",
@@ -520,7 +513,7 @@ def main():
    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        args.n_gpu = torch.cuda.device_count()
+        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
--- a/examples/adversarial/utils_hans.py
+++ b/examples/adversarial/utils_hans.py
--- a/examples/benchmarking/plot_csv_file.py
+++ b/examples/benchmarking/plot_csv_file.py
@@ -0,0 +1,113 @@
+import csv
+from collections import defaultdict
+from dataclasses import dataclass, field
+from typing import Optional
+
+import numpy as np
+
+import matplotlib.pyplot as plt
+from transformers import HfArgumentParser
+
+
+@dataclass
+class PlotArguments:
+    """
+    Arguments selecting the benchmark csv file to plot and controlling how the plot is drawn and saved.
+    """
+
+    csv_file: str = field(metadata={"help": "The csv file to plot."},)
+    plot_along_batch: bool = field(
+        default=False,
+        metadata={"help": "Whether to plot along batch size or sequence length. Defaults to sequence length."},
+    )
+    is_time: bool = field(
+        default=False,
+        metadata={"help": "Whether the csv file has time results or memory results. Defaults to memory results."},
+    )
+    is_train: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether the csv file has training results or inference results. Defaults to inference results."
+        },
+    )
+    figure_png_file: Optional[str] = field(
+        default=None, metadata={"help": "Filename under which the plot will be saved. If unused no plot is saved."},
+    )
+
+
+class Plot:
+    """Loads benchmark results from a csv file and plots them with matplotlib."""
+
+    def __init__(self, args):
+        """Read `args.csv_file` and group its rows by the value of the "model" column."""
+        self.args = args
+        self.result_dict = defaultdict(lambda: dict(bsz=[], seq_len=[], result={}))
+
+        with open(self.args.csv_file, newline="") as csv_file:
+            for row in csv.DictReader(csv_file):
+                bsz, seq_len = int(row["batch_size"]), int(row["sequence_length"])
+                model_results = self.result_dict[row["model"]]
+                model_results["bsz"].append(bsz)
+                model_results["seq_len"].append(seq_len)
+                model_results["result"][(bsz, seq_len)] = row["result"]  # raw csv string, parsed in `plot`
+
+    def plot(self):
+        """Plot one curve per model and inner-loop value; save to `figure_png_file` if set, else show."""
+        fig, ax = plt.subplots()
+        title_str = "Time usage" if self.args.is_time else "Memory usage"
+        title_str = title_str + " for training" if self.args.is_train else title_str + " for inference"
+
+        # Labels depend only on the arguments, so they are defined even if the csv file has no rows.
+        if self.args.plot_along_batch:
+            x_axis_label, inner_loop_label = "batch_size", "sequence_length in #tokens"
+        else:
+            x_axis_label, inner_loop_label = "sequence_length in #tokens", "batch_size"
+
+        for model_name in self.result_dict.keys():
+            batch_sizes = sorted(set(self.result_dict[model_name]["bsz"]))
+            sequence_lengths = sorted(set(self.result_dict[model_name]["seq_len"]))
+            results = self.result_dict[model_name]["result"]
+
+            (x_axis_array, inner_loop_array) = (
+                (batch_sizes, sequence_lengths) if self.args.plot_along_batch else (sequence_lengths, batch_sizes)
+            )
+
+            plt.xlim(min(x_axis_array), max(x_axis_array))
+            for inner_loop_value in inner_loop_array:
+                # Results are keyed by (batch_size, sequence_length); parse as float in both branches (time is fractional).
+                if self.args.plot_along_batch:
+                    y_axis_array = np.asarray([results[(x, inner_loop_value)] for x in x_axis_array], dtype=np.float32)
+                else:
+                    y_axis_array = np.asarray([results[(inner_loop_value, x)] for x in x_axis_array], dtype=np.float32)
+
+                ax.set_xscale("log", basex=2)
+                ax.set_yscale("log", basey=10)
+                x_axis_array = np.asarray(x_axis_array, int)
+                plt.scatter(x_axis_array, y_axis_array, label=f"{model_name} - {inner_loop_label}: {inner_loop_value}")
+                plt.plot(x_axis_array, y_axis_array, "--")
+
+            title_str += f" {model_name} vs."
+
+        title_str = title_str[:-4]  # drop the trailing " vs."
+        y_axis_label = "Time in s" if self.args.is_time else "Memory in MB"
+
+        plt.title(title_str)
+        plt.xlabel(x_axis_label)
+        plt.ylabel(y_axis_label)
+        plt.legend()
+
+        if self.args.figure_png_file is not None:
+            plt.savefig(self.args.figure_png_file)
+        else:
+            plt.show()
+
+
+def main():
+    """Parse `PlotArguments` from the command line and render the plot."""
+    # Entry point used when this file is executed as a script.
+    plot_args = HfArgumentParser(PlotArguments).parse_args_into_dataclasses()[0]
+    Plot(args=plot_args).plot()
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/benchmarking/run_benchmark.py
+++ b/examples/benchmarking/run_benchmark.py
@@ -0,0 +1,29 @@
+# coding=utf-8
+# Copyright 2018 The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Benchmarking the library on inference and training """
+
+from transformers import HfArgumentParser, PyTorchBenchmark, PyTorchBenchmarkArguments
+
+
+def main():
+    """Parse `PyTorchBenchmarkArguments` from the command line and run the benchmark."""
+    # Entry point used when this file is executed as a script.
+    benchmark_args = HfArgumentParser(PyTorchBenchmarkArguments).parse_args_into_dataclasses()[0]
+    PyTorchBenchmark(args=benchmark_args).run()
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/benchmarks.py
+++ b/examples/benchmarks.py
@@ -1,531 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Benchmarking the library on inference and training """
-
-# If checking the tensors placement
-# tf.debugging.set_log_device_placement(True)
-
-import argparse
-import csv
-import timeit
-from time import time
-from typing import List
-
-from transformers import AutoConfig, AutoTokenizer, is_tf_available, is_torch_available
-
-
-if is_tf_available():
-    import tensorflow as tf
-    from transformers import TFAutoModel
-
-if is_torch_available():
-    import torch
-    from transformers import AutoModel
-
-
-input_text = """Bent over their instruments, three hundred Fertilizers were plunged, as
-the Director of Hatcheries and Conditioning entered the room, in the
-
-
-
-scarcely breathing silence, the absent-minded, soliloquizing hum or
-whistle, of absorbed concentration. A troop of newly arrived students,
-very young, pink and callow, followed nervously, rather abjectly, at the
-Director's heels. Each of them carried a notebook, in which, whenever
-the great man spoke, he desperately scribbled. Straight from the
-horse's mouth. It was a rare privilege. The D. H. C. for Central London
-always made a point of personally conducting his new students round
-the various departments.
-
-"Just to give you a general idea," he would explain to them. For of
-course some sort of general idea they must have, if they were to do
-their work intelligently-though as little of one, if they were to be good
-and happy members of society, as possible. For particulars, as every
-one knows, make for virtue and happiness; generalities are intellectu-
-ally necessary evils. Not philosophers but fret-sawyers and stamp col-
-lectors compose the backbone of society.
-
-"To-morrow," he would add, smiling at them with a slightly menacing
-geniality, "you'll be settling down to serious work. You won't have time
-for generalities. Meanwhile ..."
-
-Meanwhile, it was a privilege. Straight from the horse's mouth into the
-notebook. The boys scribbled like mad.
-
-Tall and rather thin but upright, the Director advanced into the room.
-He had a long chin and big rather prominent teeth, just covered, when
-he was not talking, by his full, floridly curved lips. Old, young? Thirty?
-Fifty? Fifty-five? It was hard to say. And anyhow the question didn't
-arise; in this year of stability, A. F. 632, it didn't occur to you to ask it.
-
-"I shall begin at the beginning," said the D.H.C. and the more zealous
-students recorded his intention in their notebooks: Begin at the begin-
-ning. "These," he waved his hand, "are the incubators." And opening
-an insulated door he showed them racks upon racks of numbered test-
-tubes. "The week's supply of ova. Kept," he explained, "at blood heat;
-whereas the male gametes," and here he opened another door, "they
-have to be kept at thirty-five instead of thirty-seven. Full blood heat
-sterilizes." Rams wrapped in theremogene beget no lambs.
-
-Still leaning against the incubators he gave them, while the pencils
-scurried illegibly across the pages, a brief description of the modern
-
-
-
-fertilizing process; spoke first, of course, of its surgical introduc-
-tion-"the operation undergone voluntarily for the good of Society, not
-to mention the fact that it carries a bonus amounting to six months'
-salary"; continued with some account of the technique for preserving
-the excised ovary alive and actively developing; passed on to a consid-
-eration of optimum temperature, salinity, viscosity; referred to the liq-
-uor in which the detached and ripened eggs were kept; and, leading
-his charges to the work tables, actually showed them how this liquor
-was drawn off from the test-tubes; how it was let out drop by drop
-onto the specially warmed slides of the microscopes; how the eggs
-which it contained were inspected for abnormalities, counted and
-transferred to a porous receptacle; how (and he now took them to
-watch the operation) this receptacle was immersed in a warm bouillon
-containing free-swimming spermatozoa-at a minimum concentration
-of one hundred thousand per cubic centimetre, he insisted; and how,
-after ten minutes, the container was lifted out of the liquor and its
-contents re-examined; how, if any of the eggs remained unfertilized, it
-was again immersed, and, if necessary, yet again; how the fertilized
-ova went back to the incubators; where the Alphas and Betas re-
-mained until definitely bottled; while the Gammas, Deltas and Epsilons
-were brought out again, after only thirty-six hours, to undergo Bo-
-kanovsky's Process.
-
-"Bokanovsky's Process," repeated the Director, and the students un-
-derlined the words in their little notebooks.
-
-One egg, one embryo, one adult-normality. But a bokanovskified egg
-will bud, will proliferate, will divide. From eight to ninety-six buds, and
-every bud will grow into a perfectly formed embryo, and every embryo
-into a full-sized adult. Making ninety-six human beings grow where
-only one grew before. Progress.
-
-"Essentially," the D.H.C. concluded, "bokanovskification consists of a
-series of arrests of development. We check the normal growth and,
-paradoxically enough, the egg responds by budding."
-
-Responds by budding. The pencils were busy.
-
-He pointed. On a very slowly moving band a rack-full of test-tubes was
-entering a large metal box, another, rack-full was emerging. Machinery
-faintly purred. It took eight minutes for the tubes to go through, he
-
-
-
-told them. Eight minutes of hard X-rays being about as much as an
-egg can stand. A few died; of the rest, the least susceptible divided
-into two; most put out four buds; some eight; all were returned to the
-incubators, where the buds began to develop; then, after two days,
-were suddenly chilled, chilled and checked. Two, four, eight, the buds
-in their turn budded; and having budded were dosed almost to death
-with alcohol; consequently burgeoned again and having budded-bud
-out of bud out of bud-were thereafter-further arrest being generally
-fatal-left to develop in peace. By which time the original egg was in a
-fair way to becoming anything from eight to ninety-six embryos- a
-prodigious improvement, you will agree, on nature. Identical twins-but
-not in piddling twos and threes as in the old viviparous days, when an
-egg would sometimes accidentally divide; actually by dozens, by
-scores at a time.
-
-"Scores," the Director repeated and flung out his arms, as though he
-were distributing largesse. "Scores."
-
-But one of the students was fool enough to ask where the advantage
-lay.
-
-"My good boy!" The Director wheeled sharply round on him. "Can't you
-see? Can't you see?" He raised a hand; his expression was solemn.
-"Bokanovsky's Process is one of the major instruments of social stabil-
-ity!"
-
-Major instruments of social stability.
-
-Standard men and women; in uniform batches. The whole of a small
-factory staffed with the products of a single bokanovskified egg.
-
-"Ninety-six identical twins working ninety-six identical machines!" The
-voice was almost tremulous with enthusiasm. "You really know where
-you are. For the first time in history." He quoted the planetary motto.
-"Community, Identity, Stability." Grand words. "If we could bo-
-kanovskify indefinitely the whole problem would be solved."
-
-Solved by standard Gammas, unvarying Deltas, uniform Epsilons. Mil-
-lions of identical twins. The principle of mass production at last applied
-to biology.
-
-
-
-"But, alas," the Director shook his head, "we can't bokanovskify indefi-
-nitely."
-
-Ninety-six seemed to be the limit; seventy-two a good average. From
-the same ovary and with gametes of the same male to manufacture as
-many batches of identical twins as possible-that was the best (sadly a
-second best) that they could do. And even that was difficult.
-
-"For in nature it takes thirty years for two hundred eggs to reach ma-
-turity. But our business is to stabilize the population at this moment,
-here and now. Dribbling out twins over a quarter of a century-what
-would be the use of that?"
-
-Obviously, no use at all. But Podsnap's Technique had immensely ac-
-celerated the process of ripening. They could make sure of at least a
-hundred and fifty mature eggs within two years. Fertilize and bo-
-kanovskify-in other words, multiply by seventy-two-and you get an
-average of nearly eleven thousand brothers and sisters in a hundred
-and fifty batches of identical twins, all within two years of the same
-age.
-
-"And in exceptional cases we can make one ovary yield us over fifteen
-thousand adult individuals."
-
-Beckoning to a fair-haired, ruddy young man who happened to be
-passing at the moment. "Mr. Foster," he called. The ruddy young man
-approached. "Can you tell us the record for a single ovary, Mr. Foster?"
-
-"Sixteen thousand and twelve in this Centre," Mr. Foster replied with-
-out hesitation. He spoke very quickly, had a vivacious blue eye, and
-took an evident pleasure in quoting figures. "Sixteen thousand and
-twelve; in one hundred and eighty-nine batches of identicals. But of
-course they've done much better," he rattled on, "in some of the tropi-
-cal Centres. Singapore has often produced over sixteen thousand five
-hundred; and Mombasa has actually touched the seventeen thousand
-mark. But then they have unfair advantages. You should see the way a
-negro ovary responds to pituitary! It's quite astonishing, when you're
-used to working with European material. Still," he added, with a laugh
-(but the light of combat was in his eyes and the lift of his chin was
-challenging), "still, we mean to beat them if we can. I'm working on a
-wonderful Delta-Minus ovary at this moment. Only just eighteen
-
-
-
-months old. Over twelve thousand seven hundred children already, ei-
-ther decanted or in embryo. And still going strong. We'll beat them
-yet."
-
-"That's the spirit I like!" cried the Director, and clapped Mr. Foster on
-the shoulder. "Come along with us, and give these boys the benefit of
-your expert knowledge."
-
-Mr. Foster smiled modestly. "With pleasure." They went.
-In the Bottling Room all was harmonious bustle and ordered activity.
-Flaps of fresh sow's peritoneum ready cut to the proper size came
-shooting up in little lifts from the Organ Store in the sub-basement.
-Whizz and then, click! the lift-hatches hew open; the bottle-liner had
-only to reach out a hand, take the flap, insert, smooth-down, and be-
-fore the lined bottle had had time to travel out of reach along the end-
-less band, whizz, click! another flap of peritoneum had shot up from
-the depths, ready to be slipped into yet another bottle, the next of that
-slow interminable procession on the band.
-
-Next to the Liners stood the Matriculators. The procession advanced;
-one by one the eggs were transferred from their test-tubes to the
-larger containers; deftly the peritoneal lining was slit, the morula
-dropped into place, the saline solution poured in ... and already the
-bottle had passed, and it was the turn of the labellers. Heredity, date
-of fertilization, membership of Bokanovsky Group-details were trans-
-ferred from test-tube to bottle. No longer anonymous, but named,
-identified, the procession marched slowly on; on through an opening in
-the wall, slowly on into the Social Predestination Room.
-"Eighty-eight cubic metres of card-index," said Mr. Foster with relish,
-as they entered."""
-
-
-def create_setup_and_compute(
-    model_names: List[str],
-    gpu: bool = True,
-    tensorflow: bool = False,
-    average_over: int = 3,
-    torchscript: bool = False,
-    xla: bool = False,
-    amp: bool = False,
-    fp16: bool = False,
-    save_to_csv: bool = False,
-    csv_filename: str = f"results_{round(time())}.csv",
-):
-    if xla:
-        tf.config.optimizer.set_jit(True)
-    if amp:
-        tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})
-
-    if tensorflow:
-        dictionary = {model_name: {} for model_name in model_names}
-        results = _compute_tensorflow(model_names, dictionary, average_over, amp)
-    else:
-        device = "cuda" if (gpu and torch.cuda.is_available()) else "cpu"
-        dictionary = {model_name: {} for model_name in model_names}
-        results = _compute_pytorch(model_names, dictionary, average_over, device, torchscript, fp16)
-
-    print("=========== RESULTS ===========")
-    for model_name in model_names:
-        print("\t" + f"======= MODEL CHECKPOINT: {model_name} =======")
-        for batch_size in results[model_name]["bs"]:
-            print("\t\t" + f"===== BATCH SIZE: {batch_size} =====")
-            for slice_size in results[model_name]["ss"]:
-                result = results[model_name]["results"][batch_size][slice_size]
-                if isinstance(result, str):
-                    print(f"\t\t{model_name}/{batch_size}/{slice_size}: " f"{result}")
-                else:
-                    print(f"\t\t{model_name}/{batch_size}/{slice_size}: " f"{(round(1000 * result) / 1000)}" f"s")
-
-    if save_to_csv:
-        with open(csv_filename, mode="w") as csv_file:
-            fieldnames = [
-                "model",
-                "1x8",
-                "1x64",
-                "1x128",
-                "1x256",
-                "1x512",
-                "1x1024",
-                "2x8",
-                "2x64",
-                "2x128",
-                "2x256",
-                "2x512",
-                "2x1024",
-                "4x8",
-                "4x64",
-                "4x128",
-                "4x256",
-                "4x512",
-                "4x1024",
-                "8x8",
-                "8x64",
-                "8x128",
-                "8x256",
-                "8x512",
-                "8x1024",
-            ]
-
-            writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
-            writer.writeheader()
-
-            for model_name in model_names:
-                model_results = {
-                    f"{bs}x{ss}": results[model_name]["results"][bs][ss]
-                    for bs in results[model_name]["results"]
-                    for ss in results[model_name]["results"][bs]
-                }
-                writer.writerow({"model": model_name, **model_results})
-
-
-def _compute_pytorch(model_names, dictionary, average_over, device, torchscript, fp16):
-    for c, model_name in enumerate(model_names):
-        print(f"{c + 1} / {len(model_names)}")
-        config = AutoConfig.from_pretrained(model_name, torchscript=torchscript)
-        model = AutoModel.from_pretrained(model_name, config=config)
-        tokenizer = AutoTokenizer.from_pretrained(model_name)
-
-        tokenized_sequence = tokenizer.encode(input_text, add_special_tokens=False)
-
-        max_input_size = tokenizer.max_model_input_sizes[model_name]
-        batch_sizes = [1, 2, 4, 8]
-        slice_sizes = [8, 64, 128, 256, 512, 1024]
-
-        dictionary[model_name] = {"bs": batch_sizes, "ss": slice_sizes, "results": {}}
-        dictionary[model_name]["results"] = {i: {} for i in batch_sizes}
-
-        for batch_size in batch_sizes:
-            if fp16:
-                model.half()
-            model.to(device)
-            model.eval()
-            for slice_size in slice_sizes:
-                if max_input_size is not None and slice_size > max_input_size:
-                    dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
-                else:
-                    sequence = torch.tensor(tokenized_sequence[:slice_size], device=device).repeat(batch_size, 1)
-                    try:
-                        if torchscript:
-                            print("Tracing model with sequence size", sequence.shape)
-                            inference = torch.jit.trace(model, sequence)
-                            inference(sequence)
-                        else:
-                            inference = model
-                            inference(sequence)
-
-                        print("Going through model with sequence of shape", sequence.shape)
-                        runtimes = timeit.repeat(lambda: inference(sequence), repeat=average_over, number=3)
-                        average_time = sum(runtimes) / float(len(runtimes)) / 3.0
-                        dictionary[model_name]["results"][batch_size][slice_size] = average_time
-                    except RuntimeError as e:
-                        print("Doesn't fit on GPU.", e)
-                        torch.cuda.empty_cache()
-                        dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
-    return dictionary
-
-
-def _compute_tensorflow(model_names, dictionary, average_over, amp):
-    for c, model_name in enumerate(model_names):
-        print(f"{c + 1} / {len(model_names)}")
-        config = AutoConfig.from_pretrained(model_name)
-        model = TFAutoModel.from_pretrained(model_name, config=config)
-        tokenizer = AutoTokenizer.from_pretrained(model_name)
-
-        tokenized_sequence = tokenizer.encode(input_text, add_special_tokens=False)
-
-        max_input_size = tokenizer.max_model_input_sizes[model_name]
-        batch_sizes = [1, 2, 4, 8]
-        slice_sizes = [8, 64, 128, 256, 512, 1024]
-
-        dictionary[model_name] = {"bs": batch_sizes, "ss": slice_sizes, "results": {}}
-        dictionary[model_name]["results"] = {i: {} for i in batch_sizes}
-
-        print("Using model", model)
-
-        @tf.function
-        def inference(inputs):
-            return model(inputs)
-
-        for batch_size in batch_sizes:
-            for slice_size in slice_sizes:
-                if max_input_size is not None and slice_size > max_input_size:
-                    dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
-                else:
-                    sequence = tf.stack(
-                        [tf.squeeze(tf.constant(tokenized_sequence[:slice_size])[None, :])] * batch_size
-                    )
-
-                    try:
-                        print("Going through model with sequence of shape", sequence.shape)
-                        # To make sure that the model is traced + that the tensors are on the appropriate device
-                        inference(sequence)
-
-                        runtimes = timeit.repeat(lambda: inference(sequence), repeat=average_over, number=3)
-                        average_time = sum(runtimes) / float(len(runtimes)) / 3.0
-                        dictionary[model_name]["results"][batch_size][slice_size] = average_time
-                    except tf.errors.ResourceExhaustedError as e:
-                        print("Doesn't fit on GPU.", e)
-                        torch.cuda.empty_cache()
-                        dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
-    return dictionary
-
-
-def main():
-    parser = argparse.ArgumentParser()
-
-    parser.add_argument(
-        "--models",
-        required=False,
-        type=str,
-        default="all",
-        help="Model checkpoints to be provided "
-        "to the AutoModel classes. Leave "
-        "blank to benchmark the base version "
-        "of all available model "
-        "architectures.",
-    )
-    parser.add_argument(
-        "--torch", required=False, action="store_true", help="Benchmark the Pytorch version of the " "models"
-    )
-    parser.add_argument(
-        "--torch_cuda", required=False, action="store_true", help="Pytorch only: run on available " "cuda devices"
-    )
-    parser.add_argument(
-        "--torchscript",
-        required=False,
-        action="store_true",
-        help="Pytorch only: trace the models " "using torchscript",
-    )
-    parser.add_argument(
-        "--tensorflow",
-        required=False,
-        action="store_true",
-        help="Benchmark the TensorFlow version "
-        "of the models. Will run on GPU if "
-        "the correct dependencies are "
-        "installed",
-    )
-    parser.add_argument("--xla", required=False, action="store_true", help="TensorFlow only: use XLA acceleration.")
-    parser.add_argument(
-        "--amp",
-        required=False,
-        action="store_true",
-        help="TensorFlow only: use automatic mixed precision acceleration.",
-    )
-    parser.add_argument(
-        "--fp16", required=False, action="store_true", help="PyTorch only: use FP16 to accelerate inference."
-    )
-    parser.add_argument(
-        "--keras_predict",
-        required=False,
-        action="store_true",
-        help="Whether to use model.predict " "instead of model() to do a " "forward pass.",
-    )
-    parser.add_argument("--save_to_csv", required=False, action="store_true", help="Save to a CSV file.")
-    parser.add_argument(
-        "--csv_filename", required=False, default=None, help="CSV filename used if saving results to csv."
-    )
-    parser.add_argument(
-        "--average_over", required=False, default=30, type=int, help="Times an experiment will be run."
-    )
-
-    args = parser.parse_args()
-    if args.models == "all":
-        args.models = [
-            "gpt2",
-            "bert-base-cased",
-            "xlnet-base-cased",
-            "xlm-mlm-en-2048",
-            "transfo-xl-wt103",
-            "openai-gpt",
-            "distilbert-base-uncased",
-            "distilgpt2",
-            "roberta-base",
-            "ctrl",
-        ]
-    else:
-        args.models = args.models.split()
-
-    print("Running with arguments", args)
-
-    if args.torch:
-        if is_torch_available():
-            create_setup_and_compute(
-                model_names=args.models,
-                tensorflow=False,
-                gpu=args.torch_cuda,
-                torchscript=args.torchscript,
-                fp16=args.fp16,
-                save_to_csv=args.save_to_csv,
-                csv_filename=args.csv_filename,
-                average_over=args.average_over,
-            )
-        else:
-            raise ImportError("Trying to run a PyTorch benchmark but PyTorch was not found in the environment.")
-
-    if args.tensorflow:
-        if is_tf_available():
-            create_setup_and_compute(
-                model_names=args.models,
-                tensorflow=True,
-                xla=args.xla,
-                amp=args.amp,
-                save_to_csv=args.save_to_csv,
-                csv_filename=args.csv_filename,
-                average_over=args.average_over,
-            )
-        else:
-            raise ImportError("Trying to run a TensorFlow benchmark but TensorFlow was not found in the environment.")
-
-
-if __name__ == "__main__":
-    main()
--- a/examples/bertology/run_bertology.py
+++ b/examples/bertology/run_bertology.py
@@ -30,10 +30,17 @@ from torch.utils.data import DataLoader, SequentialSampler, Subset
 from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm

-from run_glue import ALL_MODELS, MODEL_CLASSES, load_and_cache_examples, set_seed
-from transformers import glue_compute_metrics as compute_metrics
-from transformers import glue_output_modes as output_modes
-from transformers import glue_processors as processors
+from transformers import (
+    AutoConfig,
+    AutoModelForSequenceClassification,
+    AutoTokenizer,
+    DefaultDataCollator,
+    GlueDataset,
+    glue_compute_metrics,
+    glue_output_modes,
+    glue_processors,
+    set_seed,
+)


 logger = logging.getLogger(__name__)
@@ -57,32 +64,35 @@ def print_2d_tensor(tensor):


 def compute_heads_importance(
-    args, model, eval_dataloader, compute_entropy=True, compute_importance=True, head_mask=None
+    args, model, eval_dataloader, compute_entropy=True, compute_importance=True, head_mask=None, actually_pruned=False
 ):
    """ This method shows how to compute:
        - head attention entropy
        - head importance scores according to http://arxiv.org/abs/1905.10650
    """
    # Prepare our tensors
-    n_layers, n_heads = model.bert.config.num_hidden_layers, model.bert.config.num_attention_heads
+    n_layers, n_heads = model.config.num_hidden_layers, model.config.num_attention_heads
    head_importance = torch.zeros(n_layers, n_heads).to(args.device)
    attn_entropy = torch.zeros(n_layers, n_heads).to(args.device)

    if head_mask is None:
        head_mask = torch.ones(n_layers, n_heads).to(args.device)
+
    head_mask.requires_grad_(requires_grad=True)
+    # If actually pruned attention multi-head, set head mask to None to avoid shape mismatch
+    if actually_pruned:
+        head_mask = None
+
    preds = None
    labels = None
    tot_tokens = 0.0

-    for step, batch in enumerate(tqdm(eval_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
-        batch = tuple(t.to(args.device) for t in batch)
-        input_ids, input_mask, segment_ids, label_ids = batch
+    for step, inputs in enumerate(tqdm(eval_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
+        for k, v in inputs.items():
+            inputs[k] = v.to(args.device)

        # Do a forward pass (not with torch.no_grad() since we need gradients for importance score - see below)
-        outputs = model(
-            input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids, head_mask=head_mask
-        )
+        outputs = model(**inputs, head_mask=head_mask)
        loss, logits, all_attentions = (
            outputs[0],
            outputs[1],
@@ -92,7 +102,7 @@ def compute_heads_importance(

        if compute_entropy:
            for layer, attn in enumerate(all_attentions):
-                masked_entropy = entropy(attn.detach()) * input_mask.float().unsqueeze(1)
+                masked_entropy = entropy(attn.detach()) * inputs["attention_mask"].float().unsqueeze(1)
                attn_entropy[layer] += masked_entropy.sum(-1).sum(0).detach()

        if compute_importance:
@@ -101,12 +111,12 @@ def compute_heads_importance(
        # Also store our logits/labels if we want to compute metrics afterwards
        if preds is None:
            preds = logits.detach().cpu().numpy()
-            labels = label_ids.detach().cpu().numpy()
+            labels = inputs["labels"].detach().cpu().numpy()
        else:
            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
-            labels = np.append(labels, label_ids.detach().cpu().numpy(), axis=0)
+            labels = np.append(labels, inputs["labels"].detach().cpu().numpy(), axis=0)

-        tot_tokens += input_mask.float().detach().sum().data
+        tot_tokens += inputs["attention_mask"].float().detach().sum().data

    # Normalize
    attn_entropy /= tot_tokens
@@ -145,7 +155,7 @@ def mask_heads(args, model, eval_dataloader):
    """
    _, head_importance, preds, labels = compute_heads_importance(args, model, eval_dataloader, compute_entropy=False)
    preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
-    original_score = compute_metrics(args.task_name, preds, labels)[args.metric_name]
+    original_score = glue_compute_metrics(args.task_name, preds, labels)[args.metric_name]
    logger.info("Pruning: original score: %f, threshold: %f", original_score, original_score * args.masking_threshold)

    new_head_mask = torch.ones_like(head_importance)
@@ -167,6 +177,7 @@ def mask_heads(args, model, eval_dataloader):
        new_head_mask = new_head_mask.view(-1)
        new_head_mask[current_heads_to_mask] = 0.0
        new_head_mask = new_head_mask.view_as(head_mask)
+        new_head_mask = new_head_mask.clone().detach()
        print_2d_tensor(new_head_mask)

        # Compute metric and head importance again
@@ -174,9 +185,9 @@ def mask_heads(args, model, eval_dataloader):
            args, model, eval_dataloader, compute_entropy=False, head_mask=new_head_mask
        )
        preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
-        current_score = compute_metrics(args.task_name, preds, labels)[args.metric_name]
+        current_score = glue_compute_metrics(args.task_name, preds, labels)[args.metric_name]
        logger.info(
-            "Masking: current score: %f, remaning heads %d (%.1f percents)",
+            "Masking: current score: %f, remaining heads %d (%.1f percents)",
            current_score,
            new_head_mask.sum(),
            new_head_mask.sum() / new_head_mask.numel() * 100,
@@ -200,21 +211,30 @@ def prune_heads(args, model, eval_dataloader, head_mask):
        args, model, eval_dataloader, compute_entropy=False, compute_importance=False, head_mask=head_mask
    )
    preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
-    score_masking = compute_metrics(args.task_name, preds, labels)[args.metric_name]
+    score_masking = glue_compute_metrics(args.task_name, preds, labels)[args.metric_name]
    original_time = datetime.now() - before_time

    original_num_params = sum(p.numel() for p in model.parameters())
-    heads_to_prune = dict((layer, (1 - head_mask[layer].long()).nonzero().tolist()) for layer in range(len(head_mask)))
+    heads_to_prune = dict(
+        (layer, (1 - head_mask[layer].long()).nonzero().squeeze().tolist()) for layer in range(len(head_mask))
+    )
+
    assert sum(len(h) for h in heads_to_prune.values()) == (1 - head_mask.long()).sum().item()
    model.prune_heads(heads_to_prune)
    pruned_num_params = sum(p.numel() for p in model.parameters())

    before_time = datetime.now()
    _, _, preds, labels = compute_heads_importance(
-        args, model, eval_dataloader, compute_entropy=False, compute_importance=False, head_mask=None
+        args,
+        model,
+        eval_dataloader,
+        compute_entropy=False,
+        compute_importance=False,
+        head_mask=None,
+        actually_pruned=True,
    )
    preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
-    score_pruning = compute_metrics(args.task_name, preds, labels)[args.metric_name]
+    score_pruning = glue_compute_metrics(args.task_name, preds, labels)[args.metric_name]
    new_time = datetime.now() - before_time

    logger.info(
@@ -242,14 +262,14 @@ def main():
        default=None,
        type=str,
        required=True,
-        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
+        help="Path to pretrained model or model identifier from huggingface.co/models",
    )
    parser.add_argument(
        "--task_name",
        default=None,
        type=str,
        required=True,
-        help="The name of the task to train selected in the list: " + ", ".join(processors.keys()),
+        help="The name of the task to train selected in the list: " + ", ".join(glue_processors.keys()),
    )
    parser.add_argument(
        "--output_dir",
@@ -274,7 +294,7 @@ def main():
    )
    parser.add_argument(
        "--cache_dir",
-        default="",
+        default=None,
        type=str,
        help="Where do you want to store the pre-trained models downloaded from s3",
    )
@@ -338,7 +358,7 @@ def main():
    # Setup devices and distributed training
    if args.local_rank == -1 or args.no_cuda:
        args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        args.n_gpu = torch.cuda.device_count()
+        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
@@ -350,48 +370,40 @@ def main():
    logger.info("device: {} n_gpu: {}, distributed: {}".format(args.device, args.n_gpu, bool(args.local_rank != -1)))

    # Set seeds
-    set_seed(args)
+    set_seed(args.seed)

    # Prepare GLUE task
    args.task_name = args.task_name.lower()
-    if args.task_name not in processors:
+    if args.task_name not in glue_processors:
        raise ValueError("Task not found: %s" % (args.task_name))
-    processor = processors[args.task_name]()
-    args.output_mode = output_modes[args.task_name]
+    processor = glue_processors[args.task_name]()
+    args.output_mode = glue_output_modes[args.task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
-    if args.local_rank not in [-1, 0]:
-        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
+    #
+    # Distributed training:
+    # The .from_pretrained methods guarantee that only one local process can concurrently
+    # download model & vocab.

-    args.model_type = ""
-    for key in MODEL_CLASSES:
-        if key in args.model_name_or_path.lower():
-            args.model_type = key  # take the first match in model types
-            break
-    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
-    config = config_class.from_pretrained(
+    config = AutoConfig.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=args.task_name,
        output_attentions=True,
-        cache_dir=args.cache_dir if args.cache_dir else None,
+        cache_dir=args.cache_dir,
    )
-    tokenizer = tokenizer_class.from_pretrained(
-        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
-        cache_dir=args.cache_dir if args.cache_dir else None,
+    tokenizer = AutoTokenizer.from_pretrained(
+        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, cache_dir=args.cache_dir,
    )
-    model = model_class.from_pretrained(
+    model = AutoModelForSequenceClassification.from_pretrained(
        args.model_name_or_path,
        from_tf=bool(".ckpt" in args.model_name_or_path),
        config=config,
-        cache_dir=args.cache_dir if args.cache_dir else None,
+        cache_dir=args.cache_dir,
    )

-    if args.local_rank == 0:
-        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
-
    # Distributed and parallel training
    model.to(args.device)
    if args.local_rank != -1:
@@ -402,15 +414,18 @@ def main():
        model = torch.nn.DataParallel(model)

    # Print/save training arguments
+    os.makedirs(args.output_dir, exist_ok=True)
    torch.save(args, os.path.join(args.output_dir, "run_args.bin"))
    logger.info("Training/evaluation parameters %s", args)

    # Prepare dataset for the GLUE task
-    eval_data = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=True)
+    eval_dataset = GlueDataset(args, tokenizer=tokenizer, mode="dev")
    if args.data_subset > 0:
-        eval_data = Subset(eval_data, list(range(min(args.data_subset, len(eval_data)))))
-    eval_sampler = SequentialSampler(eval_data) if args.local_rank == -1 else DistributedSampler(eval_data)
-    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size)
+        eval_dataset = Subset(eval_dataset, list(range(min(args.data_subset, len(eval_dataset)))))
+    eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
+    eval_dataloader = DataLoader(
+        eval_dataset, sampler=eval_sampler, batch_size=args.batch_size, collate_fn=DefaultDataCollator().collate_batch
+    )

    # Compute head entropy and importance score
    compute_heads_importance(args, model, eval_dataloader)
--- a/examples/contrib/mm-imdb/README.md
+++ b/examples/contrib/mm-imdb/README.md
@@ -0,0 +1,23 @@
+## MM-IMDb
+
+Based on the script [`run_mmimdb.py`](https://github.com/huggingface/transformers/blob/master/examples/contrib/mm-imdb/run_mmimdb.py).
+
+[MM-IMDb](http://lisi1.unal.edu.co/mmimdb/) is a Multimodal dataset with around 26,000 movies including images, plots and other metadata.
+
+### Training on MM-IMDb
+
+```
+python run_mmimdb.py \
+    --data_dir /path/to/mmimdb/dataset/ \
+    --model_type bert \
+    --model_name_or_path bert-base-uncased \
+    --output_dir /path/to/save/dir/ \
+    --do_train \
+    --do_eval \
+    --max_seq_len 512 \
+    --gradient_accumulation_steps 20 \
+    --num_image_embeds 3 \
+    --num_train_epochs 100 \
+    --patience 5
+```
+
--- a/examples/contrib/mm-imdb/run_mmimdb.py
+++ b/examples/contrib/mm-imdb/run_mmimdb.py
@@ -34,26 +34,11 @@ from tqdm import tqdm, trange
 from transformers import (
    WEIGHTS_NAME,
    AdamW,
-    AlbertConfig,
-    AlbertModel,
-    AlbertTokenizer,
-    BertConfig,
-    BertModel,
-    BertTokenizer,
-    DistilBertConfig,
-    DistilBertModel,
-    DistilBertTokenizer,
+    AutoConfig,
+    AutoModel,
+    AutoTokenizer,
    MMBTConfig,
    MMBTForClassification,
-    RobertaConfig,
-    RobertaModel,
-    RobertaTokenizer,
-    XLMConfig,
-    XLMModel,
-    XLMTokenizer,
-    XLNetConfig,
-    XLNetModel,
-    XLNetTokenizer,
    get_linear_schedule_with_warmup,
 )
 from utils_mmimdb import ImageEncoder, JsonlDataset, collate_fn, get_image_transforms, get_mmimdb_labels
@@ -67,23 +52,6 @@ except ImportError:

 logger = logging.getLogger(__name__)

-ALL_MODELS = sum(
-    (
-        tuple(conf.pretrained_config_archive_map.keys())
-        for conf in (BertConfig, XLNetConfig, XLMConfig, RobertaConfig, DistilBertConfig)
-    ),
-    (),
-)
-
-MODEL_CLASSES = {
-    "bert": (BertConfig, BertModel, BertTokenizer),
-    "xlnet": (XLNetConfig, XLNetModel, XLNetTokenizer),
-    "xlm": (XLMConfig, XLMModel, XLMTokenizer),
-    "roberta": (RobertaConfig, RobertaModel, RobertaTokenizer),
-    "distilbert": (DistilBertConfig, DistilBertModel, DistilBertTokenizer),
-    "albert": (AlbertConfig, AlbertModel, AlbertTokenizer),
-}
-

 def set_seed(args):
    random.seed(args.seed)
@@ -278,7 +246,7 @@ def evaluate(args, model, tokenizer, criterion, prefix=""):
    )

    # multi-gpu eval
-    if args.n_gpu > 1:
+    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
@@ -351,19 +319,12 @@ def main():
        required=True,
        help="The input data dir. Should contain the .jsonl files for MMIMDB.",
    )
-    parser.add_argument(
-        "--model_type",
-        default=None,
-        type=str,
-        required=True,
-        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
-    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
-        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
+        help="Path to pretrained model or model identifier from huggingface.co/models",
    )
    parser.add_argument(
        "--output_dir",
@@ -385,7 +346,7 @@ def main():
    )
    parser.add_argument(
        "--cache_dir",
-        default="",
+        default=None,
        type=str,
        help="Where do you want to store the pre-trained models downloaded from s3",
    )
@@ -492,7 +453,7 @@ def main():
    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        args.n_gpu = torch.cuda.device_count()
+        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
@@ -526,18 +487,14 @@ def main():
    # Setup model
    labels = get_mmimdb_labels()
    num_labels = len(labels)
-    args.model_type = args.model_type.lower()
-    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
-    transformer_config = config_class.from_pretrained(
-        args.config_name if args.config_name else args.model_name_or_path
-    )
-    tokenizer = tokenizer_class.from_pretrained(
+    transformer_config = AutoConfig.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
+    tokenizer = AutoTokenizer.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
-        cache_dir=args.cache_dir if args.cache_dir else None,
+        cache_dir=args.cache_dir,
    )
-    transformer = model_class.from_pretrained(
-        args.model_name_or_path, config=transformer_config, cache_dir=args.cache_dir if args.cache_dir else None
+    transformer = AutoModel.from_pretrained(
+        args.model_name_or_path, config=transformer_config, cache_dir=args.cache_dir
    )
    img_encoder = ImageEncoder(args)
    config = MMBTConfig(transformer_config, num_labels=num_labels)
@@ -583,13 +540,12 @@ def main():
        # Load a trained model and vocabulary that you have fine-tuned
        model = MMBTForClassification(config, transformer, img_encoder)
        model.load_state_dict(torch.load(os.path.join(args.output_dir, WEIGHTS_NAME)))
-        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
+        tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
        model.to(args.device)

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
-        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
--- a/examples/contrib/mm-imdb/utils_mmimdb.py
+++ b/examples/contrib/mm-imdb/utils_mmimdb.py
--- a/examples/contrib/run_openai_gpt.py
+++ b/examples/contrib/run_openai_gpt.py
@@ -249,8 +249,8 @@ def main():
                losses = model(input_ids, mc_token_ids=mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels)
                loss = args.lm_coef * losses[0] + losses[1]
                loss.backward()
-                scheduler.step()
                optimizer.step()
+                scheduler.step()
                optimizer.zero_grad()
                tr_loss += loss.item()
                exp_average_loss = (
--- a/examples/contrib/run_swag.py
+++ b/examples/contrib/run_swag.py
@@ -31,14 +31,8 @@ from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, Tenso
 from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm, trange

-from transformers import (
-    WEIGHTS_NAME,
-    AdamW,
-    BertConfig,
-    BertForMultipleChoice,
-    BertTokenizer,
-    get_linear_schedule_with_warmup,
-)
+from transformers import WEIGHTS_NAME, AdamW, AutoConfig, AutoTokenizer, get_linear_schedule_with_warmup
+from transformers.modeling_auto import AutoModelForMultipleChoice


 try:
@@ -49,12 +43,6 @@ except ImportError:

 logger = logging.getLogger(__name__)

-ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in [BertConfig]), ())
-
-MODEL_CLASSES = {
-    "bert": (BertConfig, BertForMultipleChoice, BertTokenizer),
-}
-

 class SwagExample(object):
    """A single training/test example for the SWAG dataset."""
@@ -492,19 +480,12 @@ def main():
        required=True,
        help="SWAG csv for predictions. E.g., val.csv or test.csv",
    )
-    parser.add_argument(
-        "--model_type",
-        default=None,
-        type=str,
-        required=True,
-        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
-    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
-        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
+        help="Path to pretrained model or model identifier from huggingface.co/models",
    )
    parser.add_argument(
        "--output_dir",
@@ -536,9 +517,6 @@ def main():
    parser.add_argument(
        "--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step."
    )
-    parser.add_argument(
-        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
-    )

    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
    parser.add_argument(
@@ -622,7 +600,7 @@ def main():
    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        args.n_gpu = torch.cuda.device_count()
+        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
@@ -652,13 +630,9 @@ def main():
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

-    args.model_type = args.model_type.lower()
-    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
-    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
-    tokenizer = tokenizer_class.from_pretrained(
-        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case
-    )
-    model = model_class.from_pretrained(
+    config = AutoConfig.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
+    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,)
+    model = AutoModelForMultipleChoice.from_pretrained(
        args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config
    )

@@ -694,8 +668,8 @@ def main():
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
-        model = model_class.from_pretrained(args.output_dir)
-        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
+        model = AutoModelForMultipleChoice.from_pretrained(args.output_dir)
+        tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
        model.to(args.device)

    # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
@@ -718,8 +692,8 @@ def main():
        for checkpoint in checkpoints:
            # Reload the model
            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
-            model = model_class.from_pretrained(checkpoint)
-            tokenizer = tokenizer_class.from_pretrained(checkpoint)
+            model = AutoModelForMultipleChoice.from_pretrained(checkpoint)
+            tokenizer = AutoTokenizer.from_pretrained(checkpoint)
            model.to(args.device)

            # Evaluate
--- a/examples/contrib/run_transfo_xl.py
+++ b/examples/contrib/run_transfo_xl.py
@@ -80,7 +80,7 @@ def main():

    # Load a pre-trained model
    model = TransfoXLLMHeadModel.from_pretrained(args.model_name)
-    model = model.to(device)
+    model.to(device)

    logger.info(
        "Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}".format(
--- a/examples/distillation/README.md
+++ b/examples/distillation/README.md
@@ -10,14 +10,14 @@ This folder contains the original code used to train Distil* as well as examples

 **October 23, 2019 - Update** We release **DistilRoBERTa**: 95% of `RoBERTa-base`'s performance on GLUE, twice as fast as RoBERTa while being 35% smaller.

-**October 3, 2019 - Update** We release our [NeurIPS workshop paper](https://arxiv.org/abs/1910.01108) explaining our approach on **DistilBERT**. It includes updated results and further experiments. We applied the same method to GPT2 and release the weights of **DistilGPT2**. DistilGPT2 is two times faster and 33% smaller than GPT2. **The paper superseeds our [previous blogpost](https://medium.com/huggingface/distilbert-8cf3380435b5) with a different distillation loss and better performances. Please use the paper as a reference when comparing/reporting results on DistilBERT.**
+**October 3, 2019 - Update** We release our [NeurIPS workshop paper](https://arxiv.org/abs/1910.01108) explaining our approach on **DistilBERT**. It includes updated results and further experiments. We applied the same method to GPT2 and release the weights of **DistilGPT2**. DistilGPT2 is two times faster and 33% smaller than GPT2. **The paper supersedes our [previous blogpost](https://medium.com/huggingface/distilbert-8cf3380435b5) with a different distillation loss and better performances. Please use the paper as a reference when comparing/reporting results on DistilBERT.**

 **September 19, 2019 - Update:** We fixed bugs in the code and released an upadted version of the weights trained with a modification of the distillation loss. DistilBERT now reaches 99% of `BERT-base`'s performance on GLUE, and 86.9 F1 score on SQuAD v1.1 dev set (compared to 88.5 for `BERT-base`). We will publish a formal write-up of our approach in the near future!


 ## What is Distil*

-Distil* is a class of compressed models that started with DistilBERT. DistilBERT stands for Distillated-BERT. DistilBERT is a small, fast, cheap and light Transformer model based on Bert architecture. It has 40% less parameters than `bert-base-uncased`, runs 60% faster while preserving 99% of BERT's performances as measured on the GLUE language understanding benchmark. DistilBERT is trained using knowledge distillation, a technique to compress a large model called the teacher into a smaller model called the student. By distillating Bert, we obtain a smaller Transformer model that bears a lot of similarities with the original BERT model while being lighter, smaller and faster to run. DistilBERT is thus an interesting option to put large-scaled trained Transformer model into production.
+Distil* is a class of compressed models that started with DistilBERT. DistilBERT stands for Distillated-BERT. DistilBERT is a small, fast, cheap and light Transformer model based on the BERT architecture. It has 40% fewer parameters than `bert-base-uncased` and runs 60% faster while preserving 97% of BERT's performance as measured on the GLUE language understanding benchmark. DistilBERT is trained using knowledge distillation, a technique to compress a large model called the teacher into a smaller model called the student. By distilling BERT, we obtain a smaller Transformer model that bears a lot of similarities with the original BERT model while being lighter, smaller and faster to run. DistilBERT is thus an interesting option for putting large-scale trained Transformer models into production.

 We have applied the same method to other Transformer architectures and released the weights:
 - GPT2: on the [WikiText-103](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/) benchmark, GPT2 reaches a perplexity on the test set of 16.3 compared to 21.1 for **DistilGPT2** (after fine-tuning on the train set).
@@ -31,13 +31,15 @@ Here are the results on the dev sets of GLUE:

 | Model                     | Macro-score                    | CoLA | MNLI | MRPC | QNLI | QQP  | RTE  | SST-2| STS-B| WNLI              |
 | :---:                     |    :---:                       | :---:| :---:| :---:| :---:| :---:| :---:| :---:| :---:| :---:             |
-| BERT-base-uncased         |  **77.6**                      | 49.2 | 80.8 | 87.4 | 87.5 | 86.4 | 61.7 | 92.0 | 83.8 | 45.1              |
-| DistilBERT-base-uncased   |  **76.8**                      | 43.6 | 79.0 | 87.5 | 85.3 | 84.9 | 59.9 | 90.7 | 81.2 | 56.3              |
+| BERT-base-uncased         |  **79.5**                      | 56.3 | 84.7 | 88.6 | 91.8 | 89.6 | 69.3 | 92.7 | 89.0 | 53.5              |
+| DistilBERT-base-uncased   |  **77.0**                      | 51.3 | 82.1 | 87.5 | 89.2 | 88.5 | 59.9 | 91.3 | 86.9 | 56.3              |
+| BERT-base-cased           |  **78.2**                      | 58.2 | 83.9 | 87.8 | 91.0 | 89.2 | 66.1 | 91.7 | 89.2 | 46.5              |
+| DistilBERT-base-cased     |  **75.9**                      | 47.2 | 81.5 | 85.6 | 88.2 | 87.8 | 60.6 | 90.4 | 85.5 | 56.3              |
 | ---                       |    ---                         |  --- |  --- |  --- |  --- |  --- |  --- |  --- |  --- |  ---              |
 | RoBERTa-base (reported)   |  **83.2**/**86.4**<sup>2</sup> | 63.6 | 87.6 | 90.2 | 92.8 | 91.9 | 78.7 | 94.8 | 91.2 | 57.7<sup>3</sup>  |
 | DistilRoBERTa<sup>1</sup> |  **79.0**/**82.3**<sup>2</sup> | 59.3 | 84.0 | 86.6 | 90.8 | 89.4 | 67.9 | 92.5 | 88.3 | 52.1              |

-<sup>1</sup> We did not use the MNLI checkpoint for fine-tuning but directy perform transfer learning on the pre-trained DistilRoBERTa.
+<sup>1</sup> We did not use the MNLI checkpoint for fine-tuning but directly perform transfer learning on the pre-trained DistilRoBERTa.

 <sup>2</sup> Macro-score computed without WNLI.

@@ -63,7 +65,9 @@ This part of the library has only be tested with Python3.6+. There are few speci
 Transformers includes five pre-trained Distil* models, currently only provided for English and German (we are investigating the possibility to train and release a multilingual version of DistilBERT):

 - `distilbert-base-uncased`: DistilBERT English language model pretrained on the same data used to pretrain Bert (concatenation of the Toronto Book Corpus and full English Wikipedia) using distillation with the supervision of the `bert-base-uncased` version of Bert. The model has 6 layers, 768 dimension and 12 heads, totalizing 66M parameters.
- `distilbert-base-uncased-distilled-squad`: A finetuned version of `distilbert-base-uncased` finetuned using (a second step of) knwoledge distillation on SQuAD 1.0. This model reaches a F1 score of 86.9 on the dev set (for comparison, Bert `bert-base-uncased` version reaches a 88.5 F1 score).
+- `distilbert-base-uncased-distilled-squad`: A version of `distilbert-base-uncased` finetuned using (a second step of) knowledge distillation on SQuAD 1.0. This model reaches an F1 score of 86.9 on the dev set (for comparison, the `bert-base-uncased` version of Bert reaches an 88.5 F1 score).
+- `distilbert-base-cased`: DistilBERT English language model pretrained on the same data used to pretrain Bert (concatenation of the Toronto Book Corpus and full English Wikipedia) using distillation with the supervision of the `bert-base-cased` version of Bert. The model has 6 layers, 768 dimension and 12 heads, totalizing 65M parameters.
+- `distilbert-base-cased-distilled-squad`: A version of `distilbert-base-cased` finetuned using (a second step of) knowledge distillation on SQuAD 1.0. This model reaches an F1 score of 87.1 on the dev set (for comparison, the `bert-base-cased` version of Bert reaches an 88.7 F1 score).
 - `distilbert-base-german-cased`: DistilBERT German language model pretrained on 1/2 of the data used to pretrain Bert using distillation with the supervision of the `bert-base-german-dbmdz-cased` version of German DBMDZ Bert. For NER tasks the model reaches a F1 score of 83.49 on the CoNLL-2003 test set (for comparison, `bert-base-german-dbmdz-cased` reaches a 84.52 F1 score), and a F1 score of 85.23 on the GermEval 2014 test set (`bert-base-german-dbmdz-cased` reaches a 86.89 F1 score).
 - `distilgpt2`: DistilGPT2 English language model pretrained with the supervision of `gpt2` (the smallest version of GPT2) on [OpenWebTextCorpus](https://skylion007.github.io/OpenWebTextCorpus/), a reproduction of OpenAI's WebText dataset. The model has 6 layers, 768 dimension and 12 heads, totalizing 82M parameters (compared to 124M parameters for GPT2). On average, DistilGPT2 is two times faster than GPT2.
 - `distilroberta-base`: DistilRoBERTa English language model pretrained with the supervision of `roberta-base` solely on [OpenWebTextCorpus](https://skylion007.github.io/OpenWebTextCorpus/), a reproduction of OpenAI's WebText dataset (it is ~4 times less training data than the teacher RoBERTa). The model has 6 layers, 768 dimension and 12 heads, totalizing 82M parameters (compared to 125M parameters for RoBERTa-base). On average DistilRoBERTa is twice as fast as Roberta-base.
@@ -72,8 +76,8 @@ Transformers includes five pre-trained Distil* models, currently only provided f
 Using DistilBERT is very similar to using BERT. DistilBERT share the same tokenizer as BERT's `bert-base-uncased` even though we provide a link to this tokenizer under the `DistilBertTokenizer` name to have a consistent naming between the library models.

 ```python
-tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
-model = DistilBertModel.from_pretrained('distilbert-base-uncased')
+tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
+model = DistilBertModel.from_pretrained('distilbert-base-cased')

 input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)
 outputs = model(input_ids)
@@ -81,6 +85,7 @@ last_hidden_states = outputs[0]  # The last hidden-state is the first element of
 ```

 Similarly, using the other Distil* models simply consists in calling the base classes with a different pretrained checkpoint:
+- DistilBERT uncased: `model = DistilBertModel.from_pretrained('distilbert-base-uncased')`
 - DistilGPT2: `model = GPT2Model.from_pretrained('distilgpt2')`
 - DistilRoBERTa: `model = RobertaModel.from_pretrained('distilroberta-base')`
 - DistilmBERT: `model = DistilBertModel.from_pretrained('distilbert-base-multilingual-cased')`
@@ -106,7 +111,7 @@ python scripts/binarized_data.py \
    --dump_file data/binarized_text
 ```

-Our implementation of masked language modeling loss follows [XLM](https://github.com/facebookresearch/XLM)'s one and smoothes the probability of masking with a factor that put more emphasis on rare words. Thus we count the occurences of each tokens in the data:
+Our implementation of the masked language modeling loss follows that of [XLM](https://github.com/facebookresearch/XLM) and smooths the probability of masking with a factor that puts more emphasis on rare words. Thus we count the occurrences of each token in the data:

 ```bash
 python scripts/token_counts.py \
@@ -174,7 +179,7 @@ Happy distillation!

 ## Citation

-If you find the ressource useful, you should cite the following paper:
+If you find the resource useful, you should cite the following paper:

 ```
@inproceedings{sanh2019distilbert,
--- a/examples/distillation/distiller.py
+++ b/examples/distillation/distiller.py
@@ -80,7 +80,7 @@ class Distiller:

        self.mlm = params.mlm
        if self.mlm:
-            logger.info(f"Using MLM loss for LM step.")
+            logger.info("Using MLM loss for LM step.")
            self.mlm_mask_prop = params.mlm_mask_prop
            assert 0.0 <= self.mlm_mask_prop <= 1.0
            assert params.word_mask + params.word_keep + params.word_rand == 1.0
@@ -91,7 +91,7 @@ class Distiller:
                self.pred_probs = self.pred_probs.half()
                self.token_probs = self.token_probs.half()
        else:
-            logger.info(f"Using CLM loss for LM step.")
+            logger.info("Using CLM loss for LM step.")

        self.epoch = 0
        self.n_iter = 0
@@ -365,8 +365,8 @@ class Distiller:
            self.end_epoch()

        if self.is_master:
-            logger.info(f"Save very last checkpoint as `pytorch_model.bin`.")
-            self.save_checkpoint(checkpoint_name=f"pytorch_model.bin")
+            logger.info("Save very last checkpoint as `pytorch_model.bin`.")
+            self.save_checkpoint(checkpoint_name="pytorch_model.bin")
            logger.info("Training is finished")

    def step(self, input_ids: torch.tensor, attention_mask: torch.tensor, lm_labels: torch.tensor):
--- a/examples/distillation/requirements.txt
+++ b/examples/distillation/requirements.txt
@@ -3,5 +3,5 @@ transformers
 gitpython==3.0.2
 tensorboard>=1.14.0
 tensorboardX==1.8
-psutil==5.6.3
+psutil==5.6.6
 scipy==1.3.1
--- a/examples/distillation/run_squad_w_distillation.py
+++ b/examples/distillation/run_squad_w_distillation.py
@@ -13,7 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" This is the exact same script as `examples/run_squad.py` (as of 2020, January 8th) with an additional and optional step of distillation."""
+""" This is the exact same script as `examples/question-answering/run_squad.py` (as of 2020, January 8th) with an additional and optional step of distillation."""

 import argparse
 import glob
@@ -39,6 +39,9 @@ from transformers import (
    DistilBertConfig,
    DistilBertForQuestionAnswering,
    DistilBertTokenizer,
+    RobertaConfig,
+    RobertaForQuestionAnswering,
+    RobertaTokenizer,
    XLMConfig,
    XLMForQuestionAnswering,
    XLMTokenizer,
@@ -64,15 +67,13 @@ except ImportError:

 logger = logging.getLogger(__name__)

-ALL_MODELS = sum(
-    (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, XLMConfig)), ()
-)

 MODEL_CLASSES = {
    "bert": (BertConfig, BertForQuestionAnswering, BertTokenizer),
    "xlnet": (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer),
    "xlm": (XLMConfig, XLMForQuestionAnswering, XLMTokenizer),
    "distilbert": (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer),
+    "roberta": (RobertaConfig, RobertaForQuestionAnswering, RobertaTokenizer),
 }


@@ -501,7 +502,7 @@ def main():
        default=None,
        type=str,
        required=True,
-        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
+        help="Path to pretrained model or model identifier from huggingface.co/models",
    )
    parser.add_argument(
        "--output_dir",
@@ -716,7 +717,7 @@ def main():
    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        args.n_gpu = torch.cuda.device_count()
+        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
--- a/examples/distillation/scripts/binarized_data.py
+++ b/examples/distillation/scripts/binarized_data.py
@@ -60,7 +60,7 @@ def main():
    with open(args.file_path, "r", encoding="utf8") as fp:
        data = fp.readlines()

-    logger.info(f"Start encoding")
+    logger.info("Start encoding")
    logger.info(f"{len(data)} examples to process.")

    rslt = []
@@ -75,13 +75,17 @@ def main():
        iter += 1
        if iter % interval == 0:
            end = time.time()
-            logger.info(f"{iter} examples processed. - {(end-start)/interval:.2f}s/expl")
+            logger.info(f"{iter} examples processed. - {(end-start):.2f}s/{interval}expl")
            start = time.time()
    logger.info("Finished binarization")
    logger.info(f"{len(data)} examples processed.")

    dp_file = f"{args.dump_file}.{args.tokenizer_name}.pickle"
-    rslt_ = [np.uint16(d) for d in rslt]
+    vocab_size = tokenizer.vocab_size
+    if vocab_size < (1 << 16):
+        rslt_ = [np.uint16(d) for d in rslt]
+    else:
+        rslt_ = [np.int32(d) for d in rslt]
    random.shuffle(rslt_)
    logger.info(f"Dump to {dp_file}")
    with open(dp_file, "wb") as handle:
--- a/examples/distillation/scripts/extract.py
+++ b/examples/distillation/scripts/extract.py
@@ -93,7 +93,7 @@ if __name__ == "__main__":
    elif args.model_type == "gpt2":
        for w in ["weight", "bias"]:
            compressed_sd[f"{prefix}.ln_f.{w}"] = state_dict[f"{prefix}.ln_f.{w}"]
-        compressed_sd[f"lm_head.weight"] = state_dict[f"lm_head.weight"]
+        compressed_sd["lm_head.weight"] = state_dict["lm_head.weight"]

    print(f"N layers selected for distillation: {std_idx}")
    print(f"Number of params transfered for distillation: {len(compressed_sd.keys())}")
--- a/examples/distillation/scripts/extract_distilbert.py
+++ b/examples/distillation/scripts/extract_distilbert.py
@@ -37,7 +37,7 @@ if __name__ == "__main__":
        model = BertForMaskedLM.from_pretrained(args.model_name)
        prefix = "bert"
    else:
-        raise ValueError(f'args.model_type should be "bert".')
+        raise ValueError('args.model_type should be "bert".')

    state_dict = model.state_dict()
    compressed_sd = {}
@@ -78,8 +78,8 @@ if __name__ == "__main__":
            ]
        std_idx += 1

-    compressed_sd[f"vocab_projector.weight"] = state_dict[f"cls.predictions.decoder.weight"]
-    compressed_sd[f"vocab_projector.bias"] = state_dict[f"cls.predictions.bias"]
+    compressed_sd["vocab_projector.weight"] = state_dict["cls.predictions.decoder.weight"]
+    compressed_sd["vocab_projector.bias"] = state_dict["cls.predictions.bias"]
    if args.vocab_transform:
        for w in ["weight", "bias"]:
            compressed_sd[f"vocab_transform.{w}"] = state_dict[f"cls.predictions.transform.dense.{w}"]
--- a/examples/distillation/train.py
+++ b/examples/distillation/train.py
@@ -273,7 +273,7 @@ def main():
        token_probs = None

    train_lm_seq_dataset = LmSeqsDataset(params=args, data=data)
-    logger.info(f"Data loader created.")
+    logger.info("Data loader created.")

    # STUDENT #
    logger.info(f"Loading student config from {args.student_config}")
@@ -288,7 +288,7 @@ def main():

    if args.n_gpu > 0:
        student.to(f"cuda:{args.local_rank}")
-    logger.info(f"Student loaded.")
+    logger.info("Student loaded.")

    # TEACHER #
    teacher = teacher_model_class.from_pretrained(args.teacher_name, output_hidden_states=True)
--- a/examples/distillation/training_configs/distilbert-base-cased.json
+++ b/examples/distillation/training_configs/distilbert-base-cased.json
@@ -0,0 +1,15 @@
+{
+	"activation": "gelu",
+	"attention_dropout": 0.1,
+	"dim": 768,
+	"dropout": 0.1,
+	"hidden_dim": 3072,
+	"initializer_range": 0.02,
+	"max_position_embeddings": 512,
+	"n_heads": 12,
+	"n_layers": 6,
+	"sinusoidal_pos_embds": true,
+	"tie_weights_": true,
+	"vocab_size": 28996
+  }
+  
--- a/examples/language-modeling/README.md
+++ b/examples/language-modeling/README.md
@@ -0,0 +1,62 @@
+
+## Language model training
+
+Based on the script [`run_language_modeling.py`](https://github.com/huggingface/transformers/blob/master/examples/language-modeling/run_language_modeling.py).
+
+Fine-tuning (or training from scratch) the library models for language modeling on a text dataset for GPT, GPT-2, BERT, DistilBERT and RoBERTa. GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT, DistilBERT and RoBERTa
+are fine-tuned using a masked language modeling (MLM) loss.
+
+Before running the following example, you should get a file that contains text on which the language model will be
+trained or fine-tuned. A good example of such text is the [WikiText-2 dataset](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/).
+
+We will refer to two different files: `$TRAIN_FILE`, which contains text for training, and `$TEST_FILE`, which contains
+text that will be used for evaluation.
+
+### GPT-2/GPT and causal language modeling
+
+The following example fine-tunes GPT-2 on WikiText-2. We're using the raw WikiText-2 (no tokens were replaced before
+the tokenization). The loss here is that of causal language modeling.
+
+```bash
+export TRAIN_FILE=/path/to/dataset/wiki.train.raw
+export TEST_FILE=/path/to/dataset/wiki.test.raw
+
+python run_language_modeling.py \
+    --output_dir=output \
+    --model_type=gpt2 \
+    --model_name_or_path=gpt2 \
+    --do_train \
+    --train_data_file=$TRAIN_FILE \
+    --do_eval \
+    --eval_data_file=$TEST_FILE
+```
+
+This takes about half an hour to train on a single K80 GPU and about one minute for the evaluation to run. It reaches
+a score of ~20 perplexity once fine-tuned on the dataset.
+
+### RoBERTa/BERT/DistilBERT and masked language modeling
+
+The following example fine-tunes RoBERTa on WikiText-2. Here too, we're using the raw WikiText-2. The loss is different
+as BERT/RoBERTa have a bidirectional mechanism; we're therefore using the same loss that was used during their
+pre-training: masked language modeling.
+
+In accordance with the RoBERTa paper, we use dynamic masking rather than static masking. The model may, therefore, converge
+slightly more slowly (over-fitting takes more epochs).
+
+We use the `--mlm` flag so that the script may change its loss function.
+
+```bash
+export TRAIN_FILE=/path/to/dataset/wiki.train.raw
+export TEST_FILE=/path/to/dataset/wiki.test.raw
+
+python run_language_modeling.py \
+    --output_dir=output \
+    --model_type=roberta \
+    --model_name_or_path=roberta-base \
+    --do_train \
+    --train_data_file=$TRAIN_FILE \
+    --do_eval \
+    --eval_data_file=$TEST_FILE \
+    --mlm
+```
+
--- a/examples/language-modeling/run_language_modeling.py
+++ b/examples/language-modeling/run_language_modeling.py
@@ -0,0 +1,281 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Fine-tuning the library models for language modeling on a text file (GPT, GPT-2, BERT, RoBERTa).
+GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa are fine-tuned
+using a masked language modeling (MLM) loss.
+"""
+
+
+import logging
+import math
+import os
+from dataclasses import dataclass, field
+from typing import Optional
+
+from transformers import (
+    CONFIG_MAPPING,
+    MODEL_WITH_LM_HEAD_MAPPING,
+    AutoConfig,
+    AutoModelWithLMHead,
+    AutoTokenizer,
+    DataCollatorForLanguageModeling,
+    HfArgumentParser,
+    LineByLineTextDataset,
+    PreTrainedTokenizer,
+    TextDataset,
+    Trainer,
+    TrainingArguments,
+    set_seed,
+)
+
+
+logger = logging.getLogger(__name__)
+
+
+# Keys of MODEL_WITH_LM_HEAD_MAPPING; each key is expected to expose a `model_type` attribute (read just below).
+MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys())
+# The `model_type` identifiers of those keys, used to build the `model_type` help text in ModelArguments.
+MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
+
+
+@dataclass
+class ModelArguments:
+    """
+    Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
+
+    Every field is optional and defaults to ``None``. In ``main()``, ``config_name`` and ``tokenizer_name``
+    take precedence over ``model_name_or_path`` when they are set, and ``model_type`` is only consulted when
+    neither ``config_name`` nor ``model_name_or_path`` is given (a fresh config is then instantiated).
+    """
+
+    # Checkpoint name or path used to initialize the weights; None means "train from scratch".
+    model_name_or_path: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "The model checkpoint for weights initialization. Leave None if you want to train a model from scratch."
+        },
+    )
+    # Key into CONFIG_MAPPING when a new config has to be created from scratch.
+    model_type: Optional[str] = field(
+        default=None,
+        metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
+    )
+    # Optional override for where the config is loaded from.
+    config_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
+    )
+    # Optional override for where the tokenizer is loaded from.
+    tokenizer_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
+    )
+    # Forwarded as `cache_dir` to every `from_pretrained` call in main().
+    cache_dir: Optional[str] = field(
+        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
+    )
+
+
+@dataclass
+class DataTrainingArguments:
+    """
+    Arguments pertaining to what data we are going to input our model for training and eval.
+
+    ``block_size`` defaults to -1; ``main()`` replaces any value <= 0 with the tokenizer's maximum length
+    and caps larger values at that maximum.
+    """
+
+    # Text file used for training (required in practice when --do_train is passed).
+    train_data_file: Optional[str] = field(
+        default=None, metadata={"help": "The input training data file (a text file)."}
+    )
+    # Text file used for evaluation; main() raises if --do_eval is passed without it.
+    eval_data_file: Optional[str] = field(
+        default=None,
+        metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
+    )
+    # Selects LineByLineTextDataset instead of TextDataset in get_dataset().
+    line_by_line: bool = field(
+        default=False,
+        metadata={"help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."},
+    )
+
+    # Both values are forwarded to DataCollatorForLanguageModeling in main().
+    mlm: bool = field(
+        default=False, metadata={"help": "Train with masked-language modeling loss instead of language modeling."}
+    )
+    mlm_probability: float = field(
+        default=0.15, metadata={"help": "Ratio of tokens to mask for masked language modeling loss"}
+    )
+
+    block_size: int = field(
+        default=-1,
+        metadata={
+            # Adjacent literals are concatenated verbatim, so each sentence must end with an explicit space.
+            "help": "Optional input sequence length after tokenization. "
+            "The training dataset will be truncated in block of this size for training. "
+            "Default to the model max input length for single sentence inputs (take into account special tokens)."
+        },
+    )
+    # Only forwarded to TextDataset (the non line-by-line path) in get_dataset().
+    overwrite_cache: bool = field(
+        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
+    )
+
+
+def get_dataset(args: DataTrainingArguments, tokenizer: PreTrainedTokenizer, evaluate=False):
+    """
+    Build the dataset for one split.
+
+    Reads `args.eval_data_file` when `evaluate` is true and `args.train_data_file` otherwise, and returns a
+    `LineByLineTextDataset` when `args.line_by_line` is set, else a `TextDataset`.
+    """
+    if evaluate:
+        file_path = args.eval_data_file
+    else:
+        file_path = args.train_data_file
+
+    if not args.line_by_line:
+        # Only this dataset type receives the `overwrite_cache` option.
+        return TextDataset(
+            tokenizer=tokenizer,
+            file_path=file_path,
+            block_size=args.block_size,
+            overwrite_cache=args.overwrite_cache,
+        )
+    return LineByLineTextDataset(tokenizer=tokenizer, file_path=file_path, block_size=args.block_size)
+
+
+def main():
+    """
+    Fine-tune (or train from scratch) a language model and optionally evaluate its perplexity.
+
+    Command-line arguments are parsed into `ModelArguments`, `DataTrainingArguments` and `TrainingArguments`.
+    When `--do_train` is set the model is trained and saved together with the tokenizer; when `--do_eval` is
+    set the perplexity (`exp(eval_loss)`) is logged and written to `eval_results_lm.txt` in the output directory.
+
+    Returns:
+        dict: `{"perplexity": <float>}` when `--do_eval` is set, otherwise an empty dict.
+
+    Raises:
+        ValueError: if `--do_eval` is passed without `--eval_data_file`; if the output directory already
+            exists, is non-empty, and `--do_train` is passed without `--overwrite_output_dir`; if neither
+            `--tokenizer_name` nor `--model_name_or_path` is given; or if a BERT/RoBERTa-like model is used
+            without `--mlm`.
+    """
+    # See all possible arguments in src/transformers/training_args.py
+    # or by passing the --help flag to this script.
+    # We now keep distinct sets of args, for a cleaner separation of concerns.
+
+    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
+    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+
+    if data_args.eval_data_file is None and training_args.do_eval:
+        raise ValueError(
+            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
+            "or remove the --do_eval argument."
+        )
+
+    if (
+        os.path.exists(training_args.output_dir)
+        and os.listdir(training_args.output_dir)
+        and training_args.do_train
+        and not training_args.overwrite_output_dir
+    ):
+        raise ValueError(
+            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
+        )
+
+    # Setup logging
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
+    )
+    logger.warning(
+        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+        training_args.local_rank,
+        training_args.device,
+        training_args.n_gpu,
+        bool(training_args.local_rank != -1),
+        training_args.fp16,
+    )
+    logger.info("Training/evaluation parameters %s", training_args)
+
+    # Set seed
+    set_seed(training_args.seed)
+
+    # Load pretrained model and tokenizer
+    #
+    # Distributed training:
+    # The .from_pretrained methods guarantee that only one local process can concurrently
+    # download model & vocab.
+
+    # Config source priority: --config_name, then --model_name_or_path, then a fresh config for --model_type.
+    if model_args.config_name:
+        config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
+    elif model_args.model_name_or_path:
+        config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
+    else:
+        config = CONFIG_MAPPING[model_args.model_type]()
+        logger.warning("You are instantiating a new config instance from scratch.")
+
+    # Tokenizer source priority: --tokenizer_name, then --model_name_or_path; there is no from-scratch path.
+    if model_args.tokenizer_name:
+        tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, cache_dir=model_args.cache_dir)
+    elif model_args.model_name_or_path:
+        tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
+    else:
+        # The leading space on the second literal keeps the concatenated message readable ("save it, and load it").
+        raise ValueError(
+            "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, save it,"
+            " and load it from here, using --tokenizer_name"
+        )
+
+    if model_args.model_name_or_path:
+        model = AutoModelWithLMHead.from_pretrained(
+            model_args.model_name_or_path,
+            from_tf=bool(".ckpt" in model_args.model_name_or_path),
+            config=config,
+            cache_dir=model_args.cache_dir,
+        )
+    else:
+        logger.info("Training new model from scratch")
+        model = AutoModelWithLMHead.from_config(config)
+
+    # Keep the embedding matrix in sync with the tokenizer actually used (it may differ from the checkpoint's).
+    model.resize_token_embeddings(len(tokenizer))
+
+    if config.model_type in ["bert", "roberta", "distilbert", "camembert"] and not data_args.mlm:
+        raise ValueError(
+            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the --mlm "
+            "flag (masked language modeling)."
+        )
+
+    if data_args.block_size <= 0:
+        data_args.block_size = tokenizer.max_len
+        # Our input block size will be the max possible for the model
+    else:
+        data_args.block_size = min(data_args.block_size, tokenizer.max_len)
+
+    # Get datasets
+
+    train_dataset = get_dataset(data_args, tokenizer=tokenizer) if training_args.do_train else None
+    eval_dataset = get_dataset(data_args, tokenizer=tokenizer, evaluate=True) if training_args.do_eval else None
+    data_collator = DataCollatorForLanguageModeling(
+        tokenizer=tokenizer, mlm=data_args.mlm, mlm_probability=data_args.mlm_probability
+    )
+
+    # Initialize our Trainer
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        data_collator=data_collator,
+        train_dataset=train_dataset,
+        eval_dataset=eval_dataset,
+        prediction_loss_only=True,
+    )
+
+    # Training
+    if training_args.do_train:
+        # Only a local directory is passed on as `model_path`; hub identifiers resolve to None.
+        model_path = (
+            model_args.model_name_or_path
+            if model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path)
+            else None
+        )
+        trainer.train(model_path=model_path)
+        trainer.save_model()
+        # For convenience, we also re-save the tokenizer to the same directory,
+        # so that you can share your model easily on huggingface.co/models =)
+        if trainer.is_world_master():
+            tokenizer.save_pretrained(training_args.output_dir)
+
+    # Evaluation
+    results = {}
+    if training_args.do_eval:
+        logger.info("*** Evaluate ***")
+
+        eval_output = trainer.evaluate()
+
+        perplexity = math.exp(eval_output["eval_loss"])
+        result = {"perplexity": perplexity}
+
+        output_eval_file = os.path.join(training_args.output_dir, "eval_results_lm.txt")
+        if trainer.is_world_master():
+            with open(output_eval_file, "w") as writer:
+                logger.info("***** Eval results *****")
+                for key in sorted(result.keys()):
+                    logger.info("  %s = %s", key, str(result[key]))
+                    writer.write("%s = %s\n" % (key, str(result[key])))
+
+        results.update(result)
+
+    return results
+
+
+def _mp_fn(index):
+    """Per-process entry point for xla_spawn (TPUs); `index` is accepted but not used."""
+    # For xla_spawn (TPUs)
+    main()
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/lightning_base.py
+++ b/examples/lightning_base.py
@@ -0,0 +1,280 @@
+import argparse
+import logging
+import os
+import random
+
+import numpy as np
+import pytorch_lightning as pl
+import torch
+
+from transformers import (
+    AdamW,
+    AutoConfig,
+    AutoModel,
+    AutoModelForPreTraining,
+    AutoModelForQuestionAnswering,
+    AutoModelForSequenceClassification,
+    AutoModelForTokenClassification,
+    AutoModelWithLMHead,
+    AutoTokenizer,
+    get_linear_schedule_with_warmup,
+)
+
+
+logger = logging.getLogger(__name__)
+
+
+# Maps the `mode` argument of BaseTransformer.__init__ to the Auto* class used to load the model.
+MODEL_MODES = {
+    "base": AutoModel,
+    "sequence-classification": AutoModelForSequenceClassification,
+    "question-answering": AutoModelForQuestionAnswering,
+    "pretraining": AutoModelForPreTraining,
+    "token-classification": AutoModelForTokenClassification,
+    "language-modeling": AutoModelWithLMHead,
+}
+
+
+def set_seed(args: argparse.Namespace):
+    """Seed Python's `random`, NumPy and PyTorch with `args.seed`; also seed every CUDA device when `args.n_gpu > 0`."""
+    seed = args.seed
+    for seeder in (random.seed, np.random.seed, torch.manual_seed):
+        seeder(seed)
+    if args.n_gpu > 0:
+        torch.cuda.manual_seed_all(seed)
+
+
+class BaseTransformer(pl.LightningModule):
+    """
+    LightningModule that bundles a transformers config, tokenizer and `Auto*` model built from `hparams`.
+
+    `load_dataset`, `validation_step` and `validation_end` are called below but not defined in this class;
+    presumably subclasses supply them (confirm against the task-specific scripts).
+    """
+
+    def __init__(self, hparams: argparse.Namespace, num_labels=None, mode="base", **config_kwargs):
+        "Initialize a model."
+
+        super().__init__()
+        self.hparams = hparams
+        # `--cache_dir` defaults to "" (see add_model_specific_args); normalize any falsy value to None.
+        cache_dir = self.hparams.cache_dir if self.hparams.cache_dir else None
+        # `config_name` takes precedence over `model_name_or_path`; `num_labels` is forwarded only when given.
+        self.config = AutoConfig.from_pretrained(
+            self.hparams.config_name if self.hparams.config_name else self.hparams.model_name_or_path,
+            **({"num_labels": num_labels} if num_labels is not None else {}),
+            cache_dir=cache_dir,
+            **config_kwargs,
+        )
+        # Same precedence rule for the tokenizer: `tokenizer_name` first, then `model_name_or_path`.
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            self.hparams.tokenizer_name if self.hparams.tokenizer_name else self.hparams.model_name_or_path,
+            cache_dir=cache_dir,
+        )
+        # `mode` picks the Auto* class from MODEL_MODES; `from_tf` is True when the path contains ".ckpt".
+        self.model = MODEL_MODES[mode].from_pretrained(
+            self.hparams.model_name_or_path,
+            from_tf=bool(".ckpt" in self.hparams.model_name_or_path),
+            config=self.config,
+            cache_dir=cache_dir,
+        )
+
+    def is_logger(self):
+        """Return True when `self.trainer.proc_rank` is <= 0."""
+        return self.trainer.proc_rank <= 0
+
+    def configure_optimizers(self):
+        "Prepare optimizer and schedule (linear warmup and decay)"
+
+        model = self.model
+        # Parameters whose name contains one of these substrings are put in the zero-weight-decay group.
+        no_decay = ["bias", "LayerNorm.weight"]
+        optimizer_grouped_parameters = [
+            {
+                "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+                "weight_decay": self.hparams.weight_decay,
+            },
+            {
+                "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
+                "weight_decay": 0.0,
+            },
+        ]
+        optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)
+        # Kept on self because train_dataloader() builds the LR scheduler around this optimizer.
+        self.opt = optimizer
+        return [optimizer]
+
+    def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, second_order_closure=None):
+        """Apply one optimizer update, zero the gradients, then advance `self.lr_scheduler`."""
+        if self.trainer.use_tpu:
+            # `xm` is not imported at module level; generic_train() makes it global when n_tpu_cores > 0.
+            xm.optimizer_step(optimizer)
+        else:
+            optimizer.step()
+        optimizer.zero_grad()
+        # `self.lr_scheduler` is created in train_dataloader().
+        self.lr_scheduler.step()
+
+    def get_tqdm_dict(self):
+        """Return a dict with the formatted `trainer.avg_loss` (0.0 if absent) and the last entry of `get_last_lr()`."""
+        avg_loss = getattr(self.trainer, "avg_loss", 0.0)
+        tqdm_dict = {"loss": "{:.3f}".format(avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}
+        return tqdm_dict
+
+    def test_step(self, batch, batch_nb):
+        """Delegate to `validation_step`."""
+        return self.validation_step(batch, batch_nb)
+
+    def test_end(self, outputs):
+        """Delegate to `validation_end`."""
+        return self.validation_end(outputs)
+
+    def train_dataloader(self):
+        """Load the "train" split and, as a side effect, create `self.lr_scheduler` for `self.opt`."""
+        train_batch_size = self.hparams.train_batch_size
+        dataloader = self.load_dataset("train", train_batch_size)
+
+        # Total training steps: examples // (batch size * max(1, n_gpu)) // accumulation steps, times epochs.
+        # The float() cast makes the result a float.
+        t_total = (
+            (len(dataloader.dataset) // (train_batch_size * max(1, self.hparams.n_gpu)))
+            // self.hparams.gradient_accumulation_steps
+            * float(self.hparams.num_train_epochs)
+        )
+        scheduler = get_linear_schedule_with_warmup(
+            self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=t_total
+        )
+        self.lr_scheduler = scheduler
+        return dataloader
+
+    def val_dataloader(self):
+        """Load the "dev" split with `hparams.eval_batch_size`."""
+        return self.load_dataset("dev", self.hparams.eval_batch_size)
+
+    def test_dataloader(self):
+        """Load the "test" split with `hparams.eval_batch_size`."""
+        return self.load_dataset("test", self.hparams.eval_batch_size)
+
+    def _feature_file(self, mode):
+        """Return `<data_dir>/cached_<mode>_<last non-empty component of model_name_or_path>_<max_seq_length>`."""
+        return os.path.join(
+            self.hparams.data_dir,
+            "cached_{}_{}_{}".format(
+                mode,
+                list(filter(None, self.hparams.model_name_or_path.split("/"))).pop(),
+                str(self.hparams.max_seq_length),
+            ),
+        )
+
+    @staticmethod
+    def add_model_specific_args(parser, root_dir):
+        """Register model, optimizer and batch-size options on `parser`; `root_dir` is not used here."""
+        parser.add_argument(
+            "--model_name_or_path",
+            default=None,
+            type=str,
+            required=True,
+            help="Path to pretrained model or model identifier from huggingface.co/models",
+        )
+        parser.add_argument(
+            "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
+        )
+        parser.add_argument(
+            "--tokenizer_name",
+            default="",
+            type=str,
+            help="Pretrained tokenizer name or path if not the same as model_name",
+        )
+        parser.add_argument(
+            "--cache_dir",
+            default="",
+            type=str,
+            help="Where do you want to store the pre-trained models downloaded from s3",
+        )
+        parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
+        parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
+        parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
+        parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
+        parser.add_argument(
+            "--num_train_epochs", default=3, type=int, help="Total number of training epochs to perform."
+        )
+
+        parser.add_argument("--train_batch_size", default=32, type=int)
+        parser.add_argument("--eval_batch_size", default=32, type=int)
+
+
+class LoggingCallback(pl.Callback):
+    """Callback that logs `trainer.callback_metrics` after validation and logs + saves them after testing."""
+
+    def on_validation_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule):
+        """Log every callback metric except the "log" and "progress_bar" entries."""
+        logger.info("***** Validation results *****")
+        if not pl_module.is_logger():
+            return
+
+        results = trainer.callback_metrics
+        for name in sorted(results):
+            if name in ("log", "progress_bar"):
+                continue
+            logger.info("{} = {}\n".format(name, str(results[name])))
+
+    def on_test_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule):
+        """Log the callback metrics and write them to `test_results.txt` under `hparams.output_dir`."""
+        logger.info("***** Test results *****")
+        if not pl_module.is_logger():
+            return
+
+        results = trainer.callback_metrics
+        target_path = os.path.join(pl_module.hparams.output_dir, "test_results.txt")
+        with open(target_path, "w") as writer:
+            for name in sorted(results):
+                if name in ("log", "progress_bar"):
+                    continue
+                line = "{} = {}\n".format(name, str(results[name]))
+                logger.info(line)
+                writer.write(line)
+
+
+def add_generic_args(parser, root_dir):
+    """
+    Register the task-independent training options on `parser`.
+
+    Adds `--output_dir` (required), `--fp16`, `--fp16_opt_level`, `--n_gpu`, `--n_tpu_cores`,
+    `--max_grad_norm`, `--do_train`, `--do_predict`, `--gradient_accumulation_steps` and `--seed`.
+    `root_dir` is accepted but not used here. The parser is modified in place; nothing is returned.
+    """
+    parser.add_argument(
+        "--output_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+
+    parser.add_argument(
+        "--fp16",
+        action="store_true",
+        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
+    )
+
+    parser.add_argument(
+        "--fp16_opt_level",
+        type=str,
+        default="O1",
+        # Adjacent literals are concatenated verbatim, so the first sentence must end with an explicit space.
+        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. "
+        "See details at https://nvidia.github.io/apex/amp.html",
+    )
+
+    parser.add_argument("--n_gpu", type=int, default=1)
+    parser.add_argument("--n_tpu_cores", type=int, default=0)
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
+    parser.add_argument("--do_predict", action="store_true", help="Whether to run predictions on the test set.")
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+
+    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
+
+
+def generic_train(model: BaseTransformer, args: argparse.Namespace):
+    """
+    Seed the RNGs, build a `pl.Trainer` from `args`, fit `model` when `args.do_train` is set, and return the trainer.
+
+    Raises:
+        ValueError: if `args.output_dir` exists, is non-empty and `args.do_train` is set.
+    """
+    # init model
+    set_seed(args)
+
+    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
+        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
+
+    # Checkpoints are written under output_dir and monitored on "val_loss" in "min" mode with save_top_k=5.
+    checkpoint_callback = pl.callbacks.ModelCheckpoint(
+        filepath=args.output_dir, prefix="checkpoint", monitor="val_loss", mode="min", save_top_k=5
+    )
+
+    train_params = dict(
+        accumulate_grad_batches=args.gradient_accumulation_steps,
+        gpus=args.n_gpu,
+        max_epochs=args.num_train_epochs,
+        early_stop_callback=False,
+        gradient_clip_val=args.max_grad_norm,
+        checkpoint_callback=checkpoint_callback,
+        callbacks=[LoggingCallback()],
+    )
+
+    # The AMP options are only passed to the Trainer when --fp16 is set.
+    if args.fp16:
+        train_params["use_amp"] = args.fp16
+        train_params["amp_level"] = args.fp16_opt_level
+
+    if args.n_tpu_cores > 0:
+        # Imported lazily and made module-global because BaseTransformer.optimizer_step() references `xm`.
+        global xm
+        import torch_xla.core.xla_model as xm
+
+        train_params["num_tpu_cores"] = args.n_tpu_cores
+        # TPU run: override the GPU count set above.
+        train_params["gpus"] = 0
+
+    if args.n_gpu > 1:
+        train_params["distributed_backend"] = "ddp"
+
+    trainer = pl.Trainer(**train_params)
+
+    if args.do_train:
+        trainer.fit(model)
+
+    return trainer
--- a/examples/movement-pruning/README.md
+++ b/examples/movement-pruning/README.md
@@ -0,0 +1,183 @@
+# Movement Pruning: Adaptive Sparsity by Fine-Tuning
+
+*Magnitude pruning is a widely used strategy for reducing model size in pure supervised learning; however, it is less effective in the transfer learning regime that has become standard for state-of-the-art natural language processing applications. We propose the use of *movement pruning*, a simple, deterministic first-order weight pruning method that is more adaptive to pretrained model fine-tuning. Experiments show that when pruning large pretrained language models, movement pruning shows significant improvements in high-sparsity regimes. When combined with distillation, the approach achieves minimal accuracy loss with down to only 3% of the model parameters:*
+
+| Fine-pruning+Distillation<br>(Teacher=BERT-base fine-tuned) | BERT base<br>fine-tuned | Remaining<br>Weights (%) | Magnitude Pruning      | L0 Regularization      | Movement Pruning       | Soft Movement Pruning          |
+| :---:                                                       | :---:                   | :---:                    | :---:                  | :---:                  | :---:                  | :---:                          |
+| SQuAD - Dev<br>EM/F1                                        | 80.4/88.1               | 10%<br>3%                | 70.2/80.1<br>45.5/59.6 | 72.4/81.9<br>64.3/75.8 | 75.6/84.3<br>67.5/78.0 | **76.6/84.9**<br>**72.7/82.3** |
+| MNLI - Dev<br>acc/MM acc                                    | 84.5/84.9               | 10%<br>3%                | 78.3/79.3<br>69.4/70.6 | 78.7/79.7<br>76.0/76.2 | 80.1/80.4<br>76.5/77.4 | **81.2/81.8**<br>**79.5/80.1** |
+| QQP - Dev<br>acc/F1                                         | 91.4/88.4               | 10%<br>3%                | 79.8/65.0<br>72.4/57.8 | 88.1/82.8<br>87.0/81.9 | 89.7/86.2<br>86.1/81.5 | **90.2/86.8**<br>**89.1/85.5** |
+
+This page contains information on how to fine-prune pre-trained models such as `BERT` to obtain extremely sparse models with movement pruning. In contrast to magnitude pruning which selects weights that are far from 0, movement pruning retains weights that are moving away from 0.
+
+For more information, we invite you to check out [our paper](https://arxiv.org/abs/2005.07683).
+You can also have a look at this fun *Explain Like I'm Five* introductory [slide deck](https://www.slideshare.net/VictorSanh/movement-pruning-explain-like-im-five-234205241).
+
+<div align="center">
+<img src="https://www.seekpng.com/png/detail/166-1669328_how-to-make-emmental-cheese-at-home-icooker.png" width="400">
+</div>
+
+## Extreme sparsity and efficient storage
+
+One promise of extreme pruning is to obtain extremely small models that can be easily sent (and stored) on edge devices. By setting weights to 0., we reduce the amount of information we need to store, and thus decrease the memory size. We are able to obtain extremely sparse fine-pruned models with movement pruning: ~95% of the dense performance with ~5% of total remaining weights in the BERT encoder.
+
+In [this notebook](https://github.com/huggingface/transformers/blob/master/examples/movement-pruning/Saving_PruneBERT.ipynb), we showcase how we can leverage standard tools that exist out-of-the-box to efficiently store an extremely sparse question answering model (only 6% of total remaining weights in the encoder). We are able to reduce the memory size of the encoder **from 340MB (the original dense BERT) to 11MB**, without any additional training of the model (every operation is performed *post fine-pruning*). It is sufficiently small to store it on a [91' floppy disk](https://en.wikipedia.org/wiki/Floptical) 📎!
+
+While movement pruning does not directly optimize for memory footprint (but rather the number of non-null weights), we hypothesize that further memory compression ratios can be achieved with specific quantization-aware training (see for instance [Q8BERT](https://arxiv.org/abs/1910.06188), [And the Bit Goes Down](https://arxiv.org/abs/1907.05686) or [Quant-Noise](https://arxiv.org/abs/2004.07320)).
+
+## Fine-pruned models
+
+As examples, we release two English PruneBERT checkpoints (models fine-pruned from a pre-trained `BERT` checkpoint), one on SQuAD and the other on MNLI.
+
+- **`prunebert-base-uncased-6-finepruned-w-distil-squad`**<br/>
+Pre-trained `BERT-base-uncased` fine-pruned with soft movement pruning on SQuAD v1.1. We use an additional distillation signal from `BERT-base-uncased` finetuned on SQuAD. The encoder counts 6% of total non-null weights and reaches 83.8 F1 score. The model can be accessed with: `pruned_bert = BertForQuestionAnswering.from_pretrained("huggingface/prunebert-base-uncased-6-finepruned-w-distil-squad")`
+- **`prunebert-base-uncased-6-finepruned-w-distil-mnli`**<br/>
+Pre-trained `BERT-base-uncased` fine-pruned with soft movement pruning on MNLI. We use an additional distillation signal from `BERT-base-uncased` finetuned on MNLI. The encoder counts 6% of total non-null weights and reaches 80.7 (matched) accuracy. The model can be accessed with: `pruned_bert = BertForSequenceClassification.from_pretrained("huggingface/prunebert-base-uncased-6-finepruned-w-distil-mnli")`
+
+## How to fine-prune?
+
+### Setup
+
+The code relies on the 🤗 Transformers library. In addition to the dependencies listed in the [`examples`](https://github.com/huggingface/transformers/tree/master/examples) folder, you should install a few additional dependencies listed in the `requirements.txt` file: `pip install -r requirements.txt`.
+
+Note that we built our experiments on top of a stabilized version of the library (commit https://github.com/huggingface/transformers/commit/352d5472b0c1dec0f420d606d16747d851b4bda8): we do not guarantee that everything is still compatible with the latest version of the master branch.
+
+### Fine-pruning with movement pruning
+
+Below, we detail how to reproduce the results reported in the paper. We use SQuAD as a running example. Commands (and scripts) can be easily adapted for other tasks.
+
+The following command fine-prunes a pre-trained `BERT-base` on SQuAD using movement pruning towards 15% of remaining weights (85% sparsity). Note that we freeze all the embeddings modules (from their pre-trained value) and only prune the Fully Connected layers in the encoder (12 layers of Transformer Block).
+
+```bash
+SERIALIZATION_DIR=<OUTPUT_DIR>
+SQUAD_DATA=<SQUAD_DATA>
+
+python examples/movement-pruning/masked_run_squad.py \
+    --output_dir $SERIALIZATION_DIR \
+    --data_dir $SQUAD_DATA \
+    --train_file train-v1.1.json \
+    --predict_file dev-v1.1.json \
+    --do_train --do_eval --do_lower_case \
+    --model_type masked_bert \
+    --model_name_or_path bert-base-uncased \
+    --per_gpu_train_batch_size 16 \
+    --warmup_steps 5400 \
+    --num_train_epochs 10 \
+    --learning_rate 3e-5 --mask_scores_learning_rate 1e-2 \
+    --initial_threshold 1 --final_threshold 0.15 \
+    --initial_warmup 1 --final_warmup 2 \
+    --pruning_method topK --mask_init constant --mask_scale 0.
+```
+
+### Fine-pruning with other methods
+
+We can also explore other fine-pruning methods by changing the `pruning_method` parameter:
+
+Soft movement pruning
+```bash
+python examples/movement-pruning/masked_run_squad.py \
+    --output_dir $SERIALIZATION_DIR \
+    --data_dir $SQUAD_DATA \
+    --train_file train-v1.1.json \
+    --predict_file dev-v1.1.json \
+    --do_train --do_eval --do_lower_case \
+    --model_type masked_bert \
+    --model_name_or_path bert-base-uncased \
+    --per_gpu_train_batch_size 16 \
+    --warmup_steps 5400 \
+    --num_train_epochs 10 \
+    --learning_rate 3e-5 --mask_scores_learning_rate 1e-2 \
+    --initial_threshold 0 --final_threshold 0.1 \
+    --initial_warmup 1 --final_warmup 2 \
+    --pruning_method sigmoied_threshold --mask_init constant --mask_scale 0. \
+    --regularization l1 --final_lambda 400.
+```
+
+L0 regularization
+```bash
+python examples/movement-pruning/masked_run_squad.py \
+    --output_dir $SERIALIZATION_DIR \
+    --data_dir $SQUAD_DATA \
+    --train_file train-v1.1.json \
+    --predict_file dev-v1.1.json \
+    --do_train --do_eval --do_lower_case \
+    --model_type masked_bert \
+    --model_name_or_path bert-base-uncased \
+    --per_gpu_train_batch_size 16 \
+    --warmup_steps 5400 \
+    --num_train_epochs 10 \
+    --learning_rate 3e-5 --mask_scores_learning_rate 1e-1 \
+    --initial_threshold 1. --final_threshold 1. \
+    --initial_warmup 1 --final_warmup 1 \
+    --pruning_method l0 --mask_init constant --mask_scale 2.197 \
+    --regularization l0 --final_lambda 125.
+```
+
+Iterative Magnitude Pruning
+```bash
+python examples/movement-pruning/masked_run_squad.py \
+    --output_dir $SERIALIZATION_DIR \
+    --data_dir $SQUAD_DATA \
+    --train_file train-v1.1.json \
+    --predict_file dev-v1.1.json \
+    --do_train --do_eval --do_lower_case \
+    --model_type masked_bert \
+    --model_name_or_path bert-base-uncased \
+    --per_gpu_train_batch_size 16 \
+    --warmup_steps 5400 \
+    --num_train_epochs 10 \
+    --learning_rate 3e-5 \
+    --initial_threshold 1 --final_threshold 0.15 \
+    --initial_warmup 1 --final_warmup 2 \
+    --pruning_method magnitude
+```
+
+### After fine-pruning
+
+**Counting parameters**
+
+Regularization-based pruning methods (soft movement pruning and L0 regularization) rely on the penalty to induce sparsity. The multiplicative coefficient controls the sparsity level.
+To obtain the effective sparsity level in the encoder, we simply count the number of activated (non-null) weights:
+
+```bash
+python examples/movement-pruning/count_parameters.py \
+    --pruning_method sigmoied_threshold \
+    --threshold 0.1 \
+    --serialization_dir $SERIALIZATION_DIR
+```
+
+**Pruning once and for all**
+
+Once the model has been fine-pruned, the pruned weights can be set to 0. once and for all (reducing the amount of information to store). In our running experiments, we can convert a `MaskedBertForQuestionAnswering` (a BERT model augmented to enable on-the-fly pruning capabilities) to a standard `BertForQuestionAnswering`:
+
+```bash
+python examples/movement-pruning/bertarize.py \
+    --pruning_method sigmoied_threshold \
+    --threshold 0.1 \
+    --model_name_or_path $SERIALIZATION_DIR
+```
+
+## Hyper-parameters
+
+For reproducibility purposes, we share the detailed results presented in the paper. These [tables](https://docs.google.com/spreadsheets/d/17JgRq_OFFTniUrz6BZWW_87DjFkKXpI1kYDSsseT_7g/edit?usp=sharing) exhaustively describe the individual hyper-parameters used for each data point.
+
+## Inference speed
+
+Early experiments show that even though models fine-pruned with (soft) movement pruning are extremely sparse, they do not benefit from significant improvement in terms of inference speed when using the standard PyTorch inference.
+We are currently benchmarking and exploring inference setups specifically for sparse architectures.
+In particular, hardware manufacturers are announcing devices that will speed up inference for sparse networks considerably.
+
+## Citation
+
+If you find this resource useful, please consider citing the following paper:
+
+```
+@article{sanh2020movement,
+    title={Movement Pruning: Adaptive Sparsity by Fine-Tuning},
+    author={Victor Sanh and Thomas Wolf and Alexander M. Rush},
+    year={2020},
+    eprint={2005.07683},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL}
+}
+```
--- a/examples/movement-pruning/Saving_PruneBERT.ipynb
+++ b/examples/movement-pruning/Saving_PruneBERT.ipynb
@@ -0,0 +1,612 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Saving PruneBERT\n",
+    "\n",
+    "\n",
+    "This notebook aims at showcasing how we can leverage standard tools to save (and load) an extremely sparse model fine-pruned with [movement pruning](https://arxiv.org/abs/2005.07683) (or any other unstructured pruning method).\n",
+    "\n",
+    "In this example, we used BERT (base-uncased), but the procedure described here is not specific to BERT and can be applied to a large variety of models.\n",
+    "\n",
+    "We first obtain an extremely sparse model by fine-pruning with movement pruning on SQuAD v1.1. We then used the following combination of standard tools:\n",
+    "- We reduce the precision of the model with Int8 dynamic quantization using [PyTorch implementation](https://pytorch.org/tutorials/intermediate/dynamic_quantization_bert_tutorial.html). We only quantized the Fully Connected Layers.\n",
+    "- Sparse quantized matrices are converted into the [Compressed Sparse Row format](https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html).\n",
+    "- We use HDF5 with `gzip` compression to store the weights.\n",
+    "\n",
+    "We experiment with a question answering model with only 6% of total remaining weights in the encoder (previously obtained with movement pruning). **We are able to reduce the memory size of the encoder from 340MB (original dense BERT) to 11MB**, which fits on a [91' floppy disk](https://en.wikipedia.org/wiki/Floptical)!\n",
+    "\n",
+    "<img src=\"https://upload.wikimedia.org/wikipedia/commons/thumb/0/00/Floptical_disk_21MB.jpg/440px-Floptical_disk_21MB.jpg\" width=\"200\">"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Includes\n",
+    "\n",
+    "import h5py\n",
+    "import os\n",
+    "import json\n",
+    "from collections import OrderedDict\n",
+    "\n",
+    "from scipy import sparse\n",
+    "import numpy as np\n",
+    "\n",
+    "import torch\n",
+    "from torch import nn\n",
+    "\n",
+    "from transformers import *\n",
+    "\n",
+    "os.chdir('../../')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Saving"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Dynamic quantization induces little or no loss of performance while significantly reducing the memory footprint."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load fine-pruned model and quantize the model\n",
+    "\n",
+    "model_path = \"serialization_dir/bert-base-uncased/92/squad/l1\"\n",
+    "model_name = \"bertarized_l1_with_distil_0._0.1_1_2_l1_1100._3e-5_1e-2_sigmoied_threshold_constant_0._10_epochs\"\n",
+    "\n",
+    "model = BertForQuestionAnswering.from_pretrained(os.path.join(model_path, model_name))\n",
+    "model.to('cpu')\n",
+    "\n",
+    "quantized_model = torch.quantization.quantize_dynamic(\n",
+    "                    model=model,\n",
+    "                    qconfig_spec = {\n",
+    "                        torch.nn.Linear : torch.quantization.default_dynamic_qconfig,\n",
+    "                    },\n",
+    "                    dtype=torch.qint8,\n",
+    "                )\n",
+    "# print(quantized_model)\n",
+    "\n",
+    "qtz_st = quantized_model.state_dict()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Saving the original (encoder + classifier) in the standard torch.save format\n",
+    "\n",
+    "dense_st = {name: param for name, param in model.state_dict().items() \n",
+    "                            if \"embedding\" not in name and \"pooler\" not in name}\n",
+    "torch.save(dense_st, 'dbg/dense_squad.pt',)\n",
+    "dense_mb_size = os.path.getsize(\"dbg/dense_squad.pt\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Decompose quantization for bert.encoder.layer.0.attention.self.query._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.0.attention.self.key._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.0.attention.self.value._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.0.attention.output.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.0.intermediate.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.0.output.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.1.attention.self.query._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.1.attention.self.key._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.1.attention.self.value._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.1.attention.output.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.1.intermediate.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.1.output.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.2.attention.self.query._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.2.attention.self.key._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.2.attention.self.value._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.2.attention.output.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.2.intermediate.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.2.output.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.3.attention.self.query._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.3.attention.self.key._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.3.attention.self.value._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.3.attention.output.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.3.intermediate.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.3.output.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.4.attention.self.query._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.4.attention.self.key._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.4.attention.self.value._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.4.attention.output.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.4.intermediate.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.4.output.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.5.attention.self.query._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.5.attention.self.key._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.5.attention.self.value._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.5.attention.output.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.5.intermediate.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.5.output.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.6.attention.self.query._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.6.attention.self.key._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.6.attention.self.value._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.6.attention.output.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.6.intermediate.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.6.output.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.7.attention.self.query._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.7.attention.self.key._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.7.attention.self.value._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.7.attention.output.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.7.intermediate.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.7.output.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.8.attention.self.query._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.8.attention.self.key._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.8.attention.self.value._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.8.attention.output.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.8.intermediate.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.8.output.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.9.attention.self.query._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.9.attention.self.key._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.9.attention.self.value._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.9.attention.output.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.9.intermediate.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.9.output.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.10.attention.self.query._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.10.attention.self.key._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.10.attention.self.value._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.10.attention.output.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.10.intermediate.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.10.output.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.11.attention.self.query._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.11.attention.self.key._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.11.attention.self.value._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.11.attention.output.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.11.intermediate.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.11.output.dense._packed_params.weight\n",
+      "Decompose quantization for bert.pooler.dense._packed_params.weight\n",
+      "Decompose quantization for qa_outputs._packed_params.weight\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Elementary representation: we decompose the quantized tensors into (scale, zero_point, int_repr).\n",
+    "# See https://pytorch.org/docs/stable/quantization.html\n",
+    "\n",
+    "# We further leverage the fact that int_repr is sparse matrix to optimize the storage: we decompose int_repr into\n",
+    "# its CSR representation (data, indptr, indices).\n",
+    "\n",
+    "elementary_qtz_st = {}\n",
+    "for name, param in qtz_st.items():\n",
+    "    if param.is_quantized:\n",
+    "        print(\"Decompose quantization for\", name)\n",
+    "        # We need to extract the scale, the zero_point and the int_repr for the quantized tensor and modules\n",
+    "        scale = param.q_scale()                                # torch.tensor(1,) - float32\n",
+    "        zero_point = param.q_zero_point()                      # torch.tensor(1,) - int32\n",
+    "        elementary_qtz_st[f\"{name}.scale\"] = scale\n",
+    "        elementary_qtz_st[f\"{name}.zero_point\"] = zero_point\n",
+    "\n",
+    "        # We assume the int_repr is sparse and compute its CSR representation\n",
+    "        # Only the FCs in the encoder are actually sparse\n",
+    "        int_repr = param.int_repr()                         # torch.tensor(nb_rows, nb_columns) - int8\n",
+    "        int_repr_cs = sparse.csr_matrix(int_repr)           # scipy.sparse.csr.csr_matrix\n",
+    "\n",
+    "        elementary_qtz_st[f\"{name}.int_repr.data\"] = int_repr_cs.data                  # np.array int8\n",
+    "        elementary_qtz_st[f\"{name}.int_repr.indptr\"] = int_repr_cs.indptr              # np.array int32\n",
+    "        assert max(int_repr_cs.indices) < 65535 # If not, we shall fall back to int32\n",
+    "        elementary_qtz_st[f\"{name}.int_repr.indices\"] = np.uint16(int_repr_cs.indices) # np.array uint16\n",
+    "        elementary_qtz_st[f\"{name}.int_repr.shape\"] = int_repr_cs.shape                # tuple(int, int)\n",
+    "    else:\n",
+    "        elementary_qtz_st[name] = param\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Encoder Size (MB) - Sparse & Quantized - `torch.save`: 21.29\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Saving the pruned (encoder + classifier) in the standard torch.save format\n",
+    "\n",
+    "dense_optimized_st = {name: param for name, param in elementary_qtz_st.items() \n",
+    "                                    if \"embedding\" not in name and \"pooler\" not in name}\n",
+    "torch.save(dense_optimized_st, 'dbg/dense_squad_optimized.pt',)\n",
+    "print(\"Encoder Size (MB) - Sparse & Quantized - `torch.save`:\",\n",
+    "      round(os.path.getsize(\"dbg/dense_squad_optimized.pt\")/1e6, 2))\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Skip bert.embeddings.word_embeddings.weight\n",
+      "Skip bert.embeddings.position_embeddings.weight\n",
+      "Skip bert.embeddings.token_type_embeddings.weight\n",
+      "Skip bert.embeddings.LayerNorm.weight\n",
+      "Skip bert.embeddings.LayerNorm.bias\n",
+      "Skip bert.pooler.dense.scale\n",
+      "Skip bert.pooler.dense.zero_point\n",
+      "Skip bert.pooler.dense._packed_params.weight.scale\n",
+      "Skip bert.pooler.dense._packed_params.weight.zero_point\n",
+      "Skip bert.pooler.dense._packed_params.weight.int_repr.data\n",
+      "Skip bert.pooler.dense._packed_params.weight.int_repr.indptr\n",
+      "Skip bert.pooler.dense._packed_params.weight.int_repr.indices\n",
+      "Skip bert.pooler.dense._packed_params.weight.int_repr.shape\n",
+      "Skip bert.pooler.dense._packed_params.bias\n",
+      "\n",
+      "Encoder Size (MB) - Dense:              340.25\n",
+      "Encoder Size (MB) - Sparse & Quantized: 11.27\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Save the decomposed state_dict with an HDF5 file\n",
+    "# Saving only the encoder + QA Head\n",
+    "\n",
+    "with h5py.File('dbg/squad_sparse.h5','w') as hf:\n",
+    "    for name, param in elementary_qtz_st.items():\n",
+    "        if \"embedding\" in name:\n",
+    "            print(f\"Skip {name}\")\n",
+    "            continue\n",
+    "\n",
+    "        if \"pooler\" in name:\n",
+    "            print(f\"Skip {name}\")\n",
+    "            continue\n",
+    "\n",
+    "        if type(param) == torch.Tensor:\n",
+    "            if param.numel() == 1:\n",
+    "                # module scale\n",
+    "                # module zero_point\n",
+    "                hf.attrs[name] = param\n",
+    "                continue\n",
+    "\n",
+    "            if param.requires_grad:\n",
+    "                # LayerNorm\n",
+    "                param = param.detach().numpy()\n",
+    "            hf.create_dataset(name, data=param, compression=\"gzip\", compression_opts=9)\n",
+    "\n",
+    "        elif type(param) == float or type(param) == int or type(param) == tuple:\n",
+    "            # float - tensor _packed_params.weight.scale\n",
+    "            # int   - tensor _packed_params.weight.zero_point\n",
+    "            # tuple - tensor _packed_params.weight.shape\n",
+    "            hf.attrs[name] = param\n",
+    "\n",
+    "        else:\n",
+    "            hf.create_dataset(name, data=param, compression=\"gzip\", compression_opts=9)\n",
+    "\n",
+    "\n",
+    "with open('dbg/metadata.json', 'w') as f:\n",
+    "    f.write(json.dumps(qtz_st._metadata))  \n",
+    "\n",
+    "size = os.path.getsize(\"dbg/squad_sparse.h5\") + os.path.getsize(\"dbg/metadata.json\")\n",
+    "print(\"\")\n",
+    "print(\"Encoder Size (MB) - Dense:             \", round(dense_mb_size/1e6, 2))\n",
+    "print(\"Encoder Size (MB) - Sparse & Quantized:\", round(size/1e6, 2))\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Size (MB): 99.39\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Save the decomposed state_dict to HDF5 storage\n",
+    "# Save everything in the architecture (embedding + encoder + QA Head)\n",
+    "\n",
+    "with h5py.File('dbg/squad_sparse_with_embs.h5','w') as hf:\n",
+    "    for name, param in elementary_qtz_st.items():\n",
+    "#         if \"embedding\" in name:\n",
+    "#             print(f\"Skip {name}\")\n",
+    "#             continue\n",
+    "\n",
+    "#         if \"pooler\" in name:\n",
+    "#             print(f\"Skip {name}\")\n",
+    "#             continue\n",
+    "\n",
+    "        if type(param) == torch.Tensor:\n",
+    "            if param.numel() == 1:\n",
+    "                # module scale\n",
+    "                # module zero_point\n",
+    "                hf.attrs[name] = param\n",
+    "                continue\n",
+    "\n",
+    "            if param.requires_grad:\n",
+    "                # LayerNorm\n",
+    "                param = param.detach().numpy()\n",
+    "            hf.create_dataset(name, data=param, compression=\"gzip\", compression_opts=9)\n",
+    "\n",
+    "        elif type(param) == float or type(param) == int or type(param) == tuple:\n",
+    "            # float - tensor _packed_params.weight.scale\n",
+    "            # int   - tensor _packed_params.weight.zero_point\n",
+    "            # tuple - tensor _packed_params.weight.shape\n",
+    "            hf.attrs[name] = param\n",
+    "\n",
+    "        else:\n",
+    "            hf.create_dataset(name, data=param, compression=\"gzip\", compression_opts=9)\n",
+    "\n",
+    "\n",
+    "with open('dbg/metadata.json', 'w') as f:\n",
+    "    f.write(json.dumps(qtz_st._metadata))   \n",
+    "\n",
+    "size = os.path.getsize(\"dbg/squad_sparse_with_embs.h5\") + os.path.getsize(\"dbg/metadata.json\")\n",
+    "print('\\nSize (MB):', round(size/1e6, 2))\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Loading"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Reconstruct the elementary state dict\n",
+    "\n",
+    "reconstructed_elementary_qtz_st = {}\n",
+    "\n",
+    "hf = h5py.File('dbg/squad_sparse_with_embs.h5','r')\n",
+    "\n",
+    "for attr_name, attr_param in hf.attrs.items():\n",
+    "    if 'shape' in attr_name:\n",
+    "        attr_param = tuple(attr_param)\n",
+    "    elif \".scale\" in attr_name:\n",
+    "        if \"_packed_params\" in attr_name:\n",
+    "            attr_param = float(attr_param)\n",
+    "        else:\n",
+    "            attr_param = torch.tensor(attr_param)\n",
+    "    elif \".zero_point\" in attr_name:\n",
+    "        if \"_packed_params\" in attr_name:\n",
+    "            attr_param = int(attr_param)\n",
+    "        else:\n",
+    "            attr_param = torch.tensor(attr_param)\n",
+    "    reconstructed_elementary_qtz_st[attr_name] = attr_param\n",
+    "    # print(f\"Unpack {attr_name}\")\n",
+    "    \n",
+    "# Get the tensors/arrays\n",
+    "for data_name, data_param in hf.items():\n",
+    "    if \"LayerNorm\" in data_name or \"_packed_params.bias\" in data_name:\n",
+    "        reconstructed_elementary_qtz_st[data_name] = torch.from_numpy(np.array(data_param))\n",
+    "    elif \"embedding\" in data_name:\n",
+    "        reconstructed_elementary_qtz_st[data_name] = torch.from_numpy(np.array(data_param))\n",
+    "    else: # _packed_params.weight.int_repr.data, _packed_params.weight.int_repr.indices and _packed_params.weight.int_repr.indptr\n",
+    "        data_param = np.array(data_param)\n",
+    "        if \"indices\" in data_name:\n",
+    "            data_param = np.array(data_param, dtype=np.int32)\n",
+    "        reconstructed_elementary_qtz_st[data_name] = data_param\n",
+    "    # print(f\"Unpack {data_name}\")\n",
+    "    \n",
+    "\n",
+    "hf.close()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sanity checks\n",
+    "\n",
+    "for name, param in reconstructed_elementary_qtz_st.items():\n",
+    "    assert name in elementary_qtz_st\n",
+    "for name, param in elementary_qtz_st.items():\n",
+    "    assert name in reconstructed_elementary_qtz_st, name\n",
+    "\n",
+    "for name, param in reconstructed_elementary_qtz_st.items():\n",
+    "    assert type(param) == type(elementary_qtz_st[name]), name\n",
+    "    if type(param) == torch.Tensor:\n",
+    "        assert torch.all(torch.eq(param, elementary_qtz_st[name])), name\n",
+    "    elif type(param) == np.ndarray:\n",
+    "        assert (param == elementary_qtz_st[name]).all(), name\n",
+    "    else:\n",
+    "        assert param == elementary_qtz_st[name], name"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Re-assemble the sparse int_repr from the CSR format\n",
+    "\n",
+    "reconstructed_qtz_st = {}\n",
+    "\n",
+    "for name, param in reconstructed_elementary_qtz_st.items():\n",
+    "    if \"weight.int_repr.indptr\" in name:\n",
+    "        prefix_ = name[:-16]\n",
+    "        data    = reconstructed_elementary_qtz_st[f\"{prefix_}.int_repr.data\"]\n",
+    "        indptr  = reconstructed_elementary_qtz_st[f\"{prefix_}.int_repr.indptr\"]\n",
+    "        indices = reconstructed_elementary_qtz_st[f\"{prefix_}.int_repr.indices\"]\n",
+    "        shape   = reconstructed_elementary_qtz_st[f\"{prefix_}.int_repr.shape\"]\n",
+    "\n",
+    "        int_repr = sparse.csr_matrix(arg1=(data, indices, indptr),\n",
+    "                                     shape=shape)\n",
+    "        int_repr = torch.tensor(int_repr.todense())\n",
+    "\n",
+    "        scale = reconstructed_elementary_qtz_st[f\"{prefix_}.scale\"]\n",
+    "        zero_point = reconstructed_elementary_qtz_st[f\"{prefix_}.zero_point\"]\n",
+    "        weight = torch._make_per_tensor_quantized_tensor(int_repr,\n",
+    "                                                         scale,\n",
+    "                                                         zero_point)\n",
+    "\n",
+    "        reconstructed_qtz_st[f\"{prefix_}\"] = weight\n",
+    "    elif \"int_repr.data\" in name or \"int_repr.shape\" in name or \"int_repr.indices\" in name or \\\n",
+    "         \"weight.scale\" in name or \"weight.zero_point\" in name:\n",
+    "        continue\n",
+    "    else:\n",
+    "        reconstructed_qtz_st[name] = param\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sanity checks\n",
+    "\n",
+    "for name, param in reconstructed_qtz_st.items():\n",
+    "    assert name in qtz_st\n",
+    "for name, param in qtz_st.items():\n",
+    "    assert name in reconstructed_qtz_st, name\n",
+    "\n",
+    "for name, param in reconstructed_qtz_st.items():\n",
+    "    assert type(param) == type(qtz_st[name]), name\n",
+    "    if type(param) == torch.Tensor:\n",
+    "        assert torch.all(torch.eq(param, qtz_st[name])), name\n",
+    "    elif type(param) == np.ndarray:\n",
+    "        assert (param == qtz_st[name]).all(), name\n",
+    "    else:\n",
+    "        assert param == qtz_st[name], name"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Sanity checks"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<All keys matched successfully>"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Load the re-constructed state dict into a model\n",
+    "\n",
+    "dummy_model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')\n",
+    "dummy_model.to('cpu')\n",
+    "\n",
+    "reconstructed_qtz_model = torch.quantization.quantize_dynamic(\n",
+    "                            model=dummy_model,\n",
+    "                            qconfig_spec = None,\n",
+    "                            dtype=torch.qint8,\n",
+    "                          )\n",
+    "\n",
+    "reconstructed_qtz_st = OrderedDict(reconstructed_qtz_st)\n",
+    "with open('dbg/metadata.json', 'r') as read_file:\n",
+    "    metadata = json.loads(read_file.read())\n",
+    "reconstructed_qtz_st._metadata = metadata\n",
+    "\n",
+    "reconstructed_qtz_model.load_state_dict(reconstructed_qtz_st)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Sanity check passed\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Sanity checks on the inference\n",
+    "\n",
+    "N = 32\n",
+    "\n",
+    "for _ in range(25):\n",
+    "    inputs = torch.randint(low=0, high=30000, size=(N, 128))\n",
+    "    mask = torch.ones(size=(N, 128))\n",
+    "\n",
+    "    y_reconstructed = reconstructed_qtz_model(input_ids=inputs, attention_mask=mask)[0]\n",
+    "    y               = quantized_model(input_ids=inputs, attention_mask=mask)[0]\n",
+    "    \n",
+    "    assert torch.all(torch.eq(y, y_reconstructed))\n",
+    "print(\"Sanity check passed\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
--- a/examples/movement-pruning/bertarize.py
+++ b/examples/movement-pruning/bertarize.py
@@ -0,0 +1,132 @@
+# Copyright 2020-present, the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Once a model has been fine-pruned, the weights that are masked during the forward pass can be pruned once and for all.
+For instance, once a model from the :class:`~emmental.MaskedBertForSequenceClassification` class is trained, it can be saved (and then loaded)
+as a standard :class:`~transformers.BertForSequenceClassification`.
+"""
+
+import argparse
+import os
+import shutil
+
+import torch
+
+from emmental.modules import MagnitudeBinarizer, ThresholdBinarizer, TopKBinarizer
+
+
+def main(args):
+    pruning_method = args.pruning_method
+    threshold = args.threshold
+
+    model_name_or_path = args.model_name_or_path.rstrip("/")
+    target_model_path = args.target_model_path
+
+    print(f"Load fine-pruned model from {model_name_or_path}")
+    model = torch.load(os.path.join(model_name_or_path, "pytorch_model.bin"))
+    pruned_model = {}
+
+    for name, tensor in model.items():
+        if "embeddings" in name or "LayerNorm" in name or "pooler" in name:
+            pruned_model[name] = tensor
+            print(f"Copied layer {name}")
+        elif "classifier" in name or "qa_output" in name:
+            pruned_model[name] = tensor
+            print(f"Copied layer {name}")
+        elif "bias" in name:
+            pruned_model[name] = tensor
+            print(f"Copied layer {name}")
+        else:
+            if pruning_method == "magnitude":
+                mask = MagnitudeBinarizer.apply(inputs=tensor, threshold=threshold)
+                pruned_model[name] = tensor * mask
+                print(f"Pruned layer {name}")
+            elif pruning_method == "topK":
+                if "mask_scores" in name:
+                    continue
+                prefix_ = name[:-6]
+                scores = model[f"{prefix_}mask_scores"]
+                mask = TopKBinarizer.apply(scores, threshold)
+                pruned_model[name] = tensor * mask
+                print(f"Pruned layer {name}")
+            elif pruning_method == "sigmoied_threshold":
+                if "mask_scores" in name:
+                    continue
+                prefix_ = name[:-6]
+                scores = model[f"{prefix_}mask_scores"]
+                mask = ThresholdBinarizer.apply(scores, threshold, True)
+                pruned_model[name] = tensor * mask
+                print(f"Pruned layer {name}")
+            elif pruning_method == "l0":
+                if "mask_scores" in name:
+                    continue
+                prefix_ = name[:-6]
+                scores = model[f"{prefix_}mask_scores"]
+                l, r = -0.1, 1.1
+                s = torch.sigmoid(scores)
+                s_bar = s * (r - l) + l
+                mask = s_bar.clamp(min=0.0, max=1.0)
+                pruned_model[name] = tensor * mask
+                print(f"Pruned layer {name}")
+            else:
+                raise ValueError("Unknown pruning method")
+
+    if target_model_path is None:
+        target_model_path = os.path.join(
+            os.path.dirname(model_name_or_path), f"bertarized_{os.path.basename(model_name_or_path)}"
+        )
+
+    if not os.path.isdir(target_model_path):
+        shutil.copytree(model_name_or_path, target_model_path)
+        print(f"\nCreated folder {target_model_path}")
+
+    torch.save(pruned_model, os.path.join(target_model_path, "pytorch_model.bin"))
+    print("\nPruned model saved! See you later!")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--pruning_method",
+        choices=["l0", "magnitude", "topK", "sigmoied_threshold"],
+        type=str,
+        required=True,
+        help="Pruning Method (l0 = L0 regularization, magnitude = Magnitude pruning, topK = Movement pruning, sigmoied_threshold = Soft movement pruning)",
+    )
+    parser.add_argument(
+        "--threshold",
+        type=float,
+        required=False,
+        help="For `magnitude` and `topK`, it is the level of remaining weights (in %) in the fine-pruned model."
+        "For `sigmoied_threshold`, it is the threshold \tau against which the (sigmoied) scores are compared."
+        "Not needed for `l0`",
+    )
+    parser.add_argument(
+        "--model_name_or_path",
+        type=str,
+        required=True,
+        help="Folder containing the model that was previously fine-pruned",
+    )
+    parser.add_argument(
+        "--target_model_path",
+        default=None,
+        type=str,
+        required=False,
+        help="Folder containing the model that was previously fine-pruned",
+    )
+
+    args = parser.parse_args()
+
+    main(args)
--- a/examples/movement-pruning/counts_parameters.py
+++ b/examples/movement-pruning/counts_parameters.py
@@ -0,0 +1,92 @@
+# Copyright 2020-present, the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Count remaining (non-zero) weights in the encoder (i.e. the transformer layers).
+Sparsity and remaining weights levels are equivalent: sparsity % = 100 - remaining weights %.
+"""
+import argparse
+import os
+
+import torch
+
+from emmental.modules import ThresholdBinarizer, TopKBinarizer
+
+
+def main(args):
+    serialization_dir = args.serialization_dir
+    pruning_method = args.pruning_method
+    threshold = args.threshold
+
+    st = torch.load(os.path.join(serialization_dir, "pytorch_model.bin"), map_location="cpu")
+
+    remaining_count = 0  # Number of remaining (not pruned) params in the encoder
+    encoder_count = 0  # Number of params in the encoder
+
+    print("name".ljust(60, " "), "Remaining Weights %", "Remaning Weight")
+    for name, param in st.items():
+        if "encoder" not in name:
+            continue
+
+        if "mask_scores" in name:
+            if pruning_method == "topK":
+                mask_ones = TopKBinarizer.apply(param, threshold).sum().item()
+            elif pruning_method == "sigmoied_threshold":
+                mask_ones = ThresholdBinarizer.apply(param, threshold, True).sum().item()
+            elif pruning_method == "l0":
+                l, r = -0.1, 1.1
+                s = torch.sigmoid(param)
+                s_bar = s * (r - l) + l
+                mask = s_bar.clamp(min=0.0, max=1.0)
+                mask_ones = (mask > 0.0).sum().item()
+            else:
+                raise ValueError("Unknown pruning method")
+            remaining_count += mask_ones
+            print(name.ljust(60, " "), str(round(100 * mask_ones / param.numel(), 3)).ljust(20, " "), str(mask_ones))
+        else:
+            encoder_count += param.numel()
+            if "bias" in name or "LayerNorm" in name:
+                remaining_count += param.numel()
+
+    print("")
+    print("Remaining Weights (global) %: ", 100 * remaining_count / encoder_count)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--pruning_method",
+        choices=["l0", "topK", "sigmoied_threshold"],
+        type=str,
+        required=True,
+        help="Pruning Method (l0 = L0 regularization, topK = Movement pruning, sigmoied_threshold = Soft movement pruning)",
+    )
+    parser.add_argument(
+        "--threshold",
+        type=float,
+        required=False,
+        help="For `topK`, it is the level of remaining weights (in %) in the fine-pruned model."
+        "For `sigmoied_threshold`, it is the threshold \tau against which the (sigmoied) scores are compared."
+        "Not needed for `l0`",
+    )
+    parser.add_argument(
+        "--serialization_dir",
+        type=str,
+        required=True,
+        help="Folder containing the model that was previously fine-pruned",
+    )
+
+    args = parser.parse_args()
+
+    main(args)
--- a/examples/movement-pruning/emmental/init.py
+++ b/examples/movement-pruning/emmental/init.py
@@ -0,0 +1,10 @@
+# flake8: noqa
+from .configuration_bert_masked import MaskedBertConfig
+from .modeling_bert_masked import (
+    MaskedBertForMultipleChoice,
+    MaskedBertForQuestionAnswering,
+    MaskedBertForSequenceClassification,
+    MaskedBertForTokenClassification,
+    MaskedBertModel,
+)
+from .modules import *
--- a/examples/movement-pruning/emmental/configuration_bert_masked.py
+++ b/examples/movement-pruning/emmental/configuration_bert_masked.py
@@ -0,0 +1,71 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Masked BERT model configuration. It replicates the class `~transformers.BertConfig`
+and adapts it to the specificities of MaskedBert (`pruning_method`, `mask_init` and `mask_scale`)."""
+
+
+import logging
+
+from transformers.configuration_utils import PretrainedConfig
+
+
+logger = logging.getLogger(__name__)
+
+
+class MaskedBertConfig(PretrainedConfig):
+    """
+    A class replicating the `~transformers.BertConfig` with additional parameters for pruning/masking configuration.
+
+    Every constructor argument except `pad_token_id` is stored unchanged as an attribute of the same name.
+    `pad_token_id` and any extra keyword arguments are forwarded to `PretrainedConfig.__init__`.
+
+    Args:
+        vocab_size, hidden_size, num_hidden_layers, num_attention_heads, intermediate_size, hidden_act,
+        hidden_dropout_prob, attention_probs_dropout_prob, max_position_embeddings, type_vocab_size,
+        initializer_range, layer_norm_eps, pad_token_id:
+            Same arguments as in `~transformers.BertConfig`, which this class replicates.
+        pruning_method: Name of the pruning method (default `"topK"`).
+            NOTE(review): presumably one of the method names handled by the modeling code - confirm there.
+        mask_init: Initialization scheme of the masks (default `"constant"`).
+            NOTE(review): the accepted values are not visible here - confirm in the modeling code.
+        mask_scale: Scale used by the mask initialization (default `0.0`).
+            NOTE(review): the exact meaning depends on `mask_init` - confirm in the modeling code.
+        **kwargs: Forwarded to `PretrainedConfig.__init__`.
+    """
+
+    model_type = "masked_bert"
+
+    def __init__(
+        self,
+        vocab_size=30522,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        intermediate_size=3072,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=512,
+        type_vocab_size=2,
+        initializer_range=0.02,
+        layer_norm_eps=1e-12,
+        pad_token_id=0,
+        pruning_method="topK",
+        mask_init="constant",
+        mask_scale=0.0,
+        **kwargs
+    ):
+        # The base class receives the padding token id and all remaining keyword arguments.
+        super().__init__(pad_token_id=pad_token_id, **kwargs)
+
+        # Hyper-parameters replicated from the BERT configuration.
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.type_vocab_size = type_vocab_size
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        # Pruning/masking parameters added on top of the BERT configuration.
+        self.pruning_method = pruning_method
+        self.mask_init = mask_init
+        self.mask_scale = mask_scale
--- a/examples/movement-pruning/emmental/modeling_bert_masked.py
+++ b/examples/movement-pruning/emmental/modeling_bert_masked.py
--- a/examples/movement-pruning/emmental/modules/init.py
+++ b/examples/movement-pruning/emmental/modules/init.py
@@ -0,0 +1,3 @@
+# flake8: noqa
+from .binarizer import MagnitudeBinarizer, ThresholdBinarizer, TopKBinarizer
+from .masked_nn import MaskedLinear
--- a/Show More
+++ b/Show More