Fix CI after killing archive maps (#4724 )

* 🐛 Fix model ids for BART and Flaubert
Release: v2.11.0
2020-06-02 10:21:09 -04:00 · 2020-06-02 09:49:09 -04:00 · 2020-06-02 09:39:33 -04:00 · 2020-06-02 11:03:46 +02:00 · 2020-06-02 11:02:27 +02:00 · 2020-06-02 04:29:28 -04:00
553 changed files with 45510 additions and 10828 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -66,6 +66,16 @@ jobs:
            - run: sudo pip install .[sklearn,torch,testing]
            - run: sudo pip install -r examples/requirements.txt
            - run: python -m pytest -n 8 --dist=loadfile -s -v ./examples/
+    build_doc:
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.6
+        steps:
+            - checkout
+            - run: sudo pip install .[tf,torch,docs]
+            - run: cd docs && make html
+            - store_artifacts:
+                path: ./docs/_build
    deploy_doc:
        working_directory: ~/transformers
        docker:
@@ -85,6 +95,8 @@ jobs:
        parallelism: 1
        steps:
            - checkout
+            # we need a version of isort with https://github.com/timothycrosley/isort/pull/1000
+            - run: sudo pip install git+git://github.com/timothycrosley/isort.git@e63ae06ec7d70b06df9e528357650281a3d3ec22#egg=isort
            - run: sudo pip install .[tf,torch,quality]
            - run: black --check --line-length 119 --target-version py35 examples templates tests src utils
            - run: isort --check-only --recursive examples templates tests src utils
@@ -115,4 +127,5 @@ workflows:
            - run_tests_torch_and_tf
            - run_tests_torch
            - run_tests_tf
+            - build_doc
            - deploy_doc: *workflow_filters
--- a/.github/ISSUE_TEMPLATE/--new-model-addition.md
+++ b/.github/ISSUE_TEMPLATE/--new-model-addition.md
@@ -2,7 +2,7 @@
 name: "\U0001F31F New model addition"
 about: Submit a proposal/request to implement a new Transformer-based model
 title: ''
-labels: ''
+labels: New model
 assignees: ''

 ---
--- a/.github/ISSUE_TEMPLATE/bug-report.md
+++ b/.github/ISSUE_TEMPLATE/bug-report.md
@@ -40,7 +40,7 @@ Steps to reproduce the behavior:
 <!-- A clear and concise description of what you would expect to happen. -->

 ## Environment info
-<!-- You can run the command `python transformers-cli env` and copy-and-paste its output below.
+<!-- You can run the command `transformers-cli env` and copy-and-paste its output below.
     Don't forget to fill out the missing fields in that output! -->
     
 - `transformers` version:
--- a/.github/ISSUE_TEMPLATE/migration.md
+++ b/.github/ISSUE_TEMPLATE/migration.md
@@ -1,8 +1,9 @@
 ---
 name: "\U0001F4DA Migration from pytorch-pretrained-bert or pytorch-transformers"
-about: Report a problem when migrating from pytorch-pretrained-bert or pytorch-transformers to transformers
+about: Report a problem when migrating from pytorch-pretrained-bert or pytorch-transformers
+  to transformers
 title: ''
-labels: ''
+labels: Migration
 assignees: ''

 ---
--- a/.github/ISSUE_TEMPLATE/question-help.md
+++ b/.github/ISSUE_TEMPLATE/question-help.md
@@ -26,4 +26,4 @@ assignees: ''

 <!-- You should first ask your question on SO, and only if
     you didn't get an answer ask it here on GitHub. -->
-**A link to original question on Stack Overflow**: 
+**A link to original question on Stack Overflow**:
--- a/.github/workflows/github-push.yml
+++ b/.github/workflows/github-push.yml
@@ -11,9 +11,9 @@ jobs:
      uses: actions/setup-python@v1
      with:
        python-version: 3.7
-    - name: Install dependencies
-      run: |
-        pip install .[tf,torch,quality]
+    # - name: Install dependencies
+    #   run: |
+    #     pip install .[tf,torch,quality]



--- a/.github/workflows/github-torch-hub.yml
+++ b/.github/workflows/github-torch-hub.yml
@@ -0,0 +1,32 @@
+name: Torch hub integration
+
+on: 
+  push:
+    branches:
+      - "*"
+
+jobs:
+  torch_hub_integration:
+    runs-on: ubuntu-latest
+    steps:
+    # no checkout necessary here.
+    - name: Extract branch name
+      run: echo "::set-env name=BRANCH::${GITHUB_REF#refs/heads/}"
+    - name: Check branch name
+      run: echo $BRANCH
+    - name: Set up Python
+      uses: actions/setup-python@v1
+      with:
+        python-version: 3.7
+    - name: Install dependencies
+      run: |
+        pip install torch
+        pip install numpy tokenizers filelock requests tqdm regex sentencepiece sacremoses packaging
+
+    - name: Torch hub list
+      run: |
+        python -c "import torch; print(torch.hub.list('huggingface/transformers:$BRANCH'))"
+
+    - name: Torch hub help
+      run: |
+        python -c "import torch; print(torch.hub.help('huggingface/transformers:$BRANCH', 'modelForSequenceClassification'))"
--- a/.github/workflows/self-push.yml
+++ b/.github/workflows/self-push.yml
@@ -1,9 +1,13 @@
 name: Self-hosted runner (push)

 on: 
-  # push:
-  #   branches:
-  #     - master
+  push:
+    branches:
+      - master
+    paths: 
+      - "src/**"
+      - "tests/**"
+      - ".github/**"
  # pull_request:
  repository_dispatch:

@@ -31,8 +35,8 @@ jobs:
    - name: Install dependencies
      run: |
        source .env/bin/activate
-        pip install .[sklearn,tf,torch,testing]
-        pip uninstall -y tensorflow
+        pip install torch
+        pip install .[sklearn,testing]

    - name: Are GPUs recognized by our DL frameworks
      run: |
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@@ -31,13 +31,12 @@ jobs:
    - name: Install dependencies
      run: |
        source .env/bin/activate
-        pip install .[sklearn,tf,torch,testing]
+        pip install .[sklearn,torch,testing]

    - name: Are GPUs recognized by our DL frameworks
      run: |
        source .env/bin/activate
        python -c "import torch; print(torch.cuda.is_available())"
-        python -c "import tensorflow as tf; print(tf.test.is_built_with_cuda(), tf.config.list_physical_devices('GPU'))"

    - name: Run all tests on GPU
      env:
--- a/.gitignore
+++ b/.gitignore
@@ -130,7 +130,10 @@ proc_data

 # examples
 runs
-examples/runs
+/runs_old
+/wandb
+/examples/runs
+/examples/**/*.args

 # data
 /data
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -44,9 +44,16 @@ Did not find it? :( So we can act quickly on it, please follow these steps:
 To get the OS and software versions automatically, you can run the following command:

 ```bash
-python transformers-cli env
+transformers-cli env
 ```

+or from the root of the repository the following command:
+
+```bash
+python src/transformers/commands/transformers_cli.py env
+```
+
+
 ### Do you want to implement a new model?

 Awesome! Please provide the following information:
@@ -130,7 +137,6 @@ Follow these steps to start contributing:
   ```bash
   $ pip install -U git+git://github.com/timothycrosley/isort.git@e63ae06ec7d70b06df9e528357650281a3d3ec22#egg=isort
   ```
-
 5. Develop the features on your branch.

   As you work on the features, you should make sure that the test suite
@@ -199,11 +205,12 @@ Follow these steps to start contributing:
   are useful to avoid duplicated work, and to differentiate it from PRs ready
   to be merged;
 4. Make sure existing tests pass;
-5. Add high-coverage tests. No quality test, no merge. 
+5. Add high-coverage tests. No quality testing = no merge. 
 - If you are adding a new model, make sure that you use `ModelTester.all_model_classes = (MyModel, MyModelWithLMHead,...)`, which triggers the common tests.
 - If you are adding new `@slow` tests, make sure they pass using `RUN_SLOW=1 python -m pytest tests/test_my_new_model.py`. 
+ - If you are adding a new tokenizer, write tests, and make sure `RUN_SLOW=1 python -m pytest tests/test_tokenization_{your_model_name}.py` passes.
 CircleCI does not run them. 
-6. All public methods must have informative docstrings;
+6. All public methods must have informative docstrings that work nicely with sphinx. See `modeling_ctrl.py` for an example.

 ### Tests

--- a/README.md
+++ b/README.md
@@ -19,17 +19,14 @@
 </p>

 <h3 align="center">
-<p>State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch
+<p>State-of-the-art Natural Language Processing for PyTorch and TensorFlow 2.0
 </h3>

-🤗 Transformers (formerly known as `pytorch-transformers` and `pytorch-pretrained-bert`) provides state-of-the-art general-purpose architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet, CTRL...) for Natural Language Understanding (NLU) and Natural Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between TensorFlow 2.0 and PyTorch.
+🤗 Transformers (formerly known as `pytorch-transformers` and `pytorch-pretrained-bert`) provides state-of-the-art general-purpose architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet, T5, CTRL...) for Natural Language Understanding (NLU) and Natural Language Generation (NLG) with thousands of pretrained models in 100+ languages and deep interoperability between PyTorch & TensorFlow 2.0.

 [![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/0)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/0)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/1)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/1)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/2)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/2)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/3)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/3)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/4)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/4)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/5)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/5)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/6)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/6)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/7)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/7)

 ### Features
-
- As easy to use as pytorch-transformers
- As powerful and concise as Keras
 - High performance on NLU and NLG tasks
 - Low barrier to entry for educators and practitioners

@@ -41,7 +38,7 @@ State-of-the-art NLP for everyone
 Lower compute costs, smaller carbon footprint
 - Researchers can share trained models instead of always retraining
 - Practitioners can reduce compute time and production costs
- 10 architectures with over 30 pretrained models, some in more than 100 languages
+- Dozens of architectures with over 1,000 pretrained models, some in more than 100 languages

 Choose the right framework for every part of a model's lifetime
 - Train state-of-the-art models in 3 lines of code
@@ -66,7 +63,7 @@ Choose the right framework for every part of a model's lifetime

 ## Installation

-This repo is tested on Python 3.6+, PyTorch 1.0.0+ and TensorFlow 2.0.0-rc1
+This repo is tested on Python 3.6+, PyTorch 1.0.0+ (PyTorch 1.3.1+ for examples) and TensorFlow 2.0.

 You should install 🤗 Transformers in a [virtual environment](https://docs.python.org/3/library/venv.html). If you're unfamiliar with Python virtual environments, check out the [user guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/).

@@ -148,24 +145,29 @@ At some point in the future, you'll be able to seamlessly move from pre-training

 🤗 Transformers currently provides the following NLU/NLG architectures:

-1. **[BERT](https://github.com/google-research/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
-2. **[GPT](https://github.com/openai/finetune-transformer-lm)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
-3. **[GPT-2](https://blog.openai.com/better-language-models/)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
-4. **[Transformer-XL](https://github.com/kimiyoung/transformer-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
-5. **[XLNet](https://github.com/zihangdai/xlnet/)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
-6. **[XLM](https://github.com/facebookresearch/XLM/)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
-7. **[RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/roberta)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
-8. **[DistilBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/master/examples/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation) and a German version of DistilBERT.
-9. **[CTRL](https://github.com/salesforce/ctrl/)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
-10. **[CamemBERT](https://camembert-model.fr)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
-11. **[ALBERT](https://github.com/google-research/ALBERT)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
-12. **[T5](https://github.com/google-research/text-to-text-transfer-transformer)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
-13. **[XLM-RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/xlmr)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
+1. **[BERT](https://huggingface.co/transformers/model_doc/bert.html)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
+2. **[GPT](https://huggingface.co/transformers/model_doc/gpt.html)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
+3. **[GPT-2](https://huggingface.co/transformers/model_doc/gpt2.html)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
+4. **[Transformer-XL](https://huggingface.co/transformers/model_doc/transformerxl.html)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
+5. **[XLNet](https://huggingface.co/transformers/model_doc/xlnet.html)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
+6. **[XLM](https://huggingface.co/transformers/model_doc/xlm.html)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
+7. **[RoBERTa](https://huggingface.co/transformers/model_doc/roberta.html)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
+8. **[DistilBERT](https://huggingface.co/transformers/model_doc/distilbert.html)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/master/examples/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation) and a German version of DistilBERT.
+9. **[CTRL](https://huggingface.co/transformers/model_doc/ctrl.html)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
+10. **[CamemBERT](https://huggingface.co/transformers/model_doc/camembert.html)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
+11. **[ALBERT](https://huggingface.co/transformers/model_doc/albert.html)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
+12. **[T5](https://huggingface.co/transformers/model_doc/t5.html)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
+13. **[XLM-RoBERTa](https://huggingface.co/transformers/model_doc/xlmroberta.html)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
 14. **[MMBT](https://github.com/facebookresearch/mmbt/)** (from Facebook), released together with the paper a [Supervised Multimodal Bitransformers for Classifying Images and Text](https://arxiv.org/pdf/1909.02950.pdf) by Douwe Kiela, Suvrat Bhooshan, Hamed Firooz, Davide Testuggine.
-15. **[FlauBERT](https://github.com/getalp/Flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
-16. **[BART](https://github.com/pytorch/fairseq/tree/master/examples/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/pdf/1910.13461.pdf) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer.
-17. **[Other community models](https://huggingface.co/models)**, contributed by the [community](https://huggingface.co/users).
-18. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedbacks before starting your PR.
+15. **[FlauBERT](https://huggingface.co/transformers/model_doc/flaubert.html)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
+16. **[BART](https://huggingface.co/transformers/model_doc/bart.html)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/pdf/1910.13461.pdf) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer.
+17. **[ELECTRA](https://huggingface.co/transformers/model_doc/electra.html)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
+18. **[DialoGPT](https://huggingface.co/transformers/model_doc/dialogpt.html)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
+19. **[Reformer](https://huggingface.co/transformers/model_doc/reformer.html)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
+20. **[MarianMT](https://huggingface.co/transformers/model_doc/marian.html)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
+21. **[Longformer](https://huggingface.co/transformers/model_doc/longformer.html)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
+22. **[Other community models](https://huggingface.co/models)**, contributed by the [community](https://huggingface.co/users).
+23. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedback before starting your PR.

 These implementations have been tested on several datasets (see the example scripts) and should match the performances of the original implementations (e.g. ~93 F1 on SQuAD for BERT Whole-Word-Masking, ~88 F1 on RocStories for OpenAI GPT, ~18.3 perplexity on WikiText 103 for Transformer-XL, ~0.916 Peason R coefficient on STS-B for XLNet). You can find more details on the performances in the Examples section of the [documentation](https://huggingface.co/transformers/examples.html).

@@ -304,8 +306,9 @@ setup your environment to run the examples.

 The library comprises several example scripts with SOTA performances for NLU and NLG tasks:

- `run_glue.py`: an example fine-tuning Bert, XLNet and XLM on nine different GLUE tasks (*sequence-level classification*)
- `run_squad.py`: an example fine-tuning Bert, XLNet and XLM on the question answering dataset SQuAD 2.0 (*token-level classification*)
+- `run_glue.py`: an example fine-tuning sequence classification models on nine different GLUE tasks (*sequence-level classification*)
+- `run_squad.py`: an example fine-tuning question answering models on the question answering dataset SQuAD 2.0 (*token-level classification*)
+- `run_ner.py`: an example fine-tuning token classification models on named entity recognition (*token-level classification*)
 - `run_generation.py`: an example using GPT, GPT-2, CTRL, Transformer-XL and XLNet for conditional language generation
 - other model-specific examples (see the documentation).

@@ -315,7 +318,7 @@ Here are three quick usage examples for these scripts:

 The [General Language Understanding Evaluation (GLUE) benchmark](https://gluebenchmark.com/) is a collection of nine sentence- or sentence-pair language understanding tasks for evaluating and analyzing natural language understanding systems.

-Before running anyone of these GLUE tasks you should download the
+Before running any of these GLUE tasks you should download the
 [GLUE data](https://gluebenchmark.com/tasks) by running
 [this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e)
 and unpack it to some directory `$GLUE_DIR`.
@@ -330,17 +333,15 @@ pip install -r ./examples/requirements.txt
 export GLUE_DIR=/path/to/glue
 export TASK_NAME=MRPC

-python ./examples/run_glue.py \
-    --model_type bert \
+python ./examples/text-classification/run_glue.py \
    --model_name_or_path bert-base-uncased \
    --task_name $TASK_NAME \
    --do_train \
    --do_eval \
-    --do_lower_case \
    --data_dir $GLUE_DIR/$TASK_NAME \
    --max_seq_length 128 \
-    --per_gpu_eval_batch_size=8   \
-    --per_gpu_train_batch_size=8   \
+    --per_device_eval_batch_size=8   \
+    --per_device_train_batch_size=8   \
    --learning_rate 2e-5 \
    --num_train_epochs 3.0 \
    --output_dir /tmp/$TASK_NAME/
@@ -358,8 +359,7 @@ Parallel training is a simple way to use several GPUs (but is slower and less fl
 ```shell
 export GLUE_DIR=/path/to/glue

-python ./examples/run_glue.py \
-    --model_type xlnet \
+python ./examples/text-classification/run_glue.py \
    --model_name_or_path xlnet-large-cased \
    --do_train  \
    --do_eval   \
@@ -367,8 +367,8 @@ python ./examples/run_glue.py \
    --data_dir=${GLUE_DIR}/STS-B  \
    --output_dir=./proc_data/sts-b-110   \
    --max_seq_length=128   \
-    --per_gpu_eval_batch_size=8   \
-    --per_gpu_train_batch_size=8   \
+    --per_device_eval_batch_size=8   \
+    --per_device_train_batch_size=8   \
    --gradient_accumulation_steps=1 \
    --max_steps=1200  \
    --model_name=xlnet-large-cased   \
@@ -384,17 +384,15 @@ On this machine we thus have a batch size of 32, please increase `gradient_accum
 This example code fine-tunes the Bert Whole Word Masking model on the Microsoft Research Paraphrase Corpus (MRPC) corpus using distributed training on 8 V100 GPUs to reach a F1 > 92.

 ```bash
-python -m torch.distributed.launch --nproc_per_node 8 ./examples/run_glue.py   \
-    --model_type bert \
+python -m torch.distributed.launch --nproc_per_node 8 ./examples/text-classification/run_glue.py   \
    --model_name_or_path bert-large-uncased-whole-word-masking \
    --task_name MRPC \
    --do_train   \
    --do_eval   \
-    --do_lower_case   \
    --data_dir $GLUE_DIR/MRPC/   \
    --max_seq_length 128   \
-    --per_gpu_eval_batch_size=8   \
-    --per_gpu_train_batch_size=8   \
+    --per_device_eval_batch_size=8   \
+    --per_device_train_batch_size=8   \
    --learning_rate 2e-5   \
    --num_train_epochs 3.0  \
    --output_dir /tmp/mrpc_output/ \
@@ -418,12 +416,11 @@ Training with these hyper-parameters gave us the following results:
 This example code fine-tunes BERT on the SQuAD dataset using distributed training on 8 V100 GPUs and Bert Whole Word Masking uncased model to reach a F1 > 93 on SQuAD:

 ```bash
-python -m torch.distributed.launch --nproc_per_node=8 ./examples/run_squad.py \
+python -m torch.distributed.launch --nproc_per_node=8 ./examples/question-answering/run_squad.py \
    --model_type bert \
    --model_name_or_path bert-large-uncased-whole-word-masking \
    --do_train \
    --do_eval \
-    --do_lower_case \
    --train_file $SQUAD_DIR/train-v1.1.json \
    --predict_file $SQUAD_DIR/dev-v1.1.json \
    --learning_rate 3e-5 \
@@ -431,8 +428,8 @@ python -m torch.distributed.launch --nproc_per_node=8 ./examples/run_squad.py \
    --max_seq_length 384 \
    --doc_stride 128 \
    --output_dir ../models/wwm_uncased_finetuned_squad/ \
-    --per_gpu_eval_batch_size=3   \
-    --per_gpu_train_batch_size=3   \
+    --per_device_eval_batch_size=3   \
+    --per_device_train_batch_size=3   \
 ```

 Training with these hyper-parameters gave us the following results:
@@ -452,7 +449,7 @@ The generation script includes the [tricks](https://github.com/rusiaaman/XLNet-g
 Here is how to run the script with the small version of OpenAI GPT-2 model:

 ```shell
-python ./examples/run_generation.py \
+python ./examples/text-generation/run_generation.py \
    --model_type=gpt2 \
    --length=20 \
    --model_name_or_path=gpt2 \
@@ -460,7 +457,7 @@ python ./examples/run_generation.py \

 and from the Salesforce CTRL model:
 ```shell
-python ./examples/run_generation.py \
+python ./examples/text-generation/run_generation.py \
    --model_type=ctrl \
    --length=20 \
    --model_name_or_path=ctrl \
@@ -537,6 +534,8 @@ You can create `Pipeline` objects for the following down-stream tasks:
 - `text-classification`: Initialize a `TextClassificationPipeline` directly, or see `sentiment-analysis` for an example.
 - `question-answering`: Provided some context and a question referring to the context, it will extract the answer to the question in the context.
 - `fill-mask`: Takes an input sequence containing a masked token (e.g. `<mask>`) and returns a list of the most probable filled sequences, with their probabilities.
+ - `summarization`
+ - `translation_xx_to_yy`

 ```python
 from transformers import pipeline
--- a/docs/README.md
+++ b/docs/README.md
@@ -47,6 +47,8 @@ Once you have setup `sphinx`, you can build the documentation by running the fol
 make html
 ```

+A folder called ``_build/html`` should have been created. You can now open the file ``_build/html/index.html`` in your browser. 
+
 ---
 **NOTE**

@@ -65,3 +67,131 @@ It should build the static app that will be available under `/docs/_build/html`

 Accepted files are reStructuredText (.rst) and Markdown (.md). Create a file with its extension and put it
 in the source directory. You can then link it to the toc-tree by putting the filename without the extension.
+
+## Writing Documentation - Specification
+
+The `huggingface/transformers` documentation follows the
+[Google documentation](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) style. It is
+mostly written in ReStructuredText 
+([Sphinx simple documentation](https://www.sphinx-doc.org/en/master/usage/restructuredtext/index.html), 
+[Sourceforge complete documentation](https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html)).
+
+### Adding a new section
+
+A section is a page held in the `Notes` toc-tree on the documentation. Adding a new section is done in two steps:
+
+- Add a new file under `./source`. This file can either be ReStructuredText (.rst) or Markdown (.md).
+- Link that file in `./source/index.rst` on the correct toc-tree.
+
+### Adding a new model
+
+When adding a new model:
+ 
+- Create a file `xxx.rst` under `./source/model_doc`. 
+- Link that file in `./source/index.rst` on the `model_doc` toc-tree.
+- Write a short overview of the model:
+    - Overview with paper & authors
+    - Paper abstract
+    - Tips and tricks and how to use it best
+- Add the classes that should be linked in the model. This generally includes the configuration, the tokenizer, and
+  every model of that class (the base model, alongside models with additional heads), both in PyTorch and TensorFlow.
+  The order is generally: 
+    - Configuration, 
+    - Tokenizer
+    - PyTorch base model
+    - PyTorch head models
+    - TensorFlow base model
+    - TensorFlow head models
+
+These classes should be added using the RST syntax. Usually as follows:
+```
+XXXConfig
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.XXXConfig
+    :members:
+```
+
+This will include every public method of the configuration. If for some reason you wish for a method not to be displayed
+in the documentation, you can do so by specifying which methods should be in the docs:
+
+```
+XXXTokenizer
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.XXXTokenizer
+    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences, save_vocabulary
+
+```
+
+### Writing source documentation
+
+Values that should be put in `code` should either be surrounded by double backticks: \`\`like so\`\` or be written as an object
+using the :obj: syntax: :obj:\`like so\`.
+
+When mentioning a class, it is recommended to use the :class: syntax as the mentioned class will be automatically
+linked by Sphinx: :class:\`transformers.XXXClass\`
+
+When mentioning a function, it is recommended to use the :func: syntax as the mentioned method will be automatically
+linked by Sphinx: :func:\`transformers.XXXClass.method\`
+
+Links should be done as so (note the double underscore at the end): \`text for the link <./local-link-or-global-link#loc>\`__
+
+#### Defining arguments in a method
+
+Arguments should be defined with the `Args:` prefix, followed by a line return and an indentation. 
+The argument should be followed by its type, with its shape if it is a tensor, and a line return.
+Another indentation is necessary before writing the description of the argument.
+
+Here's an example showcasing everything so far:
+
+```
+    Args:
+        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using :class:`transformers.AlbertTokenizer`.
+            See :func:`transformers.PreTrainedTokenizer.encode` and
+            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
+
+            `What are input IDs? <../glossary.html#input-ids>`__
+```
+
+#### Writing a multi-line code block 
+
+Multi-line code blocks can be useful for displaying examples. They are done like so:
+
+```
+Example::
+
+    # first line of code
+    # second line
+    # etc
+```
+
+The `Example` string at the beginning can be replaced by anything as long as there are two colons following it.
+
+#### Writing a return block
+
+Return values should be defined with the `Returns:` prefix, followed by a line return and an indentation.
+The first line should be the type of the return, followed by a line return. No need to indent further for the elements
+building the return.
+
+Here's an example for tuple return, comprising several objects:
+
+```
+    Returns:
+        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
+        loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss.
+        prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+```
+
+Here's an example for a single value return:
+
+```
+    Returns:
+        A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+```
--- a/docs/source/bertology.rst
+++ b/docs/source/bertology.rst
@@ -8,11 +8,11 @@ There is a growing field of study concerned with investigating the inner working
 * Are Sixteen Heads Really Better than One? by Paul Michel, Omer Levy, Graham Neubig: https://arxiv.org/abs/1905.10650
 * What Does BERT Look At? An Analysis of BERT's Attention by Kevin Clark, Urvashi Khandelwal, Omer Levy, Christopher D. Manning: https://arxiv.org/abs/1906.04341

-In order to help this new field develop, we have included a few additional features in the BERT/GPT/GPT-2 models to help people access the inner representations, mainly adapted  from the great work of Paul Michel (https://arxiv.org/abs/1905.10650):
+In order to help this new field develop, we have included a few additional features in the BERT/GPT/GPT-2 models to help people access the inner representations, mainly adapted from the great work of Paul Michel (https://arxiv.org/abs/1905.10650):


 * accessing all the hidden-states of BERT/GPT/GPT-2,
 * accessing all the attention weights for each head of BERT/GPT/GPT-2,
 * retrieving heads output values and gradients to be able to compute head importance score and prune head as explained in https://arxiv.org/abs/1905.10650.

-To help you understand and use these features, we have added a specific example script: `bertology.py <https://github.com/huggingface/transformers/blob/master/examples/run_bertology.py>`_ while extract information and prune a model pre-trained on GLUE.
+To help you understand and use these features, we have added a specific example script: `bertology.py <https://github.com/huggingface/transformers/blob/master/examples/bertology/run_bertology.py>`_ which extracts information and prunes a model pre-trained on GLUE.
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -26,7 +26,7 @@ author = u'huggingface'
 # The short X.Y version
 version = u''
 # The full version, including alpha/beta/rc tags
-release = u'2.6.0'
+release = u'2.11.0'


 # -- General configuration ---------------------------------------------------
--- a/docs/source/converting_tensorflow_models.rst
+++ b/docs/source/converting_tensorflow_models.rst
@@ -12,7 +12,7 @@ A command-line interface is provided to convert original Bert/GPT/GPT-2/Transfor
 BERT
 ^^^^

-You can convert any TensorFlow checkpoint for BERT (in particular `the pre-trained models released by Google <https://github.com/google-research/bert#pre-trained-models>`_\ ) in a PyTorch save file by using the `convert_tf_checkpoint_to_pytorch.py <https://github.com/huggingface/transformers/blob/master/transformers/convert_tf_checkpoint_to_pytorch.py>`_ script.
+You can convert any TensorFlow checkpoint for BERT (in particular `the pre-trained models released by Google <https://github.com/google-research/bert#pre-trained-models>`_\ ) in a PyTorch save file by using the `convert_bert_original_tf_checkpoint_to_pytorch.py <https://github.com/huggingface/transformers/blob/master/src/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py>`_ script.

 This CLI takes as input a TensorFlow checkpoint (three files starting with ``bert_model.ckpt``\ ) and the associated configuration file (\ ``bert_config.json``\ ), and creates a PyTorch model for this configuration, loads the weights from the TensorFlow checkpoint in the PyTorch model and saves the resulting model in a standard PyTorch save file that can be imported using ``torch.load()`` (see examples in `run_bert_extract_features.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_extract_features.py>`_\ , `run_bert_classifier.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_classifier.py>`_ and `run_bert_squad.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_squad.py>`_\ ).

@@ -33,6 +33,26 @@ Here is an example of the conversion process for a pre-trained ``BERT-Base Uncas

 You can download Google's pre-trained models for the conversion `here <https://github.com/google-research/bert#pre-trained-models>`__.

+ALBERT
+^^^^^^
+
+Convert TensorFlow model checkpoints of ALBERT to PyTorch using the `convert_albert_original_tf_checkpoint_to_pytorch.py <https://github.com/huggingface/transformers/blob/master/src/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py>`_ script.
+
+The CLI takes as input a TensorFlow checkpoint (three files starting with ``model.ckpt-best``\ ) and the accompanying configuration file (\ ``albert_config.json``\ ), then creates and saves a PyTorch model. To run this conversion you will need to have TensorFlow and PyTorch installed.
+
+Here is an example of the conversion process for the pre-trained ``ALBERT Base`` model:
+
+.. code-block:: shell
+
+   export ALBERT_BASE_DIR=/path/to/albert/albert_base
+
+   transformers-cli convert --model_type albert \
+     --tf_checkpoint $ALBERT_BASE_DIR/model.ckpt-best \
+     --config $ALBERT_BASE_DIR/albert_config.json \
+     --pytorch_dump_output $ALBERT_BASE_DIR/pytorch_model.bin
+
+You can download Google's pre-trained models for the conversion `here <https://github.com/google-research/albert#pre-trained-models>`__.
+
 OpenAI GPT
 ^^^^^^^^^^

--- a/docs/source/glossary.rst
+++ b/docs/source/glossary.rst
@@ -143,3 +143,14 @@ positional embeddings.

 Absolute positional embeddings are selected in the range ``[0, config.max_position_embeddings - 1]``. Some models
 use other types of positional embeddings, such as sinusoidal position embeddings or relative position embeddings.
+
+
+Feed Forward Chunking
+--------------------------
+
+In transformers, two feed forward layers usually follow the self attention layer in each residual attention block. The intermediate embedding size of the feed forward layers is often bigger than the hidden size of the model (*e.g.* for ``bert-base-uncased``).
+
+For an input of size ``[batch_size, sequence_length]``, the memory required to store the intermediate feed forward embeddings ``[batch_size, sequence_length, config.intermediate_size]`` can account for a large fraction of the memory use. The authors of `Reformer: The Efficient Transformer <https://arxiv.org/abs/2001.04451>`_ noticed that since the computation is independent of the ``sequence_length`` dimension, it is mathematically equivalent to compute the output embeddings of both feed forward layers ``[batch_size, config.hidden_size]_0, ..., [batch_size, config.hidden_size]_n``  individually and concat them afterward to ``[batch_size, sequence_length, config.hidden_size]`` with ``n = sequence_length``, which trades increased computation time against reduced memory use, but yields a mathematically **equivalent** result.
+
+For models employing the function :func:`~.transformers.apply_chunking_to_forward`, the ``chunk_size`` defines the number of output embeddings that are computed in parallel and thus defines the trade-off between memory and time complexity. 
+If ``chunk_size`` is set to 0, no feed forward chunking is done.
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -89,6 +89,7 @@ The library currently contains PyTorch and Tensorflow implementations, pre-train
    :caption: Package Reference

    model_doc/auto
+    model_doc/encoderdecoder
    model_doc/bert
    model_doc/gpt
    model_doc/transformerxl
@@ -103,3 +104,9 @@ The library currently contains PyTorch and Tensorflow implementations, pre-train
    model_doc/xlmroberta
    model_doc/flaubert
    model_doc/bart
+    model_doc/t5
+    model_doc/electra
+    model_doc/dialogpt
+    model_doc/reformer
+    model_doc/marian
+    model_doc/longformer
--- a/docs/source/main_classes/model.rst
+++ b/docs/source/main_classes/model.rst
@@ -14,6 +14,12 @@ The base class ``PreTrainedModel`` implements the common methods for loading/sav
 .. autoclass:: transformers.PreTrainedModel
    :members:

+``Helper Functions``
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autofunction:: transformers.apply_chunking_to_forward
+
+
 ``TFPreTrainedModel``
 ~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/main_classes/pipelines.rst
+++ b/docs/source/main_classes/pipelines.rst
@@ -66,3 +66,9 @@ SummarizationPipeline
 ==========================================

 .. autoclass:: transformers.SummarizationPipeline
+
+
+TextGenerationPipeline
+==========================================
+
+.. autoclass:: transformers.TextGenerationPipeline
--- a/docs/source/main_classes/processors.rst
+++ b/docs/source/main_classes/processors.rst
@@ -54,7 +54,7 @@ Additionally, the following method  can be used to load values from a data file
 Example usage
 ^^^^^^^^^^^^^^^^^^^^^^^^^

-An example using these processors is given in the `run_glue.py <https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_glue.py>`__ script.
+An example using these processors is given in the `run_glue.py <https://github.com/huggingface/pytorch-transformers/blob/master/examples/text-classification/run_glue.py>`__ script.


 XNLI
@@ -74,7 +74,7 @@ This library hosts the processor to load the XNLI data:
 Please note that since the gold labels are available on the test set, evaluation is performed on the test set.

 An example using these processors is given in the
-`run_xnli.py <https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_xnli.py>`__ script.
+`run_xnli.py <https://github.com/huggingface/pytorch-transformers/blob/master/examples/text-classification/run_xnli.py>`__ script.


 SQuAD
@@ -150,4 +150,4 @@ Example::


 Another example using these processors is given in the
-`run_squad.py <https://github.com/huggingface/transformers/blob/master/examples/run_squad.py>`__ script.
+`run_squad.py <https://github.com/huggingface/transformers/blob/master/examples/question-answering/run_squad.py>`__ script.
--- a/docs/source/main_classes/tokenizer.rst
+++ b/docs/source/main_classes/tokenizer.rst
@@ -1,16 +1,38 @@
 Tokenizer
 ----------------------------------------------------

-The base class ``PreTrainedTokenizer`` implements the common methods for loading/saving a tokenizer either from a local file or directory, or from a pretrained tokenizer provided by the library (downloaded from HuggingFace's AWS S3 repository).
+A tokenizer is in charge of preparing the inputs for a model. The library comprises tokenizers for all the models. Most of the tokenizers are available in two flavors: a full python implementation and a "Fast" implementation based on the Rust library `tokenizers`. The "Fast" implementations allow (1) a significant speed-up, in particular when doing batched tokenization, and (2) additional methods to map between the original string (character and words) and the token space (e.g. getting the index of the token comprising a given character or the span of characters corresponding to a given token). Currently no "Fast" implementation is available for the SentencePiece-based tokenizers (for T5, ALBERT, CamemBERT, XLMRoBERTa and XLNet models).

-``PreTrainedTokenizer`` is the main entry point into tokenizers as it also implements the main methods for using all the tokenizers:
+The base classes ``PreTrainedTokenizer`` and ``PreTrainedTokenizerFast`` implement the common methods for encoding string inputs into model inputs (see below) and instantiating/saving python and "Fast" tokenizers either from a local file or directory or from a pretrained tokenizer provided by the library (downloaded from HuggingFace's AWS S3 repository).

- tokenizing, converting tokens to ids and back and encoding/decoding,
+``PreTrainedTokenizer`` and ``PreTrainedTokenizerFast`` thus implement the main methods for using all the tokenizers:
+
+- tokenizing (splitting strings into sub-word token strings), converting token strings to ids and back, and encoding/decoding (i.e. tokenizing + converting to integers),
 - adding new tokens to the vocabulary in a way that is independent of the underlying structure (BPE, SentencePiece...),
- managing special tokens (adding them, assigning them to roles, making sure they are not split during tokenization)
+- managing special tokens like mask, beginning-of-sentence, etc tokens (adding them, assigning them to attributes in the tokenizer for easy access and making sure they are not split during tokenization)
+
+``BatchEncoding`` holds the output of the tokenizer's encoding methods (``encode_plus`` and ``batch_encode_plus``) and is derived from a Python dictionary. When the tokenizer is a pure python tokenizer, this class behaves just like a standard python dictionary and holds the various model inputs computed by these methods (``input_ids``, ``attention_mask``...). When the tokenizer is a "Fast" tokenizer (i.e. backed by the HuggingFace tokenizers library), this class additionally provides several advanced alignment methods which can be used to map between the original string (character and words) and the token space (e.g. getting the index of the token comprising a given character or the span of characters corresponding to a given token).

 ``PreTrainedTokenizer``
 ~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.PreTrainedTokenizer
    :members:
+
+``PreTrainedTokenizerFast``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.PreTrainedTokenizerFast
+    :members:
+
+``BatchEncoding``
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BatchEncoding
+    :members:
+
+``SpecialTokensMixin``
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.SpecialTokensMixin
+    :members:
--- a/docs/source/migration.md
+++ b/docs/source/migration.md
@@ -1,5 +1,18 @@
-# Migrating from pytorch-pretrained-bert
+# Migrating from previous packages

+## Migrating from pytorch-transformers to transformers
+
+Here is a quick summary of what you should take care of when migrating from `pytorch-transformers` to `transformers`.
+
+### Positional order of some models' keywords inputs (`attention_mask`, `token_type_ids`...) changed
+
+To be able to use Torchscript (see #1010, #1204 and #1195) the specific order of some models' **keyword inputs** (`attention_mask`, `token_type_ids`...) has been changed.
+
+If you used to call the models with keyword names for keyword arguments, e.g. `model(inputs_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)`, this should not cause any change.
+
+If you used to call the models with positional inputs for keyword arguments, e.g. `model(inputs_ids, attention_mask, token_type_ids)`, you may have to double check the exact order of input arguments.
+
+## Migrating from pytorch-pretrained-bert

 Here is a quick summary of what you should take care of when migrating from `pytorch-pretrained-bert` to `transformers`

@@ -27,7 +40,7 @@ loss = outputs[0]
 # In transformers you can also have access to the logits:
 loss, logits = outputs[:2]

-# And even the attention weigths if you configure the model to output them (and other outputs too, see the docstrings and documentation)
+# And even the attention weights if you configure the model to output them (and other outputs too, see the docstrings and documentation)
 model = BertForSequenceClassification.from_pretrained('bert-base-uncased', output_attentions=True)
 outputs = model(input_ids, labels=labels)
 loss, logits, attentions = outputs
--- a/docs/source/model_doc/albert.rst
+++ b/docs/source/model_doc/albert.rst
@@ -6,7 +6,7 @@ Overview

 The ALBERT model was proposed in `ALBERT: A Lite BERT for Self-supervised Learning of Language Representations <https://arxiv.org/abs/1909.11942>`_
 by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. It presents
-two parameter-reduction techniques to lower memory consumption and increase the trainig speed of BERT:
+two parameter-reduction techniques to lower memory consumption and increase the training speed of BERT:

 - Splitting the embedding matrix into two smaller matrices
 - Using repeating layers split among groups
@@ -30,6 +30,8 @@ Tips:
  similar to a BERT-like architecture with the same number of hidden layers as it has to iterate through the same
  number of (repeating) layers.

+The original code can be found `here <https://github.com/google-research/ALBERT>`_.
+
 AlbertConfig
 ~~~~~~~~~~~~~~~~~~~~~

@@ -92,3 +94,17 @@ TFAlbertForSequenceClassification

 .. autoclass:: transformers.TFAlbertForSequenceClassification
    :members:
+
+
+TFAlbertForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFAlbertForMultipleChoice
+    :members:
+
+
+TFAlbertForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFAlbertForQuestionAnswering
+    :members:
--- a/docs/source/model_doc/bart.rst
+++ b/docs/source/model_doc/bart.rst
@@ -1,6 +1,6 @@
 Bart
 ----------------------------------------------------
-**DISCLAIMER:** This model is still a work in progress, if you see something strange,
+**DISCLAIMER:** If you see something strange,
 file a `Github Issue <https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title>`__ and assign
@sshleifer

@@ -22,7 +22,7 @@ Implementation Notes
 - The forward pass of ``BartModel`` will create decoder inputs (using the helper function ``transformers.modeling_bart._prepare_bart_decoder_inputs``)  if they are not passed. This is different than some other modeling APIs.
 - Model predictions are intended to be identical to the original implementation. This only works, however, if the string you pass to ``fairseq.encode`` starts with a space.
 - ``BartForConditionalGeneration.generate`` should be used for conditional generation tasks like summarization, see the example in that docstrings
- Models that load the ``"bart-large-cnn"`` weights will not have a ``mask_token_id``, or be able to perform mask filling tasks.
+- Models that load the ``"facebook/bart-large-cnn"`` weights will not have a ``mask_token_id``, or be able to perform mask filling tasks.



--- a/docs/source/model_doc/bert.rst
+++ b/docs/source/model_doc/bert.rst
@@ -35,6 +35,8 @@ Tips:
  prediction rather than a token prediction. However, averaging over the sequence may yield better results than using
  the [CLS] token.

+The original code can be found `here <https://github.com/google-research/bert>`_.
+
 BertConfig
 ~~~~~~~~~~~~~~~~~~~~~

@@ -50,6 +52,13 @@ BertTokenizer
        create_token_type_ids_from_sequences, save_vocabulary


+BertTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BertTokenizerFast
+    :members:
+
+
 BertModel
 ~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_doc/camembert.rst
+++ b/docs/source/model_doc/camembert.rst
@@ -22,6 +22,8 @@ Tips:
 - This implementation is the same as RoBERTa. Refer to the `documentation of RoBERTa <./roberta.html>`__ for usage
  examples as well as the information relative to the inputs and outputs.

+The original code can be found `here <https://camembert-model.fr/>`_.
+
 CamembertConfig
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_doc/ctrl.rst
+++ b/docs/source/model_doc/ctrl.rst
@@ -31,6 +31,8 @@ Tips:
  See `reusing the past in generative models <../quickstart.html#using-the-past>`_ for more information on the usage
  of this argument.

+The original code can be found `here <https://github.com/salesforce/ctrl>`_.
+

 CTRLConfig
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--- a/docs/source/model_doc/dialogpt.rst
+++ b/docs/source/model_doc/dialogpt.rst
@@ -0,0 +1,39 @@
+DialoGPT
+----------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~
+
+DialoGPT was proposed in
+`DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation <https://arxiv.org/abs/1911.00536>`_
+by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
+It's a GPT2 Model trained on 147M conversation-like exchanges extracted from Reddit.
+
+The abstract from the paper is the following:
+
+*We present a large, tunable neural conversational response generation model, DialoGPT (dialogue generative pre-trained transformer). 
+Trained on 147M conversation-like exchanges extracted from Reddit comment chains over a period spanning from 2005 through 2017, DialoGPT extends the Hugging Face PyTorch transformer to attain a performance close to human both in terms of automatic and human evaluation in single-turn dialogue settings.
+We show that conversational systems that leverage DialoGPT generate more relevant, contentful and context-consistent responses than strong baseline systems.
+The pre-trained model and training pipeline are publicly released to facilitate research into neural response generation and the development of more intelligent open-domain dialogue systems.*
+
+Tips:
+
+- DialoGPT is a model with absolute position embeddings so it's usually advised to pad the inputs on
+  the right rather than the left.
+- DialoGPT was trained with a causal language modeling (CLM) objective on conversational data and is therefore powerful at response generation in open-domain dialogue systems.
+- DialoGPT enables the user to create a chat bot in just 10 lines of code as shown on `DialoGPT's model card <https://huggingface.co/microsoft/DialoGPT-medium>`_.
+
+Training:
+
+In order to train or fine-tune DialoGPT, one can use causal language modeling training. 
+To cite the official paper: 
+*We follow the OpenAI GPT-2 to model a multiturn dialogue session 
+as a long text and frame the generation task as language modeling. We first
+concatenate all dialog turns within a dialogue session into a long text 
+x_1,..., x_N (N is the sequence length), ended by the end-of-text token.* 
+For more information please refer to the original paper.
+    
+
+DialoGPT's architecture is based on the GPT2 model, so one can refer to GPT2's `docstring <https://huggingface.co/transformers/model_doc/gpt2.html>`_.
+
+The original code can be found `here <https://github.com/microsoft/DialoGPT>`_.
--- a/docs/source/model_doc/distilbert.rst
+++ b/docs/source/model_doc/distilbert.rst
@@ -27,6 +27,8 @@ Tips:
 - DistilBert doesn't have `token_type_ids`, you don't need to indicate which token belongs to which segment. Just separate your segments with the separation token `tokenizer.sep_token` (or `[SEP]`)
 - DistilBert doesn't have options to select the input positions (`position_ids` input). This could be added if necessary though, just let us know if you need this option.

+The original code can be found `here <https://github.com/huggingface/transformers/tree/master/examples/distillation>`_.
+

 DistilBertConfig
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -42,6 +44,13 @@ DistilBertTokenizer
    :members:


+DistilBertTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.DistilBertTokenizerFast
+    :members:
+
+
 DistilBertModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_doc/electra.rst
+++ b/docs/source/model_doc/electra.rst
@@ -0,0 +1,124 @@
+ELECTRA
+----------------------------------------------------
+
+The ELECTRA model was proposed in the paper
+`ELECTRA: Pre-training Text Encoders as Discriminators Rather Than Generators <https://openreview.net/pdf?id=r1xMH1BtvB>`__.
+ELECTRA is a new pre-training approach which trains two transformer models: the generator and the discriminator. The
+generator's role is to replace tokens in a sequence, and is therefore trained as a masked language model. The discriminator,
+which is the model we're interested in, tries to identify which tokens were replaced by the generator in the sequence.
+
+The abstract from the paper is the following:
+
+*Masked language modeling (MLM) pre-training methods such as BERT corrupt
+the input by replacing some tokens with [MASK] and then train a model to
+reconstruct the original tokens. While they produce good results when transferred
+to downstream NLP tasks, they generally require large amounts of compute to be
+effective. As an alternative, we propose a more sample-efficient pre-training task
+called replaced token detection. Instead of masking the input, our approach
+corrupts it by replacing some tokens with plausible alternatives sampled from a small
+generator network. Then, instead of training a model that predicts the original
+identities of the corrupted tokens, we train a discriminative model that predicts
+whether each token in the corrupted input was replaced by a generator sample
+or not. Thorough experiments demonstrate this new pre-training task is more
+efficient than MLM because the task is defined over all input tokens rather than
+just the small subset that was masked out. As a result, the contextual representations
+learned by our approach substantially outperform the ones learned by BERT
+given the same model size, data, and compute. The gains are particularly strong
+for small models; for example, we train a model on one GPU for 4 days that
+outperforms GPT (trained using 30x more compute) on the GLUE natural language
+understanding benchmark. Our approach also works well at scale, where it
+performs comparably to RoBERTa and XLNet while using less than 1/4 of their
+compute and outperforms them when using the same amount of compute.*
+
+Tips:
+
+- ELECTRA is the pre-training approach, therefore there are nearly no changes made to the underlying model: BERT. The
+  only change is the separation of the embedding size and the hidden size -> The embedding size is generally smaller,
+  while the hidden size is larger. An additional projection layer (linear) is used to project the embeddings from
+  their embedding size to the hidden size. In the case where the embedding size is the same as the hidden size, no
+  projection layer is used.
+- The ELECTRA checkpoints saved using `Google Research's implementation <https://github.com/google-research/electra>`__
+  contain both the generator and discriminator. The conversion script requires the user to name which model to export
+  into the correct architecture. Once converted to the HuggingFace format, these checkpoints may be loaded into all
+  available ELECTRA models, however. This means that the discriminator may be loaded in the `ElectraForMaskedLM` model,
+  and the generator may be loaded in the `ElectraForPreTraining` model (the classification head will be randomly
+  initialized as it doesn't exist in the generator).
+
+The original code can be found `here <https://github.com/google-research/electra>`_.
+
+
+ElectraConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ElectraConfig
+    :members:
+
+
+ElectraTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ElectraTokenizer
+    :members:
+
+
+ElectraTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ElectraTokenizerFast
+    :members:
+
+
+ElectraModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ElectraModel
+    :members:
+
+
+ElectraForPreTraining
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ElectraForPreTraining
+    :members:
+
+
+ElectraForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ElectraForMaskedLM
+    :members:
+
+
+ElectraForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ElectraForTokenClassification
+    :members:
+
+
+TFElectraModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFElectraModel
+    :members:
+
+
+TFElectraForPreTraining
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFElectraForPreTraining
+    :members:
+
+
+TFElectraForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFElectraForMaskedLM
+    :members:
+
+
+TFElectraForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFElectraForTokenClassification
+    :members:
--- a/docs/source/model_doc/encoderdecoder.rst
+++ b/docs/source/model_doc/encoderdecoder.rst
@@ -0,0 +1,23 @@
+Encoder Decoder Models
+----------------------------------------------------
+
+This class can wrap an encoder model, such as ``BertModel``, and a decoder model with a language modeling head, such as ``BertForMaskedLM``, into an encoder-decoder model.
+
+The ``EncoderDecoderModel`` class allows one to instantiate an encoder-decoder model using the ``from_encoder_decoder_pretrained`` class method, which takes a pretrained encoder and a pretrained decoder model as input.
+The ``EncoderDecoderModel`` is saved using the standard ``save_pretrained()`` method and can also again be loaded using the standard ``from_pretrained()`` method. 
+
+An application of this architecture could be *summarization* using two pretrained Bert models as is shown in the paper: `Text Summarization with Pretrained Encoders <https://arxiv.org/abs/1908.08345>`_ by Yang Liu and Mirella Lapata.
+
+
+``EncoderDecoderConfig``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.EncoderDecoderConfig
+    :members:
+
+
+``EncoderDecoderModel``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.EncoderDecoderModel
+    :members:
--- a/docs/source/model_doc/flaubert.rst
+++ b/docs/source/model_doc/flaubert.rst
@@ -20,6 +20,8 @@ of the time they outperform other pre-training approaches. Different versions of
 evaluation protocol for the downstream tasks, called FLUE (French Language Understanding Evaluation), are shared
 to the research community for further reproducible experiments in French NLP.*

+The original code can be found `here <https://github.com/getalp/Flaubert>`_.
+

 FlaubertConfig
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--- a/docs/source/model_doc/gpt.rst
+++ b/docs/source/model_doc/gpt.rst
@@ -36,6 +36,9 @@ Tips:
 `Write With Transformer <https://transformer.huggingface.co/doc/gpt>`__ is a webapp created and hosted by
 Hugging Face showcasing the generative capabilities of several models. GPT is one of them.

+The original code can be found `here <https://github.com/openai/finetune-transformer-lm>`_.
+
+
 OpenAIGPTConfig
 ~~~~~~~~~~~~~~~~~~~~~

@@ -50,6 +53,13 @@ OpenAIGPTTokenizer
    :members: save_vocabulary


+OpenAIGPTTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.OpenAIGPTTokenizerFast
+    :members:
+
+
 OpenAIGPTModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_doc/gpt2.rst
+++ b/docs/source/model_doc/gpt2.rst
@@ -34,6 +34,8 @@ Tips:
 Hugging Face showcasing the generative capabilities of several models. GPT-2 is one of them and is available in five
 different sizes: small, medium, large, xl and a distilled version of the small checkpoint: distilgpt-2.

+The original code can be found `here <https://github.com/openai/gpt-2>`_.
+

 GPT2Config
 ~~~~~~~~~~~~~~~~~~~~~
@@ -49,6 +51,13 @@ GPT2Tokenizer
    :members: save_vocabulary


+GPT2TokenizerFast
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.GPT2TokenizerFast
+    :members:
+
+
 GPT2Model
 ~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_doc/longformer.rst
+++ b/docs/source/model_doc/longformer.rst
@@ -0,0 +1,91 @@
+Longformer
+----------------------------------------------------
+**DISCLAIMER:** This model is still a work in progress, if you see something strange,
+file a `Github Issue <https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title>`_
+
+Overview
+~~~~~~~~~~~~~~~~~~~~
+The Longformer model was presented in `Longformer: The Long-Document Transformer <https://arxiv.org/pdf/2004.05150.pdf>`_ by Iz Beltagy, Matthew E. Peters, Arman Cohan.
+The abstract from the paper is the following:
+
+*Transformer-based models are unable to process long sequences due to their self-attention operation, which scales quadratically with the sequence length. To address this limitation, we introduce the Longformer with an attention mechanism that scales linearly with sequence length, making it easy to process documents of thousands of tokens or longer. Longformer's attention mechanism is a drop-in replacement for the standard self-attention and combines a local windowed attention with a task motivated global attention. Following prior work on long-sequence transformers, we evaluate Longformer on character-level language modeling and achieve state-of-the-art results on text8 and enwik8. In contrast to most prior work, we also pretrain Longformer and finetune it on a variety of downstream tasks. Our pretrained Longformer consistently outperforms RoBERTa on long document tasks and sets new state-of-the-art results on WikiHop and TriviaQA.*
+
+The Authors' code can be found `here <https://github.com/allenai/longformer>`_ .
+
+Longformer Self Attention
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Longformer self attention employs self attention on both a "local" context and a "global" context.
+Most tokens only attend "locally" to each other meaning that each token attends to its :math:`\frac{1}{2} w` previous tokens and :math:`\frac{1}{2} w` succeeding tokens with :math:`w` being the window length as defined in `config.attention_window`. Note that `config.attention_window` can be of type ``list`` to define a different :math:`w` for each layer.
+A selected few tokens attend "globally" to all other tokens, as it is conventionally done for all tokens in *e.g.* `BertSelfAttention`.
+
+Note that "locally" and "globally" attending tokens are projected by different query, key and value matrices.
+Also note that every "locally" attending token not only attends to tokens within its window :math:`w`, but also to all "globally" attending tokens so that global attention is *symmetric*.
+
+The user can define which tokens attend "locally" and which tokens attend "globally" by setting the tensor `global_attention_mask` at run-time appropriately. `Longformer` employs the following logic for `global_attention_mask`: `0` - the token attends "locally", `1` - token attends "globally". For more information please also refer to :func:`~transformers.LongformerModel.forward` method.
+
+Using Longformer self attention, the memory and time complexity of the query-key matmul operation, which usually represents the memory and time bottleneck, can be reduced from :math:`\mathcal{O}(n_s \times n_s)` to :math:`\mathcal{O}(n_s \times w)`, with :math:`n_s` being the sequence length and :math:`w` being the average window size. It is assumed that the number of "globally" attending tokens is insignificant as compared to the number of "locally" attending tokens.
+
+For more information, please refer to the official `paper <https://arxiv.org/pdf/2004.05150.pdf>`_ .
+
+
+Training
+~~~~~~~~~~~~~~~~~~~~
+``LongformerForMaskedLM`` is trained the exact same way ``RobertaForMaskedLM`` is trained and
+should be used as follows:
+
+::
+
+  input_ids = tokenizer.encode('This is a sentence from [MASK] training data', return_tensors='pt')
+  mlm_labels = tokenizer.encode('This is a sentence from the training data', return_tensors='pt')
+
+  loss = model(input_ids, labels=input_ids, masked_lm_labels=mlm_labels)[0]
+
+
+LongformerConfig
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LongformerConfig
+    :members:
+
+
+LongformerTokenizer
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LongformerTokenizer
+    :members: 
+
+
+LongformerModel
+~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LongformerModel
+    :members:
+
+
+LongformerForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LongformerForMaskedLM
+    :members:
+
+
+LongformerForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LongformerForQuestionAnswering
+    :members:
+
+
+LongformerForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LongformerForMultipleChoice
+    :members:
+
+
+LongformerForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LongformerForTokenClassification
+    :members:
+
--- a/docs/source/model_doc/marian.rst
+++ b/docs/source/model_doc/marian.rst
@@ -0,0 +1,105 @@
+MarianMT
+----------------------------------------------------
+**DISCLAIMER:** If you see something strange,
+file a `Github Issue <https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title>`__ and assign
+@sshleifer. Translations should be similar, but not identical to, output in the test set linked to in each model card.
+
+Implementation Notes
+~~~~~~~~~~~~~~~~~~~~
+- each model is about 298 MB on disk, there are 1,000+ models.
+- The list of supported language pairs can be found `here <https://huggingface.co/Helsinki-NLP>`__.
+- The 1,000+ models were originally trained by `Jörg Tiedemann <https://researchportal.helsinki.fi/en/persons/j%C3%B6rg-tiedemann>`__ using the `Marian <https://marian-nmt.github.io/>`_ C++ library, which supports fast training and translation.
+- All models are transformer encoder-decoders with 6 layers in each component. Each model's performance is documented in a model card.
+- the 80 opus models that require BPE preprocessing are not supported.
+- The modeling code is the same as ``BartForConditionalGeneration`` with a few minor modifications:
+    - static (sinusoid) positional embeddings (``MarianConfig.static_position_embeddings=True``)
+    - a new final_logits_bias (``MarianConfig.add_bias_logits=True``)
+    - no layernorm_embedding (``MarianConfig.normalize_embedding=False``)
+    - the model starts generating with pad_token_id (which has 0 token_embedding) as the prefix. (Bart uses ``</s>``)
+- Code to bulk convert models can be found in ``convert_marian_to_pytorch.py``
+
+Naming
+~~~~~~
+- All  model names use the following format: ``Helsinki-NLP/opus-mt-{src}-{tgt}``
+- The language codes used to name models are inconsistent. Two digit codes can usually be found `here <https://developers.google.com/admin-sdk/directory/v1/languages>`_, three digit codes require googling "language code {code}".
+- Codes formatted like ``es_AR`` are usually ``code_{region}``. That one is Spanish documents from Argentina.
+
+
+Multilingual Models
+~~~~~~~~~~~~~~~~~~~~
+
+All  model names use the following format: ``Helsinki-NLP/opus-mt-{src}-{tgt}``:
+    - if ``src`` is in all caps, the model supports multiple input languages, you can figure out which ones by looking at the model card, or the Group Members `mapping <https://gist.github.com/sshleifer/6d20e7761931b08e73c3219027b97b8a>`_ .
+    - if ``tgt`` is in all caps, the model can output multiple languages, and you should specify a language code by prepending the desired output language to the src_text
+    - You can see a tokenizer's supported language codes in ``tokenizer.supported_language_codes``
+
+Example of translating english to many romance languages, using language codes:
+
+.. code-block:: python
+
+    from transformers import MarianMTModel, MarianTokenizer
+    src_text = [
+        '>>fr<< this is a sentence in english that we want to translate to french',
+        '>>pt<< This should go to portuguese',
+        '>>es<< And this to Spanish'
+    ]
+
+    model_name = 'Helsinki-NLP/opus-mt-en-ROMANCE'
+    tokenizer = MarianTokenizer.from_pretrained(model_name)
+    print(tokenizer.supported_language_codes)
+    model = MarianMTModel.from_pretrained(model_name)
+    translated = model.generate(**tokenizer.prepare_translation_batch(src_text))
+    tgt_text = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
+    # ["c'est une phrase en anglais que nous voulons traduire en français",
+    # 'Isto deve ir para o português.',
+    # 'Y esto al español']
+
+Sometimes, models were trained on collections of languages that do not resolve to a group. In this case, _ is used as a separator for src or tgt, as in ``'Helsinki-NLP/opus-mt-en_el_es_fi-en_el_es_fi'``. These still require language codes.
+There are many supported regional language codes, like ``>>es_ES<<`` (Spain) and ``>>es_AR<<`` (Argentina), that do not seem to change translations. I have not found these to provide different results than just using ``>>es<<``.
+
+For Example:
+    - ``Helsinki-NLP/opus-mt-NORTH_EU-NORTH_EU``: translates from all NORTH_EU languages (see `mapping <https://gist.github.com/sshleifer/6d20e7761931b08e73c3219027b97b8a>`_) to all NORTH_EU languages. Use a special language code like ``>>de<<`` to specify output language.
+    - ``Helsinki-NLP/opus-mt-ROMANCE-en``: translates from many romance languages to english, no codes needed since there is only 1 tgt language.
+
+
+
+.. code-block:: python
+
+    GROUP_MEMBERS = {
+     'ZH': ['cmn', 'cn', 'yue', 'ze_zh', 'zh_cn', 'zh_CN', 'zh_HK', 'zh_tw', 'zh_TW', 'zh_yue', 'zhs', 'zht', 'zh'],
+     'ROMANCE': ['fr', 'fr_BE', 'fr_CA', 'fr_FR', 'wa', 'frp', 'oc', 'ca', 'rm', 'lld', 'fur', 'lij', 'lmo', 'es', 'es_AR', 'es_CL', 'es_CO', 'es_CR', 'es_DO', 'es_EC', 'es_ES', 'es_GT', 'es_HN', 'es_MX', 'es_NI', 'es_PA', 'es_PE', 'es_PR', 'es_SV', 'es_UY', 'es_VE', 'pt', 'pt_br', 'pt_BR', 'pt_PT', 'gl', 'lad', 'an', 'mwl', 'it', 'it_IT', 'co', 'nap', 'scn', 'vec', 'sc', 'ro', 'la'],
+     'NORTH_EU': ['de', 'nl', 'fy', 'af', 'da', 'fo', 'is', 'no', 'nb', 'nn', 'sv'],
+     'SCANDINAVIA': ['da', 'fo', 'is', 'no', 'nb', 'nn', 'sv'],
+     'SAMI': ['se', 'sma', 'smj', 'smn', 'sms'],
+     'NORWAY': ['nb_NO', 'nb', 'nn_NO', 'nn', 'nog', 'no_nb', 'no'],
+     'CELTIC': ['ga', 'cy', 'br', 'gd', 'kw', 'gv']
+    }
+
+Code to see available pretrained models:
+
+.. code-block:: python
+
+    from transformers.hf_api import HfApi
+    model_list = HfApi().model_list()
+    org = "Helsinki-NLP"
+    model_ids = [x.modelId for x in model_list if x.modelId.startswith(org)]
+    suffix = [x.split('/')[1] for x in model_ids]
+    multi_models = [f'{org}/{s}' for s in suffix if s != s.lower()]
+
+MarianMTModel
+~~~~~~~~~~~~~
+
+PyTorch version of marian-nmt's transformer.h (C++). Designed for the OPUS-NMT translation checkpoints.
+Model API is identical to BartForConditionalGeneration.
+Available models are listed at `Model List <https://huggingface.co/models?search=Helsinki-NLP>`__.
+This class inherits all functionality from ``BartForConditionalGeneration``, see that page for method signatures.
+
+.. autoclass:: transformers.MarianMTModel
+    :members:
+
+
+MarianTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MarianTokenizer
+    :members: prepare_translation_batch
--- a/docs/source/model_doc/reformer.rst
+++ b/docs/source/model_doc/reformer.rst
@@ -0,0 +1,114 @@
+Reformer
+----------------------------------------------------
+**DISCLAIMER:** This model is still a work in progress, if you see something strange,
+file a `Github Issue <https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title>`_
+
+Overview
+~~~~~~~~~~~~~~~~~~~~
+The Reformer model was presented in `Reformer: The Efficient Transformer <https://arxiv.org/abs/2001.04451>`_ by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
+The abstract from the paper is the following:
+
+*Large Transformer models routinely achieve state-of-the-art results on a number of tasks but training these models can be prohibitively costly, especially on long sequences. We introduce two techniques to improve the efficiency of Transformers. For one, we replace dot-product attention by one that uses locality-sensitive hashing, changing its complexity from O(L^2) to O(Llog(L)), where L is the length of the sequence. Furthermore, we use reversible residual layers instead of the standard residuals, which allows storing activations only once in the training process instead of N times, where N is the number of layers. The resulting model, the Reformer, performs on par with Transformer models while being much more memory-efficient and much faster on long sequences.*
+
+The Authors' code can be found `here <https://github.com/google/trax/tree/master/trax/models/reformer>`_ .
+
+Axial Positional Encodings
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Axial Positional Encodings were first implemented in Google's `trax library <https://github.com/google/trax/blob/4d99ad4965bab1deba227539758d59f0df0fef48/trax/layers/research/position_encodings.py#L29>`_ and developed by the authors of this model's paper. In models that are treating very long input sequences, the conventional position id encodings store an embeddings vector of size :math:`d` being the ``config.hidden_size`` for every position :math:`1, \ldots, n_s`, with :math:`n_s` being ``config.max_embedding_size``. *E.g.*, having a sequence length of :math:`n_s = 2^{19} \approx 0.5M` and a ``config.hidden_size`` of :math:`d = 2^{10} \approx 1000` would result in a position encoding matrix:
+
+.. math::
+    X_{i,j}, \text{ with } i \in \left[1,\ldots, d\right] \text{ and } j \in \left[1,\ldots, n_s\right] 
+
+which alone has over 500M parameters to store. Axial positional encodings factorize :math:`X_{i,j}` into two matrices: 
+
+.. math::
+    X^{1}_{i,j}, \text{ with } i \in \left[1,\ldots, d^1\right] \text{ and } j \in \left[1,\ldots, n_s^1\right] 
+
+and 
+
+.. math::
+    X^{2}_{i,j}, \text{ with } i \in \left[1,\ldots, d^2\right] \text{ and } j \in \left[1,\ldots, n_s^2\right] 
+
+with:
+
+.. math::
+    d = d^1 + d^2 \text{ and } n_s = n_s^1 \times n_s^2 .
+
+Therefore the following holds:
+
+.. math::
+    X_{i,j} = \begin{cases}
+                X^{1}_{i, k}, & \text{if }\ i < d^1 \text{ with } k = j \mod n_s^1 \\
+                X^{2}_{i - d^1, l}, & \text{if } i \ge d^1 \text{ with } l = \lfloor\frac{j}{n_s^1}\rfloor
+              \end{cases}
+
+Intuitively, this means that a position embedding vector :math:`x_j \in \mathbb{R}^{d}` is now the composition of two factorized embedding vectors: :math:`x^1_{k, l} + x^2_{l, k}`, where the ``config.max_embedding_size`` dimension :math:`j` is factorized into :math:`k \text{ and } l`.
+This design ensures that each position embedding vector :math:`x_j` is unique.
+
+Using the above example again, axial position encoding with :math:`d^1 = 2^9, d^2 = 2^9, n_s^1 = 2^9, n_s^2 = 2^{10}` can drastically reduce the number of parameters from over 500M to :math:`2^{18} + 2^{19} \approx 786000` parameters.
+
+In practice, the parameter ``config.axial_pos_embds_dim`` is set to a tuple :math:`(d^1, d^2)` whose sum has to be equal to ``config.hidden_size``, and ``config.axial_pos_shape`` is set to a tuple :math:`(n_s^1, n_s^2)` whose product has to be equal to ``config.max_embedding_size``, which during training has to be equal to the ``sequence length`` of the ``input_ids``.
+
+
+
+LSH Self Attention
+~~~~~~~~~~~~~~~~~~~~
+In Locality sensitive hashing (LSH) self attention the key and query projection weights are tied. Therefore, the key query embedding vectors are also tied.
+LSH self attention uses the locality sensitive 
+hashing mechanism proposed in `Practical and Optimal LSH for Angular Distance <https://arxiv.org/abs/1509.02897>`_ to assign each of the tied key query embedding vectors to one of ``config.num_buckets`` possible buckets. The premise is that the more "similar" key query embedding vectors (in terms of *cosine similarity*) are to each other, the more likely they are assigned to the same bucket. 
+The accuracy of the LSH mechanism can be improved by increasing ``config.num_hashes`` or directly the argument ``num_hashes`` of the forward function so that the output of the LSH self attention better approximates the output of the "normal" full self attention.
+The buckets are then sorted and chunked into query key embedding vector chunks each of length ``config.lsh_chunk_length``. For each chunk, the query embedding vectors attend to its key vectors (which are tied to themselves) and to the key embedding vectors of ``config.lsh_num_chunks_before`` previous neighboring chunks and ``config.lsh_num_chunks_after`` following neighboring chunks.
+For more information, see the `original Paper <https://arxiv.org/abs/2001.04451>`_ or this great `blog post <https://www.pragmatic.ml/reformer-deep-dive/>`_.
+
+Note that ``config.num_buckets`` can also be factorized into a ``list``:math:`(n_{\text{buckets}}^1, n_{\text{buckets}}^2)`. This way instead of assigning the query key embedding vectors to one of :math:`(1,\ldots, n_{\text{buckets}})` they are assigned to one of :math:`(1-1,\ldots, n_{\text{buckets}}^1-1, \ldots, 1-n_{\text{buckets}}^2, \ldots, n_{\text{buckets}}^1-n_{\text{buckets}}^2)`. This is crucial for very long sequences to save memory.
+
+When training a model from scratch, it is recommended to leave ``config.num_buckets=None``, so that depending on the sequence length a good value for ``num_buckets`` is calculated on the fly. This value will then automatically be saved in the config and should be reused for inference.
+
+Using LSH self attention, the memory and time complexity of the query-key matmul operation can be reduced from :math:`\mathcal{O}(n_s \times n_s)` to :math:`\mathcal{O}(n_s \times \log(n_s))`, which usually represents the memory and time bottleneck in a transformer model, with :math:`n_s` being the sequence length.
+
+
+Local Self Attention
+~~~~~~~~~~~~~~~~~~~~
+Local self attention is essentially a "normal" self attention layer with 
+key, query and value projections, but is chunked so that in each chunk of length ``config.local_chunk_length`` the query embedding vectors only attend to the key embedding vectors in their chunk and to the key embedding vectors of ``config.local_num_chunks_before`` previous neighboring chunks and ``config.local_num_chunks_after`` following neighboring chunks.
+
+Using Local self attention, the memory and time complexity of the query-key matmul operation, which usually represents the memory and time bottleneck in a transformer model, can be reduced from :math:`\mathcal{O}(n_s \times n_s)` to :math:`\mathcal{O}(n_s \times l_c)`, with :math:`n_s` being the sequence length and :math:`l_c` being the chunk length ``config.local_chunk_length``.
+
+
+Training
+~~~~~~~~~~~~~~~~~~~~
+During training, we must ensure that the sequence length is set to a value that can be divided by the least common multiple of ``config.lsh_chunk_length`` and ``config.local_chunk_length`` and that the parameters of the Axial Positional Encodings are correctly set as described above. Reformer is very memory efficient so that the model can easily be trained on sequences as long as 64000 tokens.
+For training, the ``ReformerModelWithLMHead`` should be used as follows: 
+
+::
+
+  input_ids = tokenizer.encode('This is a sentence from the training data', return_tensors='pt')
+  loss = model(input_ids, labels=input_ids)[0]
+
+
+ReformerConfig
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ReformerConfig
+    :members:
+
+
+ReformerTokenizer
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ReformerTokenizer
+    :members: 
+
+
+ReformerModel
+~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ReformerModel
+    :members:
+
+
+ReformerModelWithLMHead
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ReformerModelWithLMHead
+    :members:
--- a/docs/source/model_doc/roberta.rst
+++ b/docs/source/model_doc/roberta.rst
@@ -28,6 +28,9 @@ Tips:
 - RoBERTa doesn't have `token_type_ids`, you don't need to indicate which token belongs to which segment. Just separate your segments with the separation token `tokenizer.sep_token` (or `</s>`)
 - `Camembert <./camembert.html>`__ is a wrapper around RoBERTa. Refer to this page for usage examples.

+The original code can be found `here <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`_.
+
+
 RobertaConfig
 ~~~~~~~~~~~~~~~~~~~~~

@@ -43,6 +46,13 @@ RobertaTokenizer
        create_token_type_ids_from_sequences, save_vocabulary


+RobertaTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.RobertaTokenizerFast
+    :members: build_inputs_with_special_tokens
+
+
 RobertaModel
 ~~~~~~~~~~~~~~~~~~~~

@@ -64,6 +74,13 @@ RobertaForSequenceClassification
    :members:


+RobertaForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.RobertaForMultipleChoice
+    :members:
+
+
 RobertaForTokenClassification
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_doc/t5.rst
+++ b/docs/source/model_doc/t5.rst
@@ -0,0 +1,105 @@
+T5
+----------------------------------------------------
+**DISCLAIMER:** This model is still a work in progress. If you see something strange, please
+file a `GitHub Issue <https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title>`_.
+
+Overview
+~~~~~~~~~~~~~~~~~~~~
+The T5 model was presented in `Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer <https://arxiv.org/pdf/1910.10683.pdf>`_ by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li and Peter J. Liu.
+Here is the abstract:
+
+*Transfer learning, where a model is first pre-trained on a data-rich task before being fine-tuned on a downstream task, has emerged as a powerful technique in natural language processing (NLP). The effectiveness of transfer learning has given rise to a diversity of approaches, methodology, and practice. 
+In this paper, we explore the landscape of transfer learning techniques for NLP by introducing a unified framework that converts every language problem into a text-to-text format. 
+Our systematic study compares pre-training objectives, architectures, unlabeled datasets, transfer approaches, and other factors on dozens of language understanding tasks. 
+By combining the insights from our exploration with scale and our new "Colossal Clean Crawled Corpus", we achieve state-of-the-art results on many benchmarks covering summarization, question answering, text classification, and more. 
+To facilitate future work on transfer learning for NLP, we release our dataset, pre-trained models, and code.*
+
+The authors' code can be found `here <https://github.com/google-research/text-to-text-transfer-transformer>`_.
+
+Training
+~~~~~~~~~~~~~~~~~~~~
+T5 is an encoder-decoder model and converts all NLP problems into a text-to-text format. It is trained using teacher forcing.
+This means that for training we always need an input sequence and a target sequence. 
+The input sequence is fed to the model using ``input_ids``. The target sequence is shifted to the right, *i.e.* prepended by a start-sequence token, and fed to the decoder using the ``decoder_input_ids``. In teacher-forcing style, the EOS token is then appended to the target sequence, and the result corresponds to the ``lm_labels``. The PAD token is hereby used as the start-sequence token.
+T5 can be trained / fine-tuned both in a supervised and unsupervised fashion.
+
+- Unsupervised denoising training
+
+  In this setup spans of the input sequence are masked by so-called sentinel tokens (*a.k.a* unique mask tokens) 
+  and the output sequence is formed as a concatenation of the same sentinel tokens and the *real* masked tokens. 
+  Each sentinel token represents a unique mask token for this sentence. Sentinel tokens should be used in order, starting with ``<extra_id_0>``, ``<extra_id_1>``, ... up to ``<extra_id_99>``. By default, 100 sentinel tokens are available in ``T5Tokenizer``.
+  *E.g.* the sentence "The cute dog walks in the park" with the masks put on "cute dog" and "the" should be processed as follows: 
+
+::
+
+  input_ids = tokenizer.encode('The <extra_id_0> walks in <extra_id_1> park', return_tensors='pt')
+  lm_labels = tokenizer.encode('<extra_id_0> cute dog <extra_id_1> the <extra_id_2> </s>', return_tensors='pt')
+  # the forward function automatically creates the correct decoder_input_ids
+  model(input_ids=input_ids, lm_labels=lm_labels)
+
+- Supervised training
+
+  In this setup the input sequence and output sequence form a standard sequence-to-sequence input-output mapping.
+  In translation, *e.g.* the input sequence "The house is wonderful." and output sequence "Das Haus ist wunderbar." should 
+  be processed as follows:
+  
+::
+
+  input_ids = tokenizer.encode('translate English to German: The house is wonderful. </s>', return_tensors='pt')
+  lm_labels = tokenizer.encode('Das Haus ist wunderbar. </s>', return_tensors='pt')
+  # the forward function automatically creates the correct decoder_input_ids
+  model(input_ids=input_ids, lm_labels=lm_labels)
+
+Tips
+~~~~~~~~~~~~~~~~~~~~
+- T5 is an encoder-decoder model pre-trained on a multi-task mixture of unsupervised 
+  and supervised tasks and for which each task is converted into a text-to-text format.
+  T5 works well on a variety of tasks out-of-the-box by prepending a different prefix to the input corresponding to each task, *e.g.* for translation: *translate English to German: ...*, and for summarization: *summarize: ...*.
+  For more information about which prefix to use, it is easiest to look into Appendix D of the `paper <https://arxiv.org/pdf/1910.10683.pdf>`_ .
+- For sequence to sequence generation, it is recommended to use ``T5ForConditionalGeneration.generate()``. The method takes care of feeding the encoded input via cross-attention layers to the decoder and auto-regressively generates the decoder output.
+- T5 uses relative scalar embeddings. Encoder input padding can be done on the left and on the right.
+
+The original code can be found `here <https://github.com/google-research/text-to-text-transfer-transformer>`_.
+
+
+T5Config
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.T5Config
+    :members:
+
+
+T5Tokenizer
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.T5Tokenizer
+    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences, save_vocabulary
+
+
+T5Model
+~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.T5Model
+    :members:
+
+
+T5ForConditionalGeneration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.T5ForConditionalGeneration
+    :members:
+
+
+TFT5Model
+~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFT5Model
+    :members:
+
+
+TFT5ForConditionalGeneration
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFT5ForConditionalGeneration
+    :members:
--- a/docs/source/model_doc/transformerxl.rst
+++ b/docs/source/model_doc/transformerxl.rst
@@ -30,6 +30,8 @@ Tips:
  The original implementation trains on SQuAD with padding on the left, therefore the padding defaults are set to left.
 - Transformer-XL is one of the few models that has no sequence length limit.

+The original code can be found `here <https://github.com/kimiyoung/transformer-xl>`_.
+

 TransfoXLConfig
 ~~~~~~~~~~~~~~~~~~~~~
@@ -45,6 +47,13 @@ TransfoXLTokenizer
    :members: save_vocabulary


+TransfoXLTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TransfoXLTokenizerFast
+    :members:
+
+
 TransfoXLModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_doc/xlm.rst
+++ b/docs/source/model_doc/xlm.rst
@@ -30,6 +30,8 @@ Tips:
 - XLM has multilingual checkpoints which leverage a specific `lang` parameter. Check out the
  `multi-lingual <../multilingual.html>`__ page for more information.

+The original code can be found `here <https://github.com/facebookresearch/XLM/>`_.
+

 XLMConfig
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--- a/docs/source/model_doc/xlmroberta.rst
+++ b/docs/source/model_doc/xlmroberta.rst
@@ -28,6 +28,9 @@ Tips:
 - This implementation is the same as RoBERTa. Refer to the `documentation of RoBERTa <./roberta.html>`__ for usage
  examples as well as the information relative to the inputs and outputs.

+The original code can be found `here <https://github.com/pytorch/fairseq/tree/master/examples/xlmr>`_.
+
+
 XLMRobertaConfig
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_doc/xlnet.rst
+++ b/docs/source/model_doc/xlnet.rst
@@ -29,9 +29,11 @@ Tips:
  XLNet is pretrained using only a sub-set of the output tokens as target which are selected
  with the `target_mapping` input.
 - To use XLNet for sequential decoding (i.e. not in fully bi-directional setting), use the `perm_mask` and
-  `target_mapping` inputs to control the attention span and outputs (see examples in `examples/run_generation.py`)
+  `target_mapping` inputs to control the attention span and outputs (see examples in `examples/text-generation/run_generation.py`)
 - XLNet is one of the few models that has no sequence length limit.

+The original code can be found `here <https://github.com/zihangdai/xlnet/>`_.
+

 XLNetConfig
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--- a/docs/source/multilingual.rst
+++ b/docs/source/multilingual.rst
@@ -80,7 +80,7 @@ You can then feed it all as input to your model:
    outputs = model(input_ids, langs=langs)


-The example `run_generation.py <https://github.com/huggingface/transformers/blob/master/examples/run_generation.py>`__
+The example `run_generation.py <https://github.com/huggingface/transformers/blob/master/examples/text-generation/run_generation.py>`__
 can generate text using the CLM checkpoints from XLM, using the language embeddings.

 XLM without Language Embeddings
@@ -104,4 +104,16 @@ BERT has two checkpoints that can be used for multi-lingual tasks:
 - ``bert-base-multilingual-cased`` (Masked language modeling + Next sentence prediction, 104 languages)

 These checkpoints do not require language embeddings at inference time. They should identify the language
-used in the context and infer accordingly.
+used in the context and infer accordingly.
+
+XLM-RoBERTa
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+XLM-RoBERTa was trained on 2.5TB of newly created clean CommonCrawl data in 100 languages. It provides strong
+gains over previously released multi-lingual models like mBERT or XLM on downstream tasks like classification,
+sequence labeling and question answering.
+
+Two XLM-RoBERTa checkpoints can be used for multi-lingual tasks:
+
+- ``xlm-roberta-base`` (Masked language modeling, 100 languages)
+- ``xlm-roberta-large`` (Masked language modeling, 100 languages)
--- a/docs/source/notebooks.md
+++ b/docs/source/notebooks.md
@@ -0,0 +1 @@
+../../notebooks/README.md
--- a/docs/source/notebooks.rst
+++ b/docs/source/notebooks.rst
@@ -1,16 +0,0 @@
-Notebooks
-================================================
-
-We include `three Jupyter Notebooks <https://github.com/huggingface/transformers/tree/master/notebooks>`_ that can be used to check that the predictions of the PyTorch model are identical to the predictions of the original TensorFlow model.
-
-
-*
-  The first NoteBook (\ `Comparing-TF-and-PT-models.ipynb <https://github.com/huggingface/transformers/blob/master/notebooks/Comparing-TF-and-PT-models.ipynb>`_\ ) extracts the hidden states of a full sequence on each layers of the TensorFlow and the PyTorch models and computes the standard deviation between them. In the given example, we get a standard deviation of 1.5e-7 to 9e-7 on the various hidden state of the models.
-
-*
-  The second NoteBook (\ `Comparing-TF-and-PT-models-SQuAD.ipynb <https://github.com/huggingface/transformers/blob/master/notebooks/Comparing-TF-and-PT-models-SQuAD.ipynb>`_\ ) compares the loss computed by the TensorFlow and the PyTorch models for identical initialization of the fine-tuning layer of the ``BertForQuestionAnswering`` and computes the standard deviation between them. In the given example, we get a standard deviation of 2.5e-7 between the models.
-
-*
-  The third NoteBook (\ `Comparing-TF-and-PT-models-MLM-NSP.ipynb <https://github.com/huggingface/transformers/blob/master/notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb>`_\ ) compares the predictions computed by the TensorFlow and the PyTorch models for masked token language modeling using the pre-trained masked language modeling model.
-
-Please follow the instructions given in the notebooks to run and modify them.
--- a/docs/source/pretrained_models.rst
+++ b/docs/source/pretrained_models.rst
@@ -63,33 +63,33 @@ For a list that includes community-uploaded models, refer to `https://huggingfac
 |                   |                                                            | | Trained on uncased German text by DBMDZ                                                                                             |
 |                   |                                                            | (see `details on dbmdz repository <https://github.com/dbmdz/german-bert>`__).                                                         |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-base-japanese``                                     | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                   | ``cl-tohoku/bert-base-japanese``                           | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
 |                   |                                                            | | Trained on Japanese text. Text is tokenized with MeCab and WordPiece.                                                               |
 |                   |                                                            | | `MeCab <https://taku910.github.io/mecab/>`__ is required for tokenization.                                                          |
 |                   |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                               |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-base-japanese-whole-word-masking``                  | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                   | ``cl-tohoku/bert-base-japanese-whole-word-masking``        | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
 |                   |                                                            | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized with MeCab and WordPiece.                                      |
 |                   |                                                            | | `MeCab <https://taku910.github.io/mecab/>`__ is required for tokenization.                                                          |
 |                   |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                               |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-base-japanese-char``                                | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                   | ``cl-tohoku/bert-base-japanese-char``                      | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
 |                   |                                                            | | Trained on Japanese text. Text is tokenized into characters.                                                                        |
 |                   |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                               |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-base-japanese-char-whole-word-masking``             | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                   | ``cl-tohoku/bert-base-japanese-char-whole-word-masking``   | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
 |                   |                                                            | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized into characters.                                               |
 |                   |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                               |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-base-finnish-cased-v1``                             | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                   | ``TurkuNLP/bert-base-finnish-cased-v1``                    | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
 |                   |                                                            | | Trained on cased Finnish text.                                                                                                      |
 |                   |                                                            | (see `details on turkunlp.org <http://turkunlp.org/FinBERT/>`__).                                                                     |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-base-finnish-uncased-v1``                           | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                   | ``TurkuNLP/bert-base-finnish-uncased-v1``                  | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
 |                   |                                                            | | Trained on uncased Finnish text.                                                                                                    |
 |                   |                                                            | (see `details on turkunlp.org <http://turkunlp.org/FinBERT/>`__).                                                                     |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-base-dutch-cased``                                  | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                   | ``wietsedv/bert-base-dutch-cased``                         | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
 |                   |                                                            | | Trained on cased Dutch text.                                                                                                        |
 |                   |                                                            | (see `details on wietsedv repository <https://github.com/wietsedv/bertje/>`__).                                                       |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
@@ -259,32 +259,55 @@ For a list that includes community-uploaded models, refer to `https://huggingfac
 |                   | ``xlm-roberta-large``                                      | | ~355M parameters with 24-layers, 1027-hidden-state, 4096 feed-forward hidden-state, 16-heads,                                       |
 |                   |                                                            | | Trained on 2.5 TB of newly created clean CommonCrawl data in 100 languages                                                          |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| FlauBERT          | ``flaubert-small-cased``                                   | | 6-layer, 512-hidden, 8-heads, 54M parameters                                                                                        |
+| FlauBERT          | ``flaubert/flaubert_small_cased``                          | | 6-layer, 512-hidden, 8-heads, 54M parameters                                                                                        |
 |                   |                                                            | | FlauBERT small architecture                                                                                                         |
 |                   |                                                            | (see `details <https://github.com/getalp/Flaubert>`__)                                                                                |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``flaubert-base-uncased``                                  | | 12-layer, 768-hidden, 12-heads, 137M parameters                                                                                     |
+|                   | ``flaubert/flaubert_base_uncased``                         | | 12-layer, 768-hidden, 12-heads, 137M parameters                                                                                     |
 |                   |                                                            | | FlauBERT base architecture with uncased vocabulary                                                                                  |
 |                   |                                                            | (see `details <https://github.com/getalp/Flaubert>`__)                                                                                |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``flaubert-base-cased``                                    | | 12-layer, 768-hidden, 12-heads, 138M parameters                                                                                     |
+|                   | ``flaubert/flaubert_base_cased``                           | | 12-layer, 768-hidden, 12-heads, 138M parameters                                                                                     |
 |                   |                                                            | | FlauBERT base architecture with cased vocabulary                                                                                    |
 |                   |                                                            | (see `details <https://github.com/getalp/Flaubert>`__)                                                                                |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``flaubert-large-cased``                                   | | 24-layer, 1024-hidden, 16-heads, 373M parameters                                                                                    |
+|                   | ``flaubert/flaubert_large_cased``                          | | 24-layer, 1024-hidden, 16-heads, 373M parameters                                                                                    |
 |                   |                                                            | | FlauBERT large architecture                                                                                                         |
 |                   |                                                            | (see `details <https://github.com/getalp/Flaubert>`__)                                                                                |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| Bart              | ``bart-large``                                             | | 12-layer, 1024-hidden, 16-heads, 406M parameters                                                                                    |
+| Bart              | ``facebook/bart-large``                                    | | 24-layer, 1024-hidden, 16-heads, 406M parameters                                                                                    |
 |                   |                                                            | (see `details <https://github.com/pytorch/fairseq/tree/master/examples/bart>`_)                                                       |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bart-large-mnli``                                        | | Adds a 2 layer classification head with 1 million parameters                                                                        |
+|                   | ``facebook/bart-large-mnli``                               | | Adds a 2 layer classification head with 1 million parameters                                                                        |
 |                   |                                                            | | bart-large base architecture with a classification head, finetuned on MNLI                                                          |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bart-large-cnn``                                         | | 12-layer, 1024-hidden, 16-heads, 406M parameters       (same as base)                                                               |
+|                   | ``facebook/bart-large-cnn``                                | | 12-layer, 1024-hidden, 16-heads, 406M parameters       (same as base)                                                               |
 |                   |                                                            | | bart-large base architecture finetuned on cnn summarization task                                                                    |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``facebook/mbart-large-en-ro``                             | | 12-layer, 1024-hidden, 16-heads, 880M parameters                                                                                    |
+|                   |                                                            | | bart-large architecture pretrained on cc25 multilingual data, finetuned on WMT English-Romanian translation.                        |
++-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+| DialoGPT          | ``DialoGPT-small``                                         | | 12-layer, 768-hidden, 12-heads, 124M parameters                                                                                     |
+|                   |                                                            | | Trained on English text: 147M conversation-like exchanges extracted from Reddit.                                                    |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``DialoGPT-medium``                                        | | 24-layer, 1024-hidden, 16-heads, 355M parameters                                                                                    |
+|                   |                                                            | | Trained on English text: 147M conversation-like exchanges extracted from Reddit.                                                    |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``DialoGPT-large``                                         | | 36-layer, 1280-hidden, 20-heads, 774M parameters                                                                                    |
+|                   |                                                            | | Trained on English text: 147M conversation-like exchanges extracted from Reddit.                                                    |
++-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+| Reformer          | ``reformer-enwik8``                                        | | 12-layer, 1024-hidden, 8-heads, 149M parameters                                                                                     |
+|                   |                                                            | | Trained on English Wikipedia data - enwik8.                                                                                         |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``reformer-crime-and-punishment``                          | | 6-layer, 256-hidden, 2-heads, 3M parameters                                                                                         |
+|                   |                                                            | | Trained on English text: Crime and Punishment novel by Fyodor Dostoyevsky.                                                          |
++-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+| MarianMT          | ``Helsinki-NLP/opus-mt-{src}-{tgt}``                       | | 12-layer, 512-hidden, 8-heads, ~74M parameter Machine translation models. Parameter counts vary depending on vocab size.            |
+|                   |                                                            | | (see `model list <https://huggingface.co/Helsinki-NLP>`_)                                                                           |
++-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+| Longformer        | ``allenai/longformer-base-4096``                           | | 12-layer, 768-hidden, 12-heads, ~149M parameters                                                                                    |
+|                   |                                                            | | Starting from RoBERTa-base checkpoint, trained on documents of max length 4,096                                                     |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``allenai/longformer-large-4096``                          | | 24-layer, 1024-hidden, 16-heads, ~435M parameters                                                                                   |
+|                   |                                                            | | Starting from RoBERTa-large checkpoint, trained on documents of max length 4,096                                                    |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-
-
-.. <https://huggingface.co/transformers/examples.html>`__
--- a/docs/source/quickstart.md
+++ b/docs/source/quickstart.md
@@ -8,7 +8,7 @@ The library was designed with two strong goals in mind:

 - be as easy and fast to use as possible:

-  - we strongly limited the number of user-facing abstractions to learn, in fact there are almost no abstractions, just three standard classes required to use each model: configuration, models and tokenizer,
+  - we strongly limited the number of user-facing abstractions to learn, in fact, there are almost no abstractions, just three standard classes required to use each model: configuration, models and tokenizer,
  - all of these classes can be initialized in a simple and unified way from pretrained instances by using a common `from_pretrained()` instantiation method which will take care of downloading (if needed), caching and loading the related class from a pretrained instance supplied in the library or your own saved instance.
  - as a consequence, this library is NOT a modular toolbox of building blocks for neural nets. If you want to extend/build-upon the library, just use regular Python/PyTorch modules and inherit from the base classes of the library to reuse functionalities like model loading/saving.

@@ -31,27 +31,27 @@ A few other goals:

 ## Main concepts

-The library is build around three type of classes for each models:
+The library is built around three types of classes for each model:

- **model classes** which are PyTorch models (`torch.nn.Modules`) of the 8 models architectures currently provided in the library, e.g. `BertModel`
- **configuration classes** which store all the parameters required to build a model, e.g. `BertConfig`. You don't always need to instantiate these your-self, in particular if you are using a pretrained model without any modification, creating the model will automatically take care of instantiating the configuration (which is part of the model)
- **tokenizer classes** which store the vocabulary for each model and provide methods for encoding/decoding strings in list of token embeddings indices to be fed to a model, e.g. `BertTokenizer`
+- **model classes**, e.g., `BertModel`, which are 20+ PyTorch models (`torch.nn.Modules`) that work with the pretrained weights provided in the library. In TF2, these are `tf.keras.Model`.
+- **configuration classes** which store all the parameters required to build a model, e.g., `BertConfig`. You don't always need to instantiate these yourself. In particular, if you are using a pretrained model without any modification, creating the model will automatically take care of instantiating the configuration (which is part of the model).
+- **tokenizer classes** which store the vocabulary for each model and provide methods for encoding/decoding strings to/from a list of token embedding indices to be fed to a model, e.g., `BertTokenizer`.

 All these classes can be instantiated from pretrained instances and saved locally using two methods:

 - `from_pretrained()` let you instantiate a model/configuration/tokenizer from a pretrained version either provided by the library itself (currently 27 models are provided as listed [here](https://huggingface.co/transformers/pretrained_models.html)) or stored locally (or on a server) by the user,
 - `save_pretrained()` let you save a model/configuration/tokenizer locally so that it can be reloaded using `from_pretrained()`.

-We'll finish this quickstart tour by going through a few simple quick-start examples to see how we can instantiate and use these classes. The rest of the documentation is organized in two parts:
+We'll finish this quickstart tour by going through a few simple quick-start examples to see how we can instantiate and use these classes. The rest of the documentation is organized into two parts:

 - the **MAIN CLASSES** section details the common functionalities/method/attributes of the three main type of classes (configuration, model, tokenizer) plus some optimization related classes provided as utilities for training,
- the **PACKAGE REFERENCE** section details all the variants of each class for each model architectures and in particular the input/output that you should expect when calling each of them.
+- the **PACKAGE REFERENCE** section details all the variants of each class for each model architecture and, in particular, the input/output that you should expect when calling each of them.

 ## Quick tour: Usage

 Here are two examples showcasing a few `Bert` and `GPT2` classes and pre-trained models.

-See full API reference for examples for each model class.
+See the full API reference for examples of each model class.

 ### BERT example

@@ -191,7 +191,7 @@ Examples for each model class of each model architecture (Bert, GPT, GPT-2, Tran

 #### Using the past

-GPT-2 as well as some other models (GPT, XLNet, Transfo-XL, CTRL) make use of a `past` or `mems` attribute which can be used to prevent re-computing the key/value pairs when using sequential decoding. It is useful when generating sequences as a big part of the attention mechanism benefits from previous computations.
+GPT-2, as well as some other models (GPT, XLNet, Transfo-XL, CTRL), make use of a `past` or `mems` attribute which can be used to prevent re-computing the key/value pairs when using sequential decoding. It is useful when generating sequences as a big part of the attention mechanism benefits from previous computations.

 Here is a fully-working example using the `past` with `GPT2LMHeadModel` and argmax decoding (which should only be used as an example, as argmax decoding introduces a lot of repetition):

--- a/docs/source/serialization.rst
+++ b/docs/source/serialization.rst
@@ -58,14 +58,14 @@ where

 ``Uncased`` means that the text has been lowercased before WordPiece tokenization, e.g., ``John Smith`` becomes ``john smith``. The Uncased model also strips out any accent markers. ``Cased`` means that the true case and accent markers are preserved. Typically, the Uncased model is better unless you know that case information is important for your task (e.g., Named Entity Recognition or Part-of-Speech tagging). For information about the Multilingual and Chinese model, see the `Multilingual README <https://github.com/google-research/bert/blob/master/multilingual.md>`__ or the original TensorFlow repository.

-When using an ``uncased model``\ , make sure to pass ``--do_lower_case`` to the example training scripts (or pass ``do_lower_case=True`` to FullTokenizer if you're using your own script and loading the tokenizer your-self.).
+When using an ``uncased model``\ , make sure your tokenizer has ``do_lower_case=True`` (either in its configuration, or passed as an additional parameter).

 Examples:

 .. code-block:: python

   # BERT
-   tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True, do_basic_tokenize=True)
+   tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_basic_tokenize=True)
   model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

   # OpenAI GPT
@@ -140,13 +140,13 @@ Here is the recommended way of saving the model, configuration and vocabulary to

   torch.save(model_to_save.state_dict(), output_model_file)
   model_to_save.config.to_json_file(output_config_file)
-   tokenizer.save_vocabulary(output_dir)
+   tokenizer.save_pretrained(output_dir)

   # Step 2: Re-load the saved model and vocabulary

   # Example for a Bert model
   model = BertForQuestionAnswering.from_pretrained(output_dir)
-   tokenizer = BertTokenizer.from_pretrained(output_dir, do_lower_case=args.do_lower_case)  # Add specific options if needed
+   tokenizer = BertTokenizer.from_pretrained(output_dir)  # Add specific options if needed
   # Example for a GPT model
   model = OpenAIGPTDoubleHeadsModel.from_pretrained(output_dir)
   tokenizer = OpenAIGPTTokenizer.from_pretrained(output_dir)
--- a/docs/source/usage.rst
+++ b/docs/source/usage.rst
@@ -44,8 +44,8 @@ Sequence Classification
 Sequence classification is the task of classifying sequences according to a given number of classes. An example
 of sequence classification is the GLUE dataset, which is entirely based on that task. If you would like to fine-tune
 a model on a GLUE sequence classification task, you may leverage the
-`run_glue.py <https://github.com/huggingface/transformers/tree/master/examples/run_glue.py>`_ or
-`run_tf_glue.py <https://github.com/huggingface/transformers/tree/master/examples/run_tf_glue.py>`_ scripts.
+`run_glue.py <https://github.com/huggingface/transformers/tree/master/examples/text-classification/run_glue.py>`_ or
+`run_tf_glue.py <https://github.com/huggingface/transformers/tree/master/examples/text-classification/run_tf_glue.py>`_ scripts.

 Here is an example using the pipelines do to sentiment analysis: identifying if a sequence is positive or negative.
 It leverages a fine-tuned model on sst2, which is a GLUE task.
@@ -404,52 +404,150 @@ Causal language modeling is the task of predicting the token following a sequenc
 model only attends to the left context (tokens on the left of the mask). Such a training is particularly interesting
 for generation tasks.

-There is currently no pipeline to do causal language modeling/generation.
+Usually, the next token is predicted by sampling from the logits of the last hidden state the model produces from the input sequence.

-Here is an example using the tokenizer and model. leveraging the :func:`~transformers.PreTrainedModel.generate` method
-to generate the tokens following the initial sequence in PyTorch, and creating a simple loop in TensorFlow.
+Here is an example using the tokenizer and model and leveraging the :func:`~transformers.top_k_top_p_filtering` function to sample the next token following an input sequence of tokens.
+
+::
+
+    ## PYTORCH CODE
+    from transformers import AutoModelWithLMHead, AutoTokenizer, top_k_top_p_filtering
+    import torch
+    from torch.nn import functional as F
+
+
+    tokenizer = AutoTokenizer.from_pretrained("gpt2")
+    model = AutoModelWithLMHead.from_pretrained("gpt2")
+
+    sequence = f"Hugging Face is based in DUMBO, New York City, and "
+
+    input_ids = tokenizer.encode(sequence, return_tensors="pt")
+
+    # get logits of last hidden state
+    next_token_logits = model(input_ids)[0][:, -1, :]
+
+    # filter
+    filtered_next_token_logits = top_k_top_p_filtering(next_token_logits, top_k=50, top_p=1.0)
+
+    # sample
+    probs = F.softmax(filtered_next_token_logits, dim=-1)
+    next_token = torch.multinomial(probs, num_samples=1)
+
+    generated = torch.cat([input_ids, next_token], dim=-1)
+
+    resulting_string = tokenizer.decode(generated.tolist()[0])
+    print(resulting_string)
+    ## TENSORFLOW CODE
+    from transformers import TFAutoModelWithLMHead, AutoTokenizer, tf_top_k_top_p_filtering
+    import tensorflow as tf
+
+    tokenizer = AutoTokenizer.from_pretrained("gpt2")
+    model = TFAutoModelWithLMHead.from_pretrained("gpt2")
+
+    sequence = f"Hugging Face is based in DUMBO, New York City, and "
+
+    input_ids = tokenizer.encode(sequence, return_tensors="tf")
+
+    # get logits of last hidden state
+    next_token_logits = model(input_ids)[0][:, -1, :]
+
+    # filter
+    filtered_next_token_logits = tf_top_k_top_p_filtering(next_token_logits, top_k=50, top_p=1.0)
+
+    # sample
+    next_token = tf.random.categorical(filtered_next_token_logits, dtype=tf.int32, num_samples=1)
+
+    generated = tf.concat([input_ids, next_token], axis=1)
+
+    resulting_string = tokenizer.decode(generated.numpy().tolist()[0])
+    print(resulting_string)
+
+
+This outputs a (hopefully) coherent next token following the original sequence, which in our case is the word *has*:
+
+::
+
+    Hugging Face is based in DUMBO, New York City, and has
+
+In the next section, we show how this functionality is leveraged in :func:`~transformers.PreTrainedModel.generate` to generate multiple tokens up to a user-defined length.
+
+Text Generation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In text generation (*a.k.a* *open-ended text generation*) the goal is to create a coherent portion of text that is a continuation of the given context. The following example shows how *GPT-2* can be used in pipelines to generate text. By default, all models apply *Top-K* sampling when used in pipelines, as configured in their respective configurations (see `gpt-2 config <https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json>`_ for example).
+
+::
+
+    from transformers import pipeline
+
+    text_generator = pipeline("text-generation")
+    print(text_generator("As far as I am concerned, I will", max_length=50))
+
+
+Here the model generates a random text with a total maximal length of *50* tokens from context *"As far as I am concerned, I will"*.
+The default arguments of ``PreTrainedModel.generate()`` can be overridden directly in the pipeline, as is shown above for the argument ``max_length``.
+
+Here is an example of text generation using XLNet and its tokenizer.

 ::

    ## PYTORCH CODE
    from transformers import AutoModelWithLMHead, AutoTokenizer

-    tokenizer = AutoTokenizer.from_pretrained("gpt2")
-    model = AutoModelWithLMHead.from_pretrained("gpt2")
+    model = AutoModelWithLMHead.from_pretrained("xlnet-base-cased")
+    tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")

-    sequence = f"Hugging Face is based in DUMBO, New York City, and is"
+    # Padding text helps XLNet with short prompts - proposed by Aman Rusia in https://github.com/rusiaaman/XLNet-gen#methodology
+    PADDING_TEXT = """In 1991, the remains of Russian Tsar Nicholas II and his family
+    (except for Alexei and Maria) are discovered.
+    The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the
+    remainder of the story. 1883 Western Siberia,
+    a young Grigori Rasputin is asked by his father and a group of men to perform magic.
+    Rasputin has a vision and denounces one of the men as a horse thief. Although his
+    father initially slaps him for making such an accusation, Rasputin watches as the
+    man is chased outside and beaten. Twenty years later, Rasputin sees a vision of
+    the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous,
+    with people, even a bishop, begging for his blessing. <eod> </s> <eos>""" 

-    input = tokenizer.encode(sequence, return_tensors="pt")
-    generated = model.generate(input, max_length=50)
+    prompt = "Today the weather is really nice and I am planning on "
+    inputs = tokenizer.encode(PADDING_TEXT + prompt, add_special_tokens=False, return_tensors="pt")
+    
+    prompt_length = len(tokenizer.decode(inputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True))
+    outputs = model.generate(inputs, max_length=250, do_sample=True, top_p=0.95, top_k=60)
+    generated = prompt + tokenizer.decode(outputs[0])[prompt_length:]

-    resulting_string = tokenizer.decode(generated.tolist()[0])
-    print(resulting_string)
+    print(generated)
    ## TENSORFLOW CODE
    from transformers import TFAutoModelWithLMHead, AutoTokenizer
-    import tensorflow as tf

-    tokenizer = AutoTokenizer.from_pretrained("gpt2")
-    model = TFAutoModelWithLMHead.from_pretrained("gpt2")
+    model = TFAutoModelWithLMHead.from_pretrained("xlnet-base-cased")
+    tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")

-    sequence = f"Hugging Face is based in DUMBO, New York City, and is"
-    generated = tokenizer.encode(sequence)
+    # Padding text helps XLNet with short prompts - proposed by Aman Rusia in https://github.com/rusiaaman/XLNet-gen#methodology
+    PADDING_TEXT = """In 1991, the remains of Russian Tsar Nicholas II and his family
+    (except for Alexei and Maria) are discovered.
+    The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the
+    remainder of the story. 1883 Western Siberia,
+    a young Grigori Rasputin is asked by his father and a group of men to perform magic.
+    Rasputin has a vision and denounces one of the men as a horse thief. Although his
+    father initially slaps him for making such an accusation, Rasputin watches as the
+    man is chased outside and beaten. Twenty years later, Rasputin sees a vision of
+    the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous,
+    with people, even a bishop, begging for his blessing. <eod> </s> <eos>""" 

-    for i in range(50):
-        predictions = model(tf.constant([generated]))[0]
-        token = tf.argmax(predictions[0], axis=1)[-1].numpy()
-        generated += [token]
+    prompt = "Today the weather is really nice and I am planning on "
+    inputs = tokenizer.encode(PADDING_TEXT + prompt, add_special_tokens=False, return_tensors="tf")

-    resulting_string = tokenizer.decode(generated)
-    print(resulting_string)
+    prompt_length = len(tokenizer.decode(inputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True))
+    outputs = model.generate(inputs, max_length=250, do_sample=True, top_p=0.95, top_k=60)
+    generated = prompt + tokenizer.decode(outputs[0])[prompt_length:]

+    print(generated)

-This outputs a (hopefully) coherent string from the original sequence, as the
-:func:`~transformers.PreTrainedModel.generate` samples from a top_p/tok_k distribution:
+Text generation is currently possible with *GPT-2*, *OpenAI-GPT*, *CTRL*, *XLNet*, *Transfo-XL* and *Reformer* in PyTorch and for most models in TensorFlow as well. As can be seen in the example above, *XLNet* and *Transfo-XL* often need to be padded to work well.
+GPT-2 is usually a good choice for *open-ended text generation* because it was trained on millions of webpages with a causal language modeling objective.

-::
-
-    Hugging Face is based in DUMBO, New York City, and is a live-action TV series based on the novel by John
-    Carpenter, and its producers, David Kustlin and Steve Pichar. The film is directed by!
+For more information on how to apply different decoding strategies for text generation, please also refer to our generation blog post `here <https://huggingface.co/blog/how-to-generate>`_.


 Named Entity Recognition
@@ -594,4 +692,138 @@ following array should be the output:

 ::

-    [('[CLS]', 'O'), ('Hu', 'I-ORG'), ('##gging', 'I-ORG'), ('Face', 'I-ORG'), ('Inc', 'I-ORG'), ('.', 'O'), ('is', 'O'), ('a', 'O'), ('company', 'O'), ('based', 'O'), ('in', 'O'), ('New', 'I-LOC'), ('York', 'I-LOC'), ('City', 'I-LOC'), ('.', 'O'), ('Its', 'O'), ('headquarters', 'O'), ('are', 'O'), ('in', 'O'), ('D', 'I-LOC'), ('##UM', 'I-LOC'), ('##BO', 'I-LOC'), (',', 'O'), ('therefore', 'O'), ('very', 'O'), ('##c', 'O'), ('##lose', 'O'), ('to', 'O'), ('the', 'O'), ('Manhattan', 'I-LOC'), ('Bridge', 'I-LOC'), ('.', 'O'), ('[SEP]', 'O')]
+    [('[CLS]', 'O'), ('Hu', 'I-ORG'), ('##gging', 'I-ORG'), ('Face', 'I-ORG'), ('Inc', 'I-ORG'), ('.', 'O'), ('is', 'O'), ('a', 'O'), ('company', 'O'), ('based', 'O'), ('in', 'O'), ('New', 'I-LOC'), ('York', 'I-LOC'), ('City', 'I-LOC'), ('.', 'O'), ('Its', 'O'), ('headquarters', 'O'), ('are', 'O'), ('in', 'O'), ('D', 'I-LOC'), ('##UM', 'I-LOC'), ('##BO', 'I-LOC'), (',', 'O'), ('therefore', 'O'), ('very', 'O'), ('##c', 'O'), ('##lose', 'O'), ('to', 'O'), ('the', 'O'), ('Manhattan', 'I-LOC'), ('Bridge', 'I-LOC'), ('.', 'O'), ('[SEP]', 'O')]   
+Summarization
+----------------------------------------------------
+
+Summarization is the task of summarizing a text / an article into a shorter text.
+
+An example of a summarization dataset is the CNN / Daily Mail dataset, which consists of long news articles and was created for the task of summarization.
+If you would like to fine-tune a model on a summarization task, you may leverage the ``examples/summarization/bart/run_train.sh`` (leveraging pytorch-lightning) script.
+
+Here is an example using the pipelines to do summarization.
+It leverages a Bart model that was fine-tuned on the CNN / Daily Mail data set.
+
+::
+
+    from transformers import pipeline
+
+    summarizer = pipeline("summarization")
+
+    ARTICLE = """ New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York. 
+    A year later, she got married again in Westchester County, but to a different man and without divorcing her first husband. 
+    Only 18 days after that marriage, she got hitched yet again. Then, Barrientos declared "I do" five more times, sometimes only within two weeks of each other. 
+    In 2010, she married once more, this time in the Bronx. In an application for a marriage license, she stated it was her "first and only" marriage. 
+    Barrientos, now 39, is facing two criminal counts of "offering a false instrument for filing in the first degree," referring to her false statements on the 
+    2010 marriage license application, according to court documents. 
+    Prosecutors said the marriages were part of an immigration scam. 
+    On Friday, she pleaded not guilty at State Supreme Court in the Bronx, according to her attorney, Christopher Wright, who declined to comment further. 
+    After leaving court, Barrientos was arrested and charged with theft of service and criminal trespass for allegedly sneaking into the New York subway through an emergency exit, said Detective 
+    Annette Markowski, a police spokeswoman. In total, Barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002. 
+    All occurred either in Westchester County, Long Island, New Jersey or the Bronx. She is believed to still be married to four men, and at one time, she was married to eight men at once, prosecutors say. 
+    Prosecutors said the immigration scam involved some of her husbands, who filed for permanent residence status shortly after the marriages. 
+    Any divorces happened only after such filings were approved. It was unclear whether any of the men will be prosecuted. 
+    The case was referred to the Bronx District Attorney\'s Office by Immigration and Customs Enforcement and the Department of Homeland Security\'s 
+    Investigation Division. Seven of the men are from so-called "red-flagged" countries, including Egypt, Turkey, Georgia, Pakistan and Mali. 
+    Her eighth husband, Rashid Rajput, was deported in 2006 to his native Pakistan after an investigation by the Joint Terrorism Task Force. 
+    If convicted, Barrientos faces up to four years in prison.  Her next court appearance is scheduled for May 18.
+    """
+    
+    print(summarizer(ARTICLE, max_length=130, min_length=30))
+
+Because the summarization pipeline depends on the ``PreTrainedModel.generate()`` method, we can override the default arguments
+of ``PreTrainedModel.generate()`` directly in the pipeline, as is shown for ``max_length`` and ``min_length`` above.
+This outputs the following summary:
+
+::
+
+  Liana Barrientos has been married 10 times, sometimes within two weeks of each other. Prosecutors say the marriages were part of an immigration scam. She pleaded not guilty at State Supreme Court in the Bronx on Friday.
+  
+Here is an example doing summarization using a model and a tokenizer. The process is the following:
+
+- Instantiate a tokenizer and a model from the checkpoint name. Summarization is usually done using an encoder-decoder model, such as ``Bart`` or ``T5``.
+- Define the article that should be summarized.
+- Leverage the ``PreTrainedModel.generate()`` method.
+- Add the T5 specific prefix "summarize: ".
+
+Here Google's T5 model is used. It was only pre-trained on a multi-task mixed data set (including CNN / Daily Mail), but nevertheless yields very good results.
+::
+
+    ## PYTORCH CODE
+    from transformers import AutoModelWithLMHead, AutoTokenizer
+
+    model = AutoModelWithLMHead.from_pretrained("t5-base")
+    tokenizer = AutoTokenizer.from_pretrained("t5-base")
+
+    # T5 uses a max_length of 512 so we cut the article to 512 tokens.
+    inputs = tokenizer.encode("summarize: " + ARTICLE, return_tensors="pt", max_length=512)
+    outputs = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
+    print(outputs)
+    
+    ## TENSORFLOW CODE
+    from transformers import TFAutoModelWithLMHead, AutoTokenizer
+
+    model = TFAutoModelWithLMHead.from_pretrained("t5-base")
+    tokenizer = AutoTokenizer.from_pretrained("t5-base")
+
+    # T5 uses a max_length of 512 so we cut the article to 512 tokens.
+    inputs = tokenizer.encode("summarize: " + ARTICLE, return_tensors="tf", max_length=512)
+    outputs = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
+    print(outputs)  
+Translation
+----------------------------------------------------
+
+Translation is the task of translating a text from one language to another.
+
+An example of a translation dataset is the WMT English to German dataset, which has English sentences as the input data 
+and German sentences as the target data.
+
+Here is an example using the pipelines to do translation.
+It leverages a T5 model that was only pre-trained on a multi-task mixture dataset (including WMT), but yields impressive 
+translation results nevertheless.
+
+::
+
+    from transformers import pipeline
+
+    translator = pipeline("translation_en_to_de")
+    print(translator("Hugging Face is a technology company based in New York and Paris", max_length=40))
+
+Because the translation pipeline depends on the ``PreTrainedModel.generate()`` method, we can override the default arguments
+of ``PreTrainedModel.generate()`` directly in the pipeline, as is shown for ``max_length`` above.
+This outputs the following translation into German:
+
+::
+
+  Hugging Face ist ein Technologieunternehmen mit Sitz in New York und Paris.
+  
+Here is an example doing translation using a model and a tokenizer. The process is the following:
+
+- Instantiate a tokenizer and a model from the checkpoint name. Translation is usually done using an encoder-decoder model, such as ``Bart`` or ``T5``.
+- Define the text that should be translated.
+- Leverage the ``PreTrainedModel.generate()`` method.
+- Add the T5 specific prefix "translate English to German: "
+
+::
+
+    ## PYTORCH CODE
+    from transformers import AutoModelWithLMHead, AutoTokenizer
+
+    model = AutoModelWithLMHead.from_pretrained("t5-base")
+    tokenizer = AutoTokenizer.from_pretrained("t5-base")
+
+    inputs = tokenizer.encode("translate English to German: Hugging Face is a technology company based in New York and Paris", return_tensors="pt")
+    outputs = model.generate(inputs, max_length=40, num_beams=4, early_stopping=True)
+
+    print(outputs)
+    
+    ## TENSORFLOW CODE
+    from transformers import TFAutoModelWithLMHead, AutoTokenizer
+
+    model = TFAutoModelWithLMHead.from_pretrained("t5-base")
+    tokenizer = AutoTokenizer.from_pretrained("t5-base")
+
+    inputs = tokenizer.encode("translate English to German: Hugging Face is a technology company based in New York and Paris", return_tensors="tf")
+    outputs = model.generate(inputs, max_length=40, num_beams=4, early_stopping=True)
+
+    print(outputs)
--- a/examples/README.md
+++ b/examples/README.md
@@ -1,10 +1,41 @@
-# Examples
+## Examples

-In this section a few examples are put together. All of these examples work for several models, making use of the very
-similar API between the different models.
+Version 2.9 of `transformers` introduces a new [`Trainer`](https://github.com/huggingface/transformers/blob/master/src/transformers/trainer.py) class for PyTorch, and its equivalent [`TFTrainer`](https://github.com/huggingface/transformers/blob/master/src/transformers/trainer_tf.py) for TF 2.
+Running the examples requires PyTorch 1.3.1+ or TensorFlow 2.0+.
+
+Here is the list of all our examples:
+- **grouped by task** (all official examples work for multiple models)
+- with information on whether they are **built on top of `Trainer`/`TFTrainer`** (if not, they still work, they might just lack some features),
+- whether they also include examples for **`pytorch-lightning`**, which is a great fully-featured, general-purpose training library for PyTorch,
+- links to **Colab notebooks** to walk through the scripts and run them easily,
+- links to **Cloud deployments** to be able to deploy large-scale trainings in the Cloud with little to no setup.
+
+This is still a work-in-progress – in particular documentation is still sparse – so please **contribute improvements/pull requests.**
+
+
+# The Big Table of Tasks
+
+| Task | Example datasets | Trainer support | TFTrainer support | pytorch-lightning | Colab
+|---|---|:---:|:---:|:---:|:---:|
+| [**`language-modeling`**](https://github.com/huggingface/transformers/tree/master/examples/language-modeling)       | Raw text        | ✅ | -  | -  | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb)
+| [**`text-classification`**](https://github.com/huggingface/transformers/tree/master/examples/text-classification)   | GLUE, XNLI      | ✅ | ✅ | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/trainer/01_text_classification.ipynb)
+| [**`token-classification`**](https://github.com/huggingface/transformers/tree/master/examples/token-classification) | CoNLL NER       | ✅ | ✅ | ✅ | -
+| [**`multiple-choice`**](https://github.com/huggingface/transformers/tree/master/examples/multiple-choice)           | SWAG, RACE, ARC | ✅ | ✅ | -  | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ViktorAlm/notebooks/blob/master/MPC_GPU_Demo_for_TF_and_PT.ipynb)
+| [**`question-answering`**](https://github.com/huggingface/transformers/tree/master/examples/question-answering)     | SQuAD           | -  | ✅ | -  | -
+| [**`text-generation`**](https://github.com/huggingface/transformers/tree/master/examples/text-generation)     | -           | -  | - | -  | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/02_how_to_generate.ipynb)
+| [**`distillation`**](https://github.com/huggingface/transformers/tree/master/examples/distillation)       | All               | -  | -  | -  | -
+| [**`summarization`**](https://github.com/huggingface/transformers/tree/master/examples/summarization)     | CNN/Daily Mail    | -  | -  | -  | -
+| [**`translation`**](https://github.com/huggingface/transformers/tree/master/examples/translation)         | WMT               | -  | -  | -  | -
+| [**`bertology`**](https://github.com/huggingface/transformers/tree/master/examples/bertology)             | -                 | -  | -  | -  | -
+| [**`adversarial`**](https://github.com/huggingface/transformers/tree/master/examples/adversarial)         | HANS              | -  | -  | -  | -
+
+
+<br>
+
+## Important note

 **Important**
-To run the latest versions of the examples, you have to install from source and install some specific requirements for the examples.
+To make sure you can successfully run the latest versions of the example scripts, you have to install the library from source and install some example-specific requirements.
 Execute the following steps in a new virtual environment:

 ```bash
@@ -14,608 +45,36 @@ pip install .
 pip install -r ./examples/requirements.txt
 ```

-| Section                    | Description                                                                                                                                                |
-|----------------------------|------------------------------------------------------------------------------------------------------------------------------------------
-| [TensorFlow 2.0 models on GLUE](#TensorFlow-2.0-Bert-models-on-GLUE) | Examples running BERT TensorFlow 2.0 model on the GLUE tasks. |
-| [Language Model training](#language-model-training) | Fine-tuning (or training from scratch) the library models for language modeling on a text dataset. Causal language modeling for GPT/GPT-2, masked language modeling for BERT/RoBERTa. |
-| [Language Generation](#language-generation) | Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, Transformer-XL and XLNet. |
-| [GLUE](#glue) | Examples running BERT/XLM/XLNet/RoBERTa on the 9 GLUE tasks. Examples feature distributed training as well as half-precision. |
-| [SQuAD](#squad) | Using BERT/RoBERTa/XLNet/XLM for question answering, examples with distributed training. |
-| [Multiple Choice](#multiple-choice) | Examples running BERT/XLNet/RoBERTa on the SWAG/RACE/ARC tasks. |
-| [Named Entity Recognition](https://github.com/huggingface/transformers/tree/master/examples/ner) | Using BERT for Named Entity Recognition (NER) on the CoNLL 2003 dataset, examples with distributed training. |
-| [XNLI](#xnli) | Examples running BERT/XLM on the XNLI benchmark. |
-| [Adversarial evaluation of model performances](#adversarial-evaluation-of-model-performances) | Testing a model with adversarial evaluation of natural language inference on the Heuristic Analysis for NLI Systems (HANS) dataset (McCoy et al., 2019.) |
+## One-click Deploy to Cloud (wip)

-## TensorFlow 2.0 Bert models on GLUE
+#### Azure

-Based on the script [`run_tf_glue.py`](https://github.com/huggingface/transformers/blob/master/examples/run_tf_glue.py).
+[![Deploy to Azure](https://aka.ms/deploytoazurebutton)](https://portal.azure.com/#create/Microsoft.Template/uri/https%3A%2F%2Fraw.githubusercontent.com%2FAzure%2Fazure-quickstart-templates%2Fmaster%2F101-storage-account-create%2Fazuredeploy.json)

-Fine-tuning the library TensorFlow 2.0 Bert model for sequence classification on the  MRPC task of the GLUE benchmark: [General Language Understanding Evaluation](https://gluebenchmark.com/).
+## Running on TPUs

-This script has an option for mixed precision (Automatic Mixed Precision / AMP) to run models on Tensor Cores (NVIDIA Volta/Turing GPUs) and future hardware and an option for XLA, which uses the XLA compiler to reduce model runtime.
-Options are toggled using `USE_XLA` or `USE_AMP` variables in the script.
-These options and the below benchmark are provided by @tlkh.
+When using TensorFlow, TPUs are supported out of the box as a `tf.distribute.Strategy`.

-Quick benchmarks from the script (no other modifications):
+When using PyTorch, we support TPUs thanks to `pytorch/xla`. For more context and information on how to set up your TPU environment refer to Google's documentation and to the
+very detailed [pytorch/xla README](https://github.com/pytorch/xla/blob/master/README.md).

-| GPU    | Mode | Time (2nd epoch) | Val Acc (3 runs) |
-| --------- | -------- | ----------------------- | ----------------------|
-| Titan V | FP32 | 41s | 0.8438/0.8281/0.8333 |
-| Titan V | AMP | 26s | 0.8281/0.8568/0.8411 |
-| V100    | FP32 | 35s | 0.8646/0.8359/0.8464 |
-| V100    | AMP | 22s | 0.8646/0.8385/0.8411 |
-| 1080 Ti | FP32 | 55s | - |
+In this repo, we provide a very simple launcher script named [xla_spawn.py](https://github.com/huggingface/transformers/tree/master/examples/xla_spawn.py) that lets you run our example scripts on multiple TPU cores without any boilerplate.
+Just pass a `--num_cores` flag to this script, then your regular training script with its arguments (this is similar to the `torch.distributed.launch` helper for torch.distributed).

-Mixed precision (AMP) reduces the training time considerably for the same hardware and hyper-parameters (same batch size was used).
-
-## Language model training
-
-Based on the script [`run_language_modeling.py`](https://github.com/huggingface/transformers/blob/master/examples/run_language_modeling.py).
-
-Fine-tuning (or training from scratch) the library models for language modeling on a text dataset for GPT, GPT-2, BERT and RoBERTa (DistilBERT 
-to be added soon). GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa 
-are fine-tuned using a masked language modeling (MLM) loss.
-
-Before running the following example, you should get a file that contains text on which the language model will be
-trained or fine-tuned. A good example of such text is the [WikiText-2 dataset](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/).
-
-We will refer to two different files: `$TRAIN_FILE`, which contains text for training, and `$TEST_FILE`, which contains
-text that will be used for evaluation.
-
-### GPT-2/GPT and causal language modeling
-
-The following example fine-tunes GPT-2 on WikiText-2. We're using the raw WikiText-2 (no tokens were replaced before
-the tokenization). The loss here is that of causal language modeling.
+For example for `run_glue`:

 ```bash
-export TRAIN_FILE=/path/to/dataset/wiki.train.raw
-export TEST_FILE=/path/to/dataset/wiki.test.raw
-
-python run_language_modeling.py \
-    --output_dir=output \
-    --model_type=gpt2 \
-    --model_name_or_path=gpt2 \
-    --do_train \
-    --train_data_file=$TRAIN_FILE \
-    --do_eval \
-    --eval_data_file=$TEST_FILE
+python examples/xla_spawn.py --num_cores 8 \
+	examples/text-classification/run_glue.py \
+	--model_name_or_path bert-base-cased \
+	--task_name mnli \
+	--data_dir ./data/glue_data/MNLI \
+	--output_dir ./models/tpu \
+	--overwrite_output_dir \
+	--do_train \
+	--do_eval \
+	--num_train_epochs 1 \
+	--save_steps 20000
 ```

-This takes about half an hour to train on a single K80 GPU and about one minute for the evaluation to run. It reaches
-a score of ~20 perplexity once fine-tuned on the dataset.
-
-### RoBERTa/BERT and masked language modeling
-
-The following example fine-tunes RoBERTa on WikiText-2. Here too, we're using the raw WikiText-2. The loss is different
-as BERT/RoBERTa have a bidirectional mechanism; we're therefore using the same loss that was used during their
-pre-training: masked language modeling.
-
-In accordance to the RoBERTa paper, we use dynamic masking rather than static masking. The model may, therefore, converge
-slightly slower (over-fitting takes more epochs).
-
-We use the `--mlm` flag so that the script may change its loss function.
-
-```bash
-export TRAIN_FILE=/path/to/dataset/wiki.train.raw
-export TEST_FILE=/path/to/dataset/wiki.test.raw
-
-python run_language_modeling.py \
-    --output_dir=output \
-    --model_type=roberta \
-    --model_name_or_path=roberta-base \
-    --do_train \
-    --train_data_file=$TRAIN_FILE \
-    --do_eval \
-    --eval_data_file=$TEST_FILE \
-    --mlm
-```
-
-## Language generation
-
-Based on the script [`run_generation.py`](https://github.com/huggingface/transformers/blob/master/examples/run_generation.py).
-
-Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, Transformer-XL, XLNet, CTRL.
-A similar script is used for our official demo [Write With Transfomer](https://transformer.huggingface.co), where you
-can try out the different models available in the library.
-
-Example usage:
-
-```bash
-python run_generation.py \
-    --model_type=gpt2 \
-    --model_name_or_path=gpt2
-```
-
-## GLUE
-
-Based on the script [`run_glue.py`](https://github.com/huggingface/transformers/blob/master/examples/run_glue.py).
-
-Fine-tuning the library models for sequence classification on the GLUE benchmark: [General Language Understanding
-Evaluation](https://gluebenchmark.com/). This script can fine-tune the following models: BERT, XLM, XLNet and RoBERTa.
-
-GLUE is made up of a total of 9 different tasks. We get the following results on the dev set of the benchmark with an
-uncased  BERT base model (the checkpoint `bert-base-uncased`). All experiments ran single V100 GPUs with a total train
-batch sizes between 16 and 64. Some of these tasks have a small dataset and training can lead to high variance in the results
-between different runs. We report the median on 5 runs (with different seeds) for each of the metrics.
-
-| Task  | Metric                       | Result      |
-|-------|------------------------------|-------------|
-| CoLA  | Matthew's corr               | 49.23       |
-| SST-2 | Accuracy                     | 91.97       |
-| MRPC  | F1/Accuracy                  | 89.47/85.29 |
-| STS-B | Person/Spearman corr.        | 83.95/83.70 |
-| QQP   | Accuracy/F1                  | 88.40/84.31 |
-| MNLI  | Matched acc./Mismatched acc. | 80.61/81.08 |
-| QNLI  | Accuracy                     | 87.46       |
-| RTE   | Accuracy                     | 61.73       |
-| WNLI  | Accuracy                     | 45.07       |
-
-Some of these results are significantly different from the ones reported on the test set
-of GLUE benchmark on the website. For QQP and WNLI, please refer to [FAQ #12](https://gluebenchmark.com/faq) on the webite.
-
-Before running any one of these GLUE tasks you should download the
-[GLUE data](https://gluebenchmark.com/tasks) by running
-[this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e)
-and unpack it to some directory `$GLUE_DIR`.
-
-```bash
-export GLUE_DIR=/path/to/glue
-export TASK_NAME=MRPC
-
-python run_glue.py \
-  --model_type bert \
-  --model_name_or_path bert-base-cased \
-  --task_name $TASK_NAME \
-  --do_train \
-  --do_eval \
-  --do_lower_case \
-  --data_dir $GLUE_DIR/$TASK_NAME \
-  --max_seq_length 128 \
-  --per_gpu_train_batch_size 32 \
-  --learning_rate 2e-5 \
-  --num_train_epochs 3.0 \
-  --output_dir /tmp/$TASK_NAME/
-```
-
-where task name can be one of CoLA, SST-2, MRPC, STS-B, QQP, MNLI, QNLI, RTE, WNLI.
-
-The dev set results will be present within the text file `eval_results.txt` in the specified output_dir.
-In case of MNLI, since there are two separate dev sets (matched and mismatched), there will be a separate
-output folder called `/tmp/MNLI-MM/` in addition to `/tmp/MNLI/`.
-
-The code has not been tested with half-precision training with apex on any GLUE task apart from MRPC, MNLI,
-CoLA, SST-2. The following section provides details on how to run half-precision training with MRPC. With that being
-said, there shouldn’t be any issues in running half-precision training with the remaining GLUE tasks as well,
-since the data processor for each task inherits from the base class DataProcessor.
-
-### MRPC
-
-#### Fine-tuning example
-
-The following examples fine-tune BERT on the Microsoft Research Paraphrase Corpus (MRPC) corpus and runs in less
-than 10 minutes on a single K-80 and in 27 seconds (!) on single tesla V100 16GB with apex installed.
-
-Before running any one of these GLUE tasks you should download the
-[GLUE data](https://gluebenchmark.com/tasks) by running
-[this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e)
-and unpack it to some directory `$GLUE_DIR`.
-
-```bash
-export GLUE_DIR=/path/to/glue
-
-python run_glue.py \
-  --model_type bert \
-  --model_name_or_path bert-base-cased \
-  --task_name MRPC \
-  --do_train \
-  --do_eval \
-  --do_lower_case \
-  --data_dir $GLUE_DIR/MRPC/ \
-  --max_seq_length 128 \
-  --per_gpu_train_batch_size 32 \
-  --learning_rate 2e-5 \
-  --num_train_epochs 3.0 \
-  --output_dir /tmp/mrpc_output/
-```
-
-Our test ran on a few seeds with [the original implementation hyper-
-parameters](https://github.com/google-research/bert#sentence-and-sentence-pair-classification-tasks) gave evaluation
-results between 84% and 88%.
-
-#### Using Apex and mixed-precision
-
-Using Apex and 16 bit precision, the fine-tuning on MRPC only takes 27 seconds. First install
-[apex](https://github.com/NVIDIA/apex), then run the following example:
-
-```bash
-export GLUE_DIR=/path/to/glue
-
-python run_glue.py \
-  --model_type bert \
-  --model_name_or_path bert-base-cased \
-  --task_name MRPC \
-  --do_train \
-  --do_eval \
-  --do_lower_case \
-  --data_dir $GLUE_DIR/MRPC/ \
-  --max_seq_length 128 \
-  --per_gpu_train_batch_size 32 \
-  --learning_rate 2e-5 \
-  --num_train_epochs 3.0 \
-  --output_dir /tmp/mrpc_output/ \
-  --fp16
-```
-
-#### Distributed training
-
-Here is an example using distributed training on 8 V100 GPUs. The model used is the BERT whole-word-masking and it
-reaches F1 > 92 on MRPC.
-
-```bash
-export GLUE_DIR=/path/to/glue
-
-python -m torch.distributed.launch \
-    --nproc_per_node 8 run_glue.py \
-    --model_type bert \
-    --model_name_or_path bert-base-cased \
-    --task_name MRPC \
-    --do_train \
-    --do_eval \
-    --do_lower_case \
-    --data_dir $GLUE_DIR/MRPC/ \
-    --max_seq_length 128 \
-    --per_gpu_train_batch_size 8 \
-    --learning_rate 2e-5 \
-    --num_train_epochs 3.0 \
-    --output_dir /tmp/mrpc_output/
-```
-
-Training with these hyper-parameters gave us the following results:
-
-```bash
-acc = 0.8823529411764706
-acc_and_f1 = 0.901702786377709
-eval_loss = 0.3418912578906332
-f1 = 0.9210526315789473
-global_step = 174
-loss = 0.07231863956341798
-```
-
-### MNLI
-
-The following example uses the BERT-large, uncased, whole-word-masking model and fine-tunes it on the MNLI task.
-
-```bash
-export GLUE_DIR=/path/to/glue
-
-python -m torch.distributed.launch \
-    --nproc_per_node 8 run_glue.py \
-    --model_type bert \
-    --model_name_or_path bert-base-cased \
-    --task_name mnli \
-    --do_train \
-    --do_eval \
-    --do_lower_case \
-    --data_dir $GLUE_DIR/MNLI/ \
-    --max_seq_length 128 \
-    --per_gpu_train_batch_size 8 \
-    --learning_rate 2e-5 \
-    --num_train_epochs 3.0 \
-    --output_dir output_dir \
-```
-
-The results  are the following:
-
-```bash
-***** Eval results *****
-  acc = 0.8679706601466992
-  eval_loss = 0.4911287787382479
-  global_step = 18408
-  loss = 0.04755385363816904
-
-***** Eval results *****
-  acc = 0.8747965825874695
-  eval_loss = 0.45516540421714036
-  global_step = 18408
-  loss = 0.04755385363816904
-```
-
-## Multiple Choice
-
-Based on the script [`run_multiple_choice.py`]().
-
-#### Fine-tuning on SWAG
-Download [swag](https://github.com/rowanz/swagaf/tree/master/data) data
-
-```bash
-#training on 4 tesla V100(16GB) GPUS
-export SWAG_DIR=/path/to/swag_data_dir
-python ./examples/run_multiple_choice.py \
--model_type roberta \
--task_name swag \
--model_name_or_path roberta-base \
--do_train \
--do_eval \
--do_lower_case \
--data_dir $SWAG_DIR \
--learning_rate 5e-5 \
--num_train_epochs 3 \
--max_seq_length 80 \
--output_dir models_bert/swag_base \
--per_gpu_eval_batch_size=16 \
--per_gpu_train_batch_size=16 \
--gradient_accumulation_steps 2 \
--overwrite_output
-```
-Training with the defined hyper-parameters yields the following results:
-```
-***** Eval results *****
-eval_acc = 0.8338998300509847
-eval_loss = 0.44457291918821606
-```
-
-## SQuAD
-
-Based on the script [`run_squad.py`](https://github.com/huggingface/transformers/blob/master/examples/run_squad.py).
-
-#### Fine-tuning BERT on SQuAD1.0
-
-This example code fine-tunes BERT on the SQuAD1.0 dataset. It runs in 24 min (with BERT-base) or 68 min (with BERT-large)
-on a single tesla V100 16GB. The data for SQuAD can be downloaded with the following links and should be saved in a
-$SQUAD_DIR directory.
-
-* [train-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json)
-* [dev-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json)
-* [evaluate-v1.1.py](https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py)
-
-And for SQuAD2.0, you need to download:
-
- [train-v2.0.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json)
- [dev-v2.0.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json)
- [evaluate-v2.0.py](https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/)
-
-```bash
-export SQUAD_DIR=/path/to/SQUAD
-
-python run_squad.py \
-  --model_type bert \
-  --model_name_or_path bert-base-uncased \
-  --do_train \
-  --do_eval \
-  --do_lower_case \
-  --train_file $SQUAD_DIR/train-v1.1.json \
-  --predict_file $SQUAD_DIR/dev-v1.1.json \
-  --per_gpu_train_batch_size 12 \
-  --learning_rate 3e-5 \
-  --num_train_epochs 2.0 \
-  --max_seq_length 384 \
-  --doc_stride 128 \
-  --output_dir /tmp/debug_squad/
-```
-
-Training with the previously defined hyper-parameters yields the following results:
-
-```bash
-f1 = 88.52
-exact_match = 81.22
-```
-
-#### Distributed training
-
-
-Here is an example using distributed training on 8 V100 GPUs and Bert Whole Word Masking uncased model to reach a F1 > 93 on SQuAD1.1:
-
-```bash
-python -m torch.distributed.launch --nproc_per_node=8 ./examples/run_squad.py \
-    --model_type bert \
-    --model_name_or_path bert-large-uncased-whole-word-masking \
-    --do_train \
-    --do_eval \
-    --do_lower_case \
-    --train_file $SQUAD_DIR/train-v1.1.json \
-    --predict_file $SQUAD_DIR/dev-v1.1.json \
-    --learning_rate 3e-5 \
-    --num_train_epochs 2 \
-    --max_seq_length 384 \
-    --doc_stride 128 \
-    --output_dir ./examples/models/wwm_uncased_finetuned_squad/ \
-    --per_gpu_eval_batch_size=3   \
-    --per_gpu_train_batch_size=3   \
-```
-
-Training with the previously defined hyper-parameters yields the following results:
-
-```bash
-f1 = 93.15
-exact_match = 86.91
-```
-
-This fine-tuned model is available as a checkpoint under the reference
-`bert-large-uncased-whole-word-masking-finetuned-squad`.
-
-#### Fine-tuning XLNet on SQuAD
-
-This example code fine-tunes XLNet on both SQuAD1.0 and SQuAD2.0 dataset. See above to download the data for SQuAD .
-
-##### Command for SQuAD1.0:
-
-```bash
-export SQUAD_DIR=/path/to/SQUAD
-
-python run_squad.py \
-    --model_type xlnet \
-    --model_name_or_path xlnet-large-cased \
-    --do_train \
-    --do_eval \
-    --do_lower_case \
-    --train_file $SQUAD_DIR/train-v1.1.json \
-    --predict_file $SQUAD_DIR/dev-v1.1.json \
-    --learning_rate 3e-5 \
-    --num_train_epochs 2 \
-    --max_seq_length 384 \
-    --doc_stride 128 \
-    --output_dir ./wwm_cased_finetuned_squad/ \
-    --per_gpu_eval_batch_size=4  \
-    --per_gpu_train_batch_size=4   \
-    --save_steps 5000
-```
-
-##### Command for SQuAD2.0:
-
-```bash
-export SQUAD_DIR=/path/to/SQUAD
-
-python run_squad.py \
-    --model_type xlnet \
-    --model_name_or_path xlnet-large-cased \
-    --do_train \
-    --do_eval \
-    --version_2_with_negative \
-    --train_file $SQUAD_DIR/train-v2.0.json \
-    --predict_file $SQUAD_DIR/dev-v2.0.json \
-    --learning_rate 3e-5 \
-    --num_train_epochs 4 \
-    --max_seq_length 384 \
-    --doc_stride 128 \
-    --output_dir ./wwm_cased_finetuned_squad/ \
-    --per_gpu_eval_batch_size=2  \
-    --per_gpu_train_batch_size=2   \
-    --save_steps 5000
-```
-
-Larger batch size may improve the performance while costing more memory.
-
-##### Results for SQuAD1.0 with the previously defined hyper-parameters:
-
-```python
-{
-"exact": 85.45884578997162,
-"f1": 92.5974600601065,
-"total": 10570,
-"HasAns_exact": 85.45884578997162,
-"HasAns_f1": 92.59746006010651,
-"HasAns_total": 10570
-}
-```
-
-##### Results for SQuAD2.0 with the previously defined hyper-parameters:
-
-```python
-{
-"exact": 80.4177545691906,
-"f1": 84.07154997729623,
-"total": 11873,
-"HasAns_exact": 76.73751686909581,
-"HasAns_f1": 84.05558584352873,
-"HasAns_total": 5928,
-"NoAns_exact": 84.0874684608915,
-"NoAns_f1": 84.0874684608915,
-"NoAns_total": 5945
-}
-```
-
-
-
-
-## XNLI
-
-Based on the script [`run_xnli.py`](https://github.com/huggingface/transformers/blob/master/examples/run_xnli.py).
-
-[XNLI](https://www.nyu.edu/projects/bowman/xnli/) is crowd-sourced dataset based on [MultiNLI](http://www.nyu.edu/projects/bowman/multinli/). It is an evaluation benchmark for cross-lingual text representations. Pairs of text are labeled with textual entailment annotations for 15 different languages (including both high-resource language such as English and low-resource languages such as Swahili).
-
-#### Fine-tuning on XNLI
-
-This example code fine-tunes mBERT (multi-lingual BERT) on the XNLI dataset. It runs in 106 mins
-on a single tesla V100 16GB. The data for XNLI can be downloaded with the following links and should be both saved (and un-zipped) in a
-`$XNLI_DIR` directory.
-
-* [XNLI 1.0](https://www.nyu.edu/projects/bowman/xnli/XNLI-1.0.zip)
-* [XNLI-MT 1.0](https://www.nyu.edu/projects/bowman/xnli/XNLI-MT-1.0.zip)
-
-```bash
-export XNLI_DIR=/path/to/XNLI
-
-python run_xnli.py \
-  --model_type bert \
-  --model_name_or_path bert-base-multilingual-cased \
-  --language de \
-  --train_language en \
-  --do_train \
-  --do_eval \
-  --data_dir $XNLI_DIR \
-  --per_gpu_train_batch_size 32 \
-  --learning_rate 5e-5 \
-  --num_train_epochs 2.0 \
-  --max_seq_length 128 \
-  --output_dir /tmp/debug_xnli/ \
-  --save_steps -1
-```
-
-Training with the previously defined hyper-parameters yields the following results on the **test** set:
-
-```bash
-acc = 0.7093812375249501
-```
-
-## MM-IMDb
-
-Based on the script [`run_mmimdb.py`](https://github.com/huggingface/transformers/blob/master/examples/mm-imdb/run_mmimdb.py).
-
-[MM-IMDb](http://lisi1.unal.edu.co/mmimdb/) is a Multimodal dataset with around 26,000 movies including images, plots and other metadata.
-
-### Training on MM-IMDb
-
-```
-python run_mmimdb.py \
-    --data_dir /path/to/mmimdb/dataset/ \
-    --model_type bert \
-    --model_name_or_path bert-base-uncased \
-    --output_dir /path/to/save/dir/ \
-    --do_train \
-    --do_eval \
-    --max_seq_len 512 \
-    --gradient_accumulation_steps 20 \
-    --num_image_embeds 3 \
-    --num_train_epochs 100 \
-    --patience 5
-```
-
-## Adversarial evaluation of model performances
-
-Here is an example on evaluating a model using adversarial evaluation of natural language inference with the Heuristic Analysis for NLI Systems (HANS) dataset [McCoy et al., 2019](https://arxiv.org/abs/1902.01007). The example was gracefully provided by [Nafise Sadat Moosavi](https://github.com/ns-moosavi).
-
-The HANS dataset can be downloaded from [this location](https://github.com/tommccoy1/hans).
-
-This is an example of using test_hans.py:
-
-```bash
-export HANS_DIR=path-to-hans
-export MODEL_TYPE=type-of-the-model-e.g.-bert-roberta-xlnet-etc
-export MODEL_PATH=path-to-the-model-directory-that-is-trained-on-NLI-e.g.-by-using-run_glue.py
-
-python examples/hans/test_hans.py \
-        --task_name hans \
-        --model_type $MODEL_TYPE \
-        --do_eval \
-        --do_lower_case \
-        --data_dir $HANS_DIR \
-        --model_name_or_path $MODEL_PATH \
-        --max_seq_length 128 \
-        --output_dir $MODEL_PATH \
-```
-
-This will create the hans_predictions.txt file in MODEL_PATH, which can then be evaluated using hans/evaluate_heur_output.py from the HANS dataset.
-
-The results of the BERT-base model that is trained on MNLI using batch size 8 and the random seed 42 on the HANS dataset is as follows:
-
-```bash
-Heuristic entailed results:
-lexical_overlap: 0.9702
-subsequence: 0.9942
-constituent: 0.9962
-
-Heuristic non-entailed results:
-lexical_overlap: 0.199
-subsequence: 0.0396
-constituent: 0.118
-```
+Feedback and more use cases and benchmarks involving TPUs are welcome; please share them with the community.
--- a/examples/adversarial/README.md
+++ b/examples/adversarial/README.md
@@ -0,0 +1,38 @@
+## Adversarial evaluation of model performances
+
+Here is an example of evaluating a model using adversarial evaluation of natural language inference with the Heuristic Analysis for NLI Systems (HANS) dataset [McCoy et al., 2019](https://arxiv.org/abs/1902.01007). The example was graciously provided by [Nafise Sadat Moosavi](https://github.com/ns-moosavi).
+
+The HANS dataset can be downloaded from [this location](https://github.com/tommccoy1/hans).
+
+This is an example of using test_hans.py:
+
+```bash
+export HANS_DIR=path-to-hans
+export MODEL_TYPE=type-of-the-model-e.g.-bert-roberta-xlnet-etc
+export MODEL_PATH=path-to-the-model-directory-that-is-trained-on-NLI-e.g.-by-using-run_glue.py
+
+python examples/adversarial/test_hans.py \
+        --task_name hans \
+        --model_type $MODEL_TYPE \
+        --do_eval \
+        --data_dir $HANS_DIR \
+        --model_name_or_path $MODEL_PATH \
+        --max_seq_length 128 \
+        --output_dir $MODEL_PATH
+```
+
+This will create the hans_predictions.txt file in MODEL_PATH, which can then be evaluated using hans/evaluate_heur_output.py from the HANS dataset.
+
+The results of the BERT-base model that is trained on MNLI using batch size 8 and the random seed 42 on the HANS dataset are as follows:
+
+```bash
+Heuristic entailed results:
+lexical_overlap: 0.9702
+subsequence: 0.9942
+constituent: 0.9962
+
+Heuristic non-entailed results:
+lexical_overlap: 0.199
+subsequence: 0.0396
+constituent: 0.118
+```
--- a/examples/adversarial/hans_processors.py
+++ b/examples/adversarial/hans_processors.py
--- a/examples/adversarial/test_hans.py
+++ b/examples/adversarial/test_hans.py
@@ -65,13 +65,6 @@ except ImportError:

 logger = logging.getLogger(__name__)

-ALL_MODELS = sum(
-    (
-        tuple(conf.pretrained_config_archive_map.keys())
-        for conf in (BertConfig, XLNetConfig, XLMConfig, RobertaConfig, DistilBertConfig)
-    ),
-    (),
-)

 MODEL_CLASSES = {
    "bert": (BertConfig, BertForSequenceClassification, BertTokenizer),
@@ -255,7 +248,7 @@ def evaluate(args, model, tokenizer, prefix=""):
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

        # multi-gpu eval
-        if args.n_gpu > 1:
+        if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
            model = torch.nn.DataParallel(model)

        # Eval!
@@ -342,8 +335,8 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
            max_length=args.max_seq_length,
            output_mode=output_mode,
            pad_on_left=bool(args.model_type in ["xlnet"]),  # pad on the left for xlnet
-            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
-            pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
+            pad_token=tokenizer.pad_token_id,
+            pad_token_segment_id=tokenizer.pad_token_type_id,
        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
@@ -389,7 +382,7 @@ def main():
        default=None,
        type=str,
        required=True,
-        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
+        help="Path to pretrained model or model identifier from huggingface.co/models",
    )
    parser.add_argument(
        "--task_name",
--- a/examples/adversarial/utils_hans.py
+++ b/examples/adversarial/utils_hans.py
--- a/examples/benchmarking/plot_csv_file.py
+++ b/examples/benchmarking/plot_csv_file.py
@@ -0,0 +1,113 @@
+import csv
+from collections import defaultdict
+from dataclasses import dataclass, field
+from typing import Optional
+
+import numpy as np
+
+import matplotlib.pyplot as plt
+from transformers import HfArgumentParser
+
+
+@dataclass
+class PlotArguments:
+    """
+    Command-line arguments selecting the benchmark result csv file to plot and how to plot it.
+    """
+
+    # Path of the csv file holding the results (required: it has no default).
+    csv_file: str = field(metadata={"help": "The csv file to plot."},)
+    # Selects the x axis: batch size when True, sequence length when False.
+    plot_along_batch: bool = field(
+        default=False,
+        metadata={"help": "Whether to plot along batch size or sequence length. Defaults to sequence length."},
+    )
+    # Selects the y axis label and title: time results when True, memory results when False.
+    is_time: bool = field(
+        default=False,
+        metadata={"help": "Whether the csv file has time results or memory results. Defaults to memory results."},
+    )
+    # Only affects the plot title: training results when True, inference results when False.
+    is_train: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether the csv file has training results or inference results. Defaults to inference results."
+        },
+    )
+    # Where to save the figure; when left to None the figure is shown instead of saved.
+    figure_png_file: Optional[str] = field(
+        default=None, metadata={"help": "Filename under which the plot will be saved. If unused no plot is saved."},
+    )
+
+
+class Plot:
+    """
+    Reads benchmark results from a csv file and draws them as a log-log scatter/line plot.
+
+    The csv file must provide the columns `model`, `batch_size`, `sequence_length` and `result`.
+    """
+
+    def __init__(self, args):
+        """
+        Load every row of `args.csv_file`, grouped by model name.
+
+        Args:
+            args: object exposing the `PlotArguments` fields (`csv_file`, `plot_along_batch`, `is_time`,
+                `is_train`, `figure_png_file`).
+        """
+        self.args = args
+        # model name -> batch sizes seen, sequence lengths seen and the raw (string) result keyed by
+        # (batch_size, sequence_length)
+        self.result_dict = defaultdict(lambda: dict(bsz=[], seq_len=[], result={}))
+
+        with open(self.args.csv_file, newline="") as csv_file:
+            reader = csv.DictReader(csv_file)
+            for row in reader:
+                model_name = row["model"]
+                batch_size = int(row["batch_size"])
+                sequence_length = int(row["sequence_length"])
+                self.result_dict[model_name]["bsz"].append(batch_size)
+                self.result_dict[model_name]["seq_len"].append(sequence_length)
+                self.result_dict[model_name]["result"][(batch_size, sequence_length)] = row["result"]
+
+    def plot(self):
+        """
+        Draw one curve per (model, inner loop value) pair, then save the figure to `args.figure_png_file`
+        or show it when no file name was given.
+
+        Raises:
+            KeyError: if a model has no result for one of its (batch_size, sequence_length) combinations.
+        """
+        _, ax = plt.subplots()
+        title_str = "Time usage" if self.args.is_time else "Memory usage"
+        title_str = title_str + " for training" if self.args.is_train else title_str + " for inference"
+
+        # The labels only depend on the arguments, so compute them once up front. This also keeps
+        # `x_axis_label` defined when the csv file holds no rows.
+        (x_axis_label, inner_loop_label) = (
+            ("batch_size", "sequence_length in #tokens")
+            if self.args.plot_along_batch
+            else ("sequence_length in #tokens", "batch_size")
+        )
+
+        for model_name in self.result_dict.keys():
+            batch_sizes = sorted(list(set(self.result_dict[model_name]["bsz"])))
+            sequence_lengths = sorted(list(set(self.result_dict[model_name]["seq_len"])))
+            results = self.result_dict[model_name]["result"]
+
+            (x_axis_values, inner_loop_array) = (
+                (batch_sizes, sequence_lengths) if self.args.plot_along_batch else (sequence_lengths, batch_sizes)
+            )
+
+            plt.xlim(min(x_axis_values), max(x_axis_values))
+            # builtin `int` instead of the `np.int` alias, which newer NumPy releases no longer provide
+            x_axis_array = np.asarray(x_axis_values, int)
+
+            for inner_loop_value in inner_loop_array:
+                # results are keyed by (batch_size, sequence_length) whatever the plotting direction
+                if self.args.plot_along_batch:
+                    result_keys = [(x, inner_loop_value) for x in x_axis_values]
+                else:
+                    result_keys = [(inner_loop_value, x) for x in x_axis_values]
+                # Parse as float in both directions: the results are read as strings and time results are
+                # not integers, so an integer dtype would raise ValueError on them.
+                y_axis_array = np.asarray([results[key] for key in result_keys], dtype=np.float32)
+
+                # NOTE(review): newer matplotlib releases name these keywords `base` instead of
+                # `basex`/`basey` - confirm against the matplotlib version this script is run with.
+                ax.set_xscale("log", basex=2)
+                ax.set_yscale("log", basey=10)
+
+                plt.scatter(x_axis_array, y_axis_array, label=f"{model_name} - {inner_loop_label}: {inner_loop_value}")
+                plt.plot(x_axis_array, y_axis_array, "--")
+
+        # "<title> modelA vs. modelB"; the title is left untouched when no model was read
+        model_names = list(self.result_dict.keys())
+        if model_names:
+            title_str += " " + " vs. ".join(model_names)
+        y_axis_label = "Time in s" if self.args.is_time else "Memory in MB"
+
+        # plot
+        plt.title(title_str)
+        plt.xlabel(x_axis_label)
+        plt.ylabel(y_axis_label)
+        plt.legend()
+
+        if self.args.figure_png_file is not None:
+            plt.savefig(self.args.figure_png_file)
+        else:
+            plt.show()
+
+
+def main():
+    """Parse the `PlotArguments` from the command line and draw the corresponding plot."""
+    # only the first element of the parse result is used
+    arguments = HfArgumentParser(PlotArguments).parse_args_into_dataclasses()[0]
+    Plot(args=arguments).plot()
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/benchmarking/run_benchmark.py
+++ b/examples/benchmarking/run_benchmark.py
@@ -0,0 +1,29 @@
+# coding=utf-8
+# Copyright 2018 The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Benchmarking the library on inference and training """
+
+from transformers import HfArgumentParser, PyTorchBenchmark, PyTorchBenchmarkArguments
+
+
+def main():
+    parser = HfArgumentParser(PyTorchBenchmarkArguments)
+    benchmark_args = parser.parse_args_into_dataclasses()[0]
+    benchmark = PyTorchBenchmark(args=benchmark_args)
+    benchmark.run()
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/benchmarks.py
+++ b/examples/benchmarks.py
@@ -1,664 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Benchmarking the library on inference and training """
-
-# If checking the tensors placement
-# tf.debugging.set_log_device_placement(True)
-
-import argparse
-import csv
-import timeit
-from time import time
-from typing import List
-
-from transformers import (
-    AutoConfig,
-    AutoTokenizer,
-    MemorySummary,
-    is_tf_available,
-    is_torch_available,
-    start_memory_tracing,
-    stop_memory_tracing,
-)
-
-
-if is_tf_available():
-    import tensorflow as tf
-    from transformers import TFAutoModel
-
-if is_torch_available():
-    import torch
-    from transformers import AutoModel
-
-
-input_text = """Bent over their instruments, three hundred Fertilizers were plunged, as
-the Director of Hatcheries and Conditioning entered the room, in the
-
-
-
-scarcely breathing silence, the absent-minded, soliloquizing hum or
-whistle, of absorbed concentration. A troop of newly arrived students,
-very young, pink and callow, followed nervously, rather abjectly, at the
-Director's heels. Each of them carried a notebook, in which, whenever
-the great man spoke, he desperately scribbled. Straight from the
-horse's mouth. It was a rare privilege. The D. H. C. for Central London
-always made a point of personally conducting his new students round
-the various departments.
-
-"Just to give you a general idea," he would explain to them. For of
-course some sort of general idea they must have, if they were to do
-their work intelligently-though as little of one, if they were to be good
-and happy members of society, as possible. For particulars, as every
-one knows, make for virtue and happiness; generalities are intellectu-
-ally necessary evils. Not philosophers but fret-sawyers and stamp col-
-lectors compose the backbone of society.
-
-"To-morrow," he would add, smiling at them with a slightly menacing
-geniality, "you'll be settling down to serious work. You won't have time
-for generalities. Meanwhile ..."
-
-Meanwhile, it was a privilege. Straight from the horse's mouth into the
-notebook. The boys scribbled like mad.
-
-Tall and rather thin but upright, the Director advanced into the room.
-He had a long chin and big rather prominent teeth, just covered, when
-he was not talking, by his full, floridly curved lips. Old, young? Thirty?
-Fifty? Fifty-five? It was hard to say. And anyhow the question didn't
-arise; in this year of stability, A. F. 632, it didn't occur to you to ask it.
-
-"I shall begin at the beginning," said the D.H.C. and the more zealous
-students recorded his intention in their notebooks: Begin at the begin-
-ning. "These," he waved his hand, "are the incubators." And opening
-an insulated door he showed them racks upon racks of numbered test-
-tubes. "The week's supply of ova. Kept," he explained, "at blood heat;
-whereas the male gametes," and here he opened another door, "they
-have to be kept at thirty-five instead of thirty-seven. Full blood heat
-sterilizes." Rams wrapped in theremogene beget no lambs.
-
-Still leaning against the incubators he gave them, while the pencils
-scurried illegibly across the pages, a brief description of the modern
-
-
-
-fertilizing process; spoke first, of course, of its surgical introduc-
-tion-"the operation undergone voluntarily for the good of Society, not
-to mention the fact that it carries a bonus amounting to six months'
-salary"; continued with some account of the technique for preserving
-the excised ovary alive and actively developing; passed on to a consid-
-eration of optimum temperature, salinity, viscosity; referred to the liq-
-uor in which the detached and ripened eggs were kept; and, leading
-his charges to the work tables, actually showed them how this liquor
-was drawn off from the test-tubes; how it was let out drop by drop
-onto the specially warmed slides of the microscopes; how the eggs
-which it contained were inspected for abnormalities, counted and
-transferred to a porous receptacle; how (and he now took them to
-watch the operation) this receptacle was immersed in a warm bouillon
-containing free-swimming spermatozoa-at a minimum concentration
-of one hundred thousand per cubic centimetre, he insisted; and how,
-after ten minutes, the container was lifted out of the liquor and its
-contents re-examined; how, if any of the eggs remained unfertilized, it
-was again immersed, and, if necessary, yet again; how the fertilized
-ova went back to the incubators; where the Alphas and Betas re-
-mained until definitely bottled; while the Gammas, Deltas and Epsilons
-were brought out again, after only thirty-six hours, to undergo Bo-
-kanovsky's Process.
-
-"Bokanovsky's Process," repeated the Director, and the students un-
-derlined the words in their little notebooks.
-
-One egg, one embryo, one adult-normality. But a bokanovskified egg
-will bud, will proliferate, will divide. From eight to ninety-six buds, and
-every bud will grow into a perfectly formed embryo, and every embryo
-into a full-sized adult. Making ninety-six human beings grow where
-only one grew before. Progress.
-
-"Essentially," the D.H.C. concluded, "bokanovskification consists of a
-series of arrests of development. We check the normal growth and,
-paradoxically enough, the egg responds by budding."
-
-Responds by budding. The pencils were busy.
-
-He pointed. On a very slowly moving band a rack-full of test-tubes was
-entering a large metal box, another, rack-full was emerging. Machinery
-faintly purred. It took eight minutes for the tubes to go through, he
-
-
-
-told them. Eight minutes of hard X-rays being about as much as an
-egg can stand. A few died; of the rest, the least susceptible divided
-into two; most put out four buds; some eight; all were returned to the
-incubators, where the buds began to develop; then, after two days,
-were suddenly chilled, chilled and checked. Two, four, eight, the buds
-in their turn budded; and having budded were dosed almost to death
-with alcohol; consequently burgeoned again and having budded-bud
-out of bud out of bud-were thereafter-further arrest being generally
-fatal-left to develop in peace. By which time the original egg was in a
-fair way to becoming anything from eight to ninety-six embryos- a
-prodigious improvement, you will agree, on nature. Identical twins-but
-not in piddling twos and threes as in the old viviparous days, when an
-egg would sometimes accidentally divide; actually by dozens, by
-scores at a time.
-
-"Scores," the Director repeated and flung out his arms, as though he
-were distributing largesse. "Scores."
-
-But one of the students was fool enough to ask where the advantage
-lay.
-
-"My good boy!" The Director wheeled sharply round on him. "Can't you
-see? Can't you see?" He raised a hand; his expression was solemn.
-"Bokanovsky's Process is one of the major instruments of social stabil-
-ity!"
-
-Major instruments of social stability.
-
-Standard men and women; in uniform batches. The whole of a small
-factory staffed with the products of a single bokanovskified egg.
-
-"Ninety-six identical twins working ninety-six identical machines!" The
-voice was almost tremulous with enthusiasm. "You really know where
-you are. For the first time in history." He quoted the planetary motto.
-"Community, Identity, Stability." Grand words. "If we could bo-
-kanovskify indefinitely the whole problem would be solved."
-
-Solved by standard Gammas, unvarying Deltas, uniform Epsilons. Mil-
-lions of identical twins. The principle of mass production at last applied
-to biology.
-
-
-
-"But, alas," the Director shook his head, "we can't bokanovskify indefi-
-nitely."
-
-Ninety-six seemed to be the limit; seventy-two a good average. From
-the same ovary and with gametes of the same male to manufacture as
-many batches of identical twins as possible-that was the best (sadly a
-second best) that they could do. And even that was difficult.
-
-"For in nature it takes thirty years for two hundred eggs to reach ma-
-turity. But our business is to stabilize the population at this moment,
-here and now. Dribbling out twins over a quarter of a century-what
-would be the use of that?"
-
-Obviously, no use at all. But Podsnap's Technique had immensely ac-
-celerated the process of ripening. They could make sure of at least a
-hundred and fifty mature eggs within two years. Fertilize and bo-
-kanovskify-in other words, multiply by seventy-two-and you get an
-average of nearly eleven thousand brothers and sisters in a hundred
-and fifty batches of identical twins, all within two years of the same
-age.
-
-"And in exceptional cases we can make one ovary yield us over fifteen
-thousand adult individuals."
-
-Beckoning to a fair-haired, ruddy young man who happened to be
-passing at the moment. "Mr. Foster," he called. The ruddy young man
-approached. "Can you tell us the record for a single ovary, Mr. Foster?"
-
-"Sixteen thousand and twelve in this Centre," Mr. Foster replied with-
-out hesitation. He spoke very quickly, had a vivacious blue eye, and
-took an evident pleasure in quoting figures. "Sixteen thousand and
-twelve; in one hundred and eighty-nine batches of identicals. But of
-course they've done much better," he rattled on, "in some of the tropi-
-cal Centres. Singapore has often produced over sixteen thousand five
-hundred; and Mombasa has actually touched the seventeen thousand
-mark. But then they have unfair advantages. You should see the way a
-negro ovary responds to pituitary! It's quite astonishing, when you're
-used to working with European material. Still," he added, with a laugh
-(but the light of combat was in his eyes and the lift of his chin was
-challenging), "still, we mean to beat them if we can. I'm working on a
-wonderful Delta-Minus ovary at this moment. Only just eighteen
-
-
-
-months old. Over twelve thousand seven hundred children already, ei-
-ther decanted or in embryo. And still going strong. We'll beat them
-yet."
-
-"That's the spirit I like!" cried the Director, and clapped Mr. Foster on
-the shoulder. "Come along with us, and give these boys the benefit of
-your expert knowledge."
-
-Mr. Foster smiled modestly. "With pleasure." They went.
-In the Bottling Room all was harmonious bustle and ordered activity.
-Flaps of fresh sow's peritoneum ready cut to the proper size came
-shooting up in little lifts from the Organ Store in the sub-basement.
-Whizz and then, click! the lift-hatches hew open; the bottle-liner had
-only to reach out a hand, take the flap, insert, smooth-down, and be-
-fore the lined bottle had had time to travel out of reach along the end-
-less band, whizz, click! another flap of peritoneum had shot up from
-the depths, ready to be slipped into yet another bottle, the next of that
-slow interminable procession on the band.
-
-Next to the Liners stood the Matriculators. The procession advanced;
-one by one the eggs were transferred from their test-tubes to the
-larger containers; deftly the peritoneal lining was slit, the morula
-dropped into place, the saline solution poured in ... and already the
-bottle had passed, and it was the turn of the labellers. Heredity, date
-of fertilization, membership of Bokanovsky Group-details were trans-
-ferred from test-tube to bottle. No longer anonymous, but named,
-identified, the procession marched slowly on; on through an opening in
-the wall, slowly on into the Social Predestination Room.
-"Eighty-eight cubic metres of card-index," said Mr. Foster with relish,
-as they entered."""
-
-
-def create_setup_and_compute(
-    model_names: List[str],
-    batch_sizes: List[int],
-    slice_sizes: List[int],
-    gpu: bool = True,
-    tensorflow: bool = False,
-    average_over: int = 3,
-    no_speed: bool = False,
-    no_memory: bool = False,
-    verbose: bool = False,
-    torchscript: bool = False,
-    xla: bool = False,
-    amp: bool = False,
-    fp16: bool = False,
-    save_to_csv: bool = False,
-    csv_filename: str = f"results_{round(time())}.csv",
-    csv_memory_filename: str = f"memory_{round(time())}.csv",
-):
-    if xla:
-        tf.config.optimizer.set_jit(True)
-    if amp:
-        tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})
-
-    if tensorflow:
-        dictionary = {model_name: {} for model_name in model_names}
-        results = _compute_tensorflow(
-            model_names, batch_sizes, slice_sizes, dictionary, average_over, amp, no_speed, no_memory, verbose
-        )
-    else:
-        device = "cuda" if (gpu and torch.cuda.is_available()) else "cpu"
-        dictionary = {model_name: {} for model_name in model_names}
-        results = _compute_pytorch(
-            model_names,
-            batch_sizes,
-            slice_sizes,
-            dictionary,
-            average_over,
-            device,
-            torchscript,
-            fp16,
-            no_speed,
-            no_memory,
-            verbose,
-        )
-
-    print("=========== RESULTS ===========")
-    for model_name in model_names:
-        print("\t" + f"======= MODEL CHECKPOINT: {model_name} =======")
-        for batch_size in results[model_name]["bs"]:
-            print("\t\t" + f"===== BATCH SIZE: {batch_size} =====")
-            for slice_size in results[model_name]["ss"]:
-                result = results[model_name]["results"][batch_size][slice_size]
-                memory = results[model_name]["memory"][batch_size][slice_size]
-                if isinstance(result, str):
-                    print(f"\t\t{model_name}/{batch_size}/{slice_size}: " f"{result} " f"{memory}")
-                else:
-                    print(
-                        f"\t\t{model_name}/{batch_size}/{slice_size}: "
-                        f"{(round(1000 * result) / 1000)}"
-                        f"s "
-                        f"{memory}"
-                    )
-
-    if save_to_csv:
-        with open(csv_filename, mode="w") as csv_file, open(csv_memory_filename, mode="w") as csv_memory_file:
-            fieldnames = [
-                "model",
-                "1x8",
-                "1x64",
-                "1x128",
-                "1x256",
-                "1x512",
-                "1x1024",
-                "2x8",
-                "2x64",
-                "2x128",
-                "2x256",
-                "2x512",
-                "2x1024",
-                "4x8",
-                "4x64",
-                "4x128",
-                "4x256",
-                "4x512",
-                "4x1024",
-                "8x8",
-                "8x64",
-                "8x128",
-                "8x256",
-                "8x512",
-                "8x1024",
-            ]
-
-            writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
-            writer.writeheader()
-            memory_writer = csv.DictWriter(csv_memory_file, fieldnames=fieldnames)
-            memory_writer.writeheader()
-
-            for model_name in model_names:
-                model_results = {
-                    f"{bs}x{ss}": results[model_name]["results"][bs][ss]
-                    for bs in results[model_name]["results"]
-                    for ss in results[model_name]["results"][bs]
-                }
-                writer.writerow({"model": model_name, **model_results})
-
-                model_memory_results = {
-                    f"{bs}x{ss}": results[model_name]["memory"][bs][ss]
-                    for bs in results[model_name]["memory"]
-                    for ss in results[model_name]["memory"][bs]
-                }
-                memory_writer.writerow({"model": model_name, **model_memory_results})
-
-
-def print_summary_statistics(summary: MemorySummary):
-    print(
-        "\nLines by line memory consumption:\n"
-        + "\n".join(
-            f"{state.frame.filename}:{state.frame.line_number}: mem {state.cpu_gpu}: {state.frame.line_text}"
-            for state in summary.sequential
-        )
-    )
-    print(
-        "\nLines with top memory consumption:\n"
-        + "\n".join(
-            f"=> {state.frame.filename}:{state.frame.line_number}: mem {state.cpu_gpu}: {state.frame.line_text}"
-            for state in summary.cumulative[:6]
-        )
-    )
-    print(
-        "\nLines with lowest memory consumption:\n"
-        + "\n".join(
-            f"=> {state.frame.filename}:{state.frame.line_number}: mem {state.cpu_gpu}: {state.frame.line_text}"
-            for state in summary.cumulative[-6:]
-        )
-    )
-    print(f"\nTotal memory increase: {summary.total}")
-
-
-def _compute_pytorch(
-    model_names,
-    batch_sizes,
-    slice_sizes,
-    dictionary,
-    average_over,
-    device,
-    torchscript,
-    fp16,
-    no_speed,
-    no_memory,
-    verbose,
-):
-    for c, model_name in enumerate(model_names):
-        print(f"{c + 1} / {len(model_names)}")
-        config = AutoConfig.from_pretrained(model_name, torchscript=torchscript)
-        model = AutoModel.from_pretrained(model_name, config=config)
-        tokenizer = AutoTokenizer.from_pretrained(model_name)
-
-        tokenized_sequence = tokenizer.encode(input_text, add_special_tokens=False)
-
-        max_input_size = tokenizer.max_model_input_sizes[model_name]
-
-        dictionary[model_name] = {"bs": batch_sizes, "ss": slice_sizes, "results": {}, "memory": {}}
-        dictionary[model_name]["results"] = {i: {} for i in batch_sizes}
-        dictionary[model_name]["memory"] = {i: {} for i in batch_sizes}
-
-        for batch_size in batch_sizes:
-            if fp16:
-                model.half()
-            model.to(device)
-            model.eval()
-
-            for slice_size in slice_sizes:
-                if max_input_size is not None and slice_size > max_input_size:
-                    dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
-                else:
-                    sequence = torch.tensor(tokenized_sequence[:slice_size], device=device).repeat(batch_size, 1)
-                    try:
-                        if torchscript:
-                            print("Tracing model with sequence size", sequence.shape)
-                            inference = torch.jit.trace(model, sequence)
-                            inference(sequence)
-                        else:
-                            inference = model
-                            inference(sequence)
-
-                        if not no_memory:
-                            # model.add_memory_hooks()  # Forward method tracing (only for PyTorch models)
-
-                            # Line by line memory tracing (all code in the module `transformers`) works for all models/arbitrary code
-                            trace = start_memory_tracing("transformers")
-                            inference(sequence)
-                            summary = stop_memory_tracing(trace)
-
-                            if verbose:
-                                print_summary_statistics(summary)
-
-                            dictionary[model_name]["memory"][batch_size][slice_size] = str(summary.total)
-                        else:
-                            dictionary[model_name]["memory"][batch_size][slice_size] = "N/A"
-
-                        if not no_speed:
-                            print("Going through model with sequence of shape", sequence.shape)
-                            runtimes = timeit.repeat(lambda: inference(sequence), repeat=average_over, number=3)
-                            average_time = sum(runtimes) / float(len(runtimes)) / 3.0
-                            dictionary[model_name]["results"][batch_size][slice_size] = average_time
-                        else:
-                            dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
-
-                    except RuntimeError as e:
-                        print("Doesn't fit on GPU.", e)
-                        torch.cuda.empty_cache()
-                        dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
-                        dictionary[model_name]["memory"][batch_size][slice_size] = "N/A"
-    return dictionary
-
-
-def _compute_tensorflow(
-    model_names, batch_sizes, slice_sizes, dictionary, average_over, amp, no_speed, no_memory, verbose
-):
-    for c, model_name in enumerate(model_names):
-        print(f"{c + 1} / {len(model_names)}")
-        config = AutoConfig.from_pretrained(model_name)
-        model = TFAutoModel.from_pretrained(model_name, config=config)
-        tokenizer = AutoTokenizer.from_pretrained(model_name)
-
-        tokenized_sequence = tokenizer.encode(input_text, add_special_tokens=False)
-
-        max_input_size = tokenizer.max_model_input_sizes[model_name]
-
-        dictionary[model_name] = {"bs": batch_sizes, "ss": slice_sizes, "results": {}, "memory": {}}
-        dictionary[model_name]["results"] = {i: {} for i in batch_sizes}
-        dictionary[model_name]["memory"] = {i: {} for i in batch_sizes}
-
-        print("Using model", model)
-
-        @tf.function
-        def inference(inputs):
-            return model(inputs)
-
-        for batch_size in batch_sizes:
-            for slice_size in slice_sizes:
-                if max_input_size is not None and slice_size > max_input_size:
-                    dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
-                else:
-                    sequence = tf.stack(
-                        [tf.squeeze(tf.constant(tokenized_sequence[:slice_size])[None, :])] * batch_size
-                    )
-
-                    try:
-                        print("Going through model with sequence of shape", sequence.shape)
-                        # To make sure that the model is traced + that the tensors are on the appropriate device
-                        inference(sequence)
-
-                        if not no_memory:
-                            # Line by line memory tracing (all code in the module `transformers`) works for all models/arbitrary code
-                            trace = start_memory_tracing("transformers")
-                            inference(sequence)
-                            summary = stop_memory_tracing(trace)
-
-                            if verbose:
-                                print_summary_statistics(summary)
-
-                            dictionary[model_name]["memory"][batch_size][slice_size] = str(summary.total)
-                        else:
-                            dictionary[model_name]["memory"][batch_size][slice_size] = "N/A"
-
-                        if not no_speed:
-                            runtimes = timeit.repeat(lambda: inference(sequence), repeat=average_over, number=3)
-                            average_time = sum(runtimes) / float(len(runtimes)) / 3.0
-                            dictionary[model_name]["results"][batch_size][slice_size] = average_time
-                        else:
-                            dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
-
-                    except tf.errors.ResourceExhaustedError as e:
-                        print("Doesn't fit on GPU.", e)
-                        dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
-                        dictionary[model_name]["memory"][batch_size][slice_size] = "N/A"
-    return dictionary
-
-
-def main():
-    parser = argparse.ArgumentParser()
-
-    parser.add_argument(
-        "--models",
-        required=False,
-        type=str,
-        default="all",
-        help="Model checkpoints to be provided "
-        "to the AutoModel classes. Leave "
-        "blank to benchmark the base version "
-        "of all available model "
-        "architectures.",
-    )
-    parser.add_argument("--verbose", required=False, action="store_true", help="Verbose memory tracing")
-    parser.add_argument("--no_speed", required=False, action="store_true", help="Don't perform speed measurments")
-    parser.add_argument("--no_memory", required=False, action="store_true", help="Don't perform memory measurments")
-    parser.add_argument(
-        "--torch", required=False, action="store_true", help="Benchmark the Pytorch version of the " "models"
-    )
-    parser.add_argument(
-        "--torch_cuda", required=False, action="store_true", help="Pytorch only: run on available " "cuda devices"
-    )
-    parser.add_argument(
-        "--torchscript",
-        required=False,
-        action="store_true",
-        help="Pytorch only: trace the models " "using torchscript",
-    )
-    parser.add_argument(
-        "--tensorflow",
-        required=False,
-        action="store_true",
-        help="Benchmark the TensorFlow version "
-        "of the models. Will run on GPU if "
-        "the correct dependencies are "
-        "installed",
-    )
-    parser.add_argument("--xla", required=False, action="store_true", help="TensorFlow only: use XLA acceleration.")
-    parser.add_argument(
-        "--amp",
-        required=False,
-        action="store_true",
-        help="TensorFlow only: use automatic mixed precision acceleration.",
-    )
-    parser.add_argument(
-        "--fp16", required=False, action="store_true", help="PyTorch only: use FP16 to accelerate inference."
-    )
-    parser.add_argument(
-        "--keras_predict",
-        required=False,
-        action="store_true",
-        help="Whether to use model.predict " "instead of model() to do a " "forward pass.",
-    )
-    parser.add_argument("--save_to_csv", required=False, action="store_true", help="Save to a CSV file.")
-    parser.add_argument(
-        "--csv_filename", required=False, default=None, help="CSV filename used if saving results to csv."
-    )
-    parser.add_argument(
-        "--average_over", required=False, default=30, type=int, help="Times an experiment will be run."
-    )
-    parser.add_argument("--batch_sizes", nargs="+", type=int, default=[1, 2, 4, 8])
-    parser.add_argument("--slice_sizes", nargs="+", type=int, default=[8, 64, 128, 256, 512, 1024])
-
-    args = parser.parse_args()
-    if args.models == "all":
-        args.models = [
-            "gpt2",
-            "bert-base-cased",
-            "xlnet-base-cased",
-            "xlm-mlm-en-2048",
-            "transfo-xl-wt103",
-            "openai-gpt",
-            "distilbert-base-uncased",
-            "distilgpt2",
-            "roberta-base",
-            "ctrl",
-        ]
-    else:
-        args.models = args.models.split()
-
-    print("Running with arguments", args)
-
-    if args.torch:
-        if is_torch_available():
-            create_setup_and_compute(
-                model_names=args.models,
-                batch_sizes=args.batch_sizes,
-                slice_sizes=args.slice_sizes,
-                tensorflow=False,
-                gpu=args.torch_cuda,
-                torchscript=args.torchscript,
-                fp16=args.fp16,
-                save_to_csv=args.save_to_csv,
-                csv_filename=args.csv_filename,
-                average_over=args.average_over,
-                no_speed=args.no_speed,
-                no_memory=args.no_memory,
-                verbose=args.verbose,
-            )
-        else:
-            raise ImportError("Trying to run a PyTorch benchmark but PyTorch was not found in the environment.")
-
-    if args.tensorflow:
-        if is_tf_available():
-            create_setup_and_compute(
-                model_names=args.models,
-                batch_sizes=args.batch_sizes,
-                slice_sizes=args.slice_sizes,
-                tensorflow=True,
-                xla=args.xla,
-                amp=args.amp,
-                save_to_csv=args.save_to_csv,
-                csv_filename=args.csv_filename,
-                average_over=args.average_over,
-                no_speed=args.no_speed,
-                no_memory=args.no_memory,
-                verbose=args.verbose,
-            )
-        else:
-            raise ImportError("Trying to run a TensorFlow benchmark but TensorFlow was not found in the environment.")
-
-
-if __name__ == "__main__":
-    main()
--- a/examples/bertology/run_bertology.py
+++ b/examples/bertology/run_bertology.py
@@ -30,10 +30,17 @@ from torch.utils.data import DataLoader, SequentialSampler, Subset
 from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm

-from run_glue import ALL_MODELS, MODEL_CLASSES, load_and_cache_examples, set_seed
-from transformers import glue_compute_metrics as compute_metrics
-from transformers import glue_output_modes as output_modes
-from transformers import glue_processors as processors
+from transformers import (
+    AutoConfig,
+    AutoModelForSequenceClassification,
+    AutoTokenizer,
+    DefaultDataCollator,
+    GlueDataset,
+    glue_compute_metrics,
+    glue_output_modes,
+    glue_processors,
+    set_seed,
+)


 logger = logging.getLogger(__name__)
@@ -57,32 +64,35 @@ def print_2d_tensor(tensor):


 def compute_heads_importance(
-    args, model, eval_dataloader, compute_entropy=True, compute_importance=True, head_mask=None
+    args, model, eval_dataloader, compute_entropy=True, compute_importance=True, head_mask=None, actually_pruned=False
 ):
    """ This method shows how to compute:
        - head attention entropy
        - head importance scores according to http://arxiv.org/abs/1905.10650
    """
    # Prepare our tensors
-    n_layers, n_heads = model.bert.config.num_hidden_layers, model.bert.config.num_attention_heads
+    n_layers, n_heads = model.config.num_hidden_layers, model.config.num_attention_heads
    head_importance = torch.zeros(n_layers, n_heads).to(args.device)
    attn_entropy = torch.zeros(n_layers, n_heads).to(args.device)

    if head_mask is None:
        head_mask = torch.ones(n_layers, n_heads).to(args.device)
+
    head_mask.requires_grad_(requires_grad=True)
+    # If the attention heads were actually pruned, set head_mask to None to avoid a shape mismatch
+    if actually_pruned:
+        head_mask = None
+
    preds = None
    labels = None
    tot_tokens = 0.0

-    for step, batch in enumerate(tqdm(eval_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
-        batch = tuple(t.to(args.device) for t in batch)
-        input_ids, input_mask, segment_ids, label_ids = batch
+    for step, inputs in enumerate(tqdm(eval_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
+        for k, v in inputs.items():
+            inputs[k] = v.to(args.device)

        # Do a forward pass (not with torch.no_grad() since we need gradients for importance score - see below)
-        outputs = model(
-            input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids, head_mask=head_mask
-        )
+        outputs = model(**inputs, head_mask=head_mask)
        loss, logits, all_attentions = (
            outputs[0],
            outputs[1],
@@ -92,7 +102,7 @@ def compute_heads_importance(

        if compute_entropy:
            for layer, attn in enumerate(all_attentions):
-                masked_entropy = entropy(attn.detach()) * input_mask.float().unsqueeze(1)
+                masked_entropy = entropy(attn.detach()) * inputs["attention_mask"].float().unsqueeze(1)
                attn_entropy[layer] += masked_entropy.sum(-1).sum(0).detach()

        if compute_importance:
@@ -101,12 +111,12 @@ def compute_heads_importance(
        # Also store our logits/labels if we want to compute metrics afterwards
        if preds is None:
            preds = logits.detach().cpu().numpy()
-            labels = label_ids.detach().cpu().numpy()
+            labels = inputs["labels"].detach().cpu().numpy()
        else:
            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
-            labels = np.append(labels, label_ids.detach().cpu().numpy(), axis=0)
+            labels = np.append(labels, inputs["labels"].detach().cpu().numpy(), axis=0)

-        tot_tokens += input_mask.float().detach().sum().data
+        tot_tokens += inputs["attention_mask"].float().detach().sum().data

    # Normalize
    attn_entropy /= tot_tokens
@@ -145,7 +155,7 @@ def mask_heads(args, model, eval_dataloader):
    """
    _, head_importance, preds, labels = compute_heads_importance(args, model, eval_dataloader, compute_entropy=False)
    preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
-    original_score = compute_metrics(args.task_name, preds, labels)[args.metric_name]
+    original_score = glue_compute_metrics(args.task_name, preds, labels)[args.metric_name]
    logger.info("Pruning: original score: %f, threshold: %f", original_score, original_score * args.masking_threshold)

    new_head_mask = torch.ones_like(head_importance)
@@ -167,6 +177,7 @@ def mask_heads(args, model, eval_dataloader):
        new_head_mask = new_head_mask.view(-1)
        new_head_mask[current_heads_to_mask] = 0.0
        new_head_mask = new_head_mask.view_as(head_mask)
+        new_head_mask = new_head_mask.clone().detach()
        print_2d_tensor(new_head_mask)

        # Compute metric and head importance again
@@ -174,9 +185,9 @@ def mask_heads(args, model, eval_dataloader):
            args, model, eval_dataloader, compute_entropy=False, head_mask=new_head_mask
        )
        preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
-        current_score = compute_metrics(args.task_name, preds, labels)[args.metric_name]
+        current_score = glue_compute_metrics(args.task_name, preds, labels)[args.metric_name]
        logger.info(
-            "Masking: current score: %f, remaning heads %d (%.1f percents)",
+            "Masking: current score: %f, remaining heads %d (%.1f percents)",
            current_score,
            new_head_mask.sum(),
            new_head_mask.sum() / new_head_mask.numel() * 100,
@@ -200,21 +211,30 @@ def prune_heads(args, model, eval_dataloader, head_mask):
        args, model, eval_dataloader, compute_entropy=False, compute_importance=False, head_mask=head_mask
    )
    preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
-    score_masking = compute_metrics(args.task_name, preds, labels)[args.metric_name]
+    score_masking = glue_compute_metrics(args.task_name, preds, labels)[args.metric_name]
    original_time = datetime.now() - before_time

    original_num_params = sum(p.numel() for p in model.parameters())
-    heads_to_prune = dict((layer, (1 - head_mask[layer].long()).nonzero().tolist()) for layer in range(len(head_mask)))
+    heads_to_prune = dict(
+        (layer, (1 - head_mask[layer].long()).nonzero().squeeze().tolist()) for layer in range(len(head_mask))
+    )
+
    assert sum(len(h) for h in heads_to_prune.values()) == (1 - head_mask.long()).sum().item()
    model.prune_heads(heads_to_prune)
    pruned_num_params = sum(p.numel() for p in model.parameters())

    before_time = datetime.now()
    _, _, preds, labels = compute_heads_importance(
-        args, model, eval_dataloader, compute_entropy=False, compute_importance=False, head_mask=None
+        args,
+        model,
+        eval_dataloader,
+        compute_entropy=False,
+        compute_importance=False,
+        head_mask=None,
+        actually_pruned=True,
    )
    preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
-    score_pruning = compute_metrics(args.task_name, preds, labels)[args.metric_name]
+    score_pruning = glue_compute_metrics(args.task_name, preds, labels)[args.metric_name]
    new_time = datetime.now() - before_time

    logger.info(
@@ -242,14 +262,14 @@ def main():
        default=None,
        type=str,
        required=True,
-        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
+        help="Path to pretrained model or model identifier from huggingface.co/models",
    )
    parser.add_argument(
        "--task_name",
        default=None,
        type=str,
        required=True,
-        help="The name of the task to train selected in the list: " + ", ".join(processors.keys()),
+        help="The name of the task to train selected in the list: " + ", ".join(glue_processors.keys()),
    )
    parser.add_argument(
        "--output_dir",
@@ -274,7 +294,7 @@ def main():
    )
    parser.add_argument(
        "--cache_dir",
-        default="",
+        default=None,
        type=str,
        help="Where do you want to store the pre-trained models downloaded from s3",
    )
@@ -350,48 +370,40 @@ def main():
    logger.info("device: {} n_gpu: {}, distributed: {}".format(args.device, args.n_gpu, bool(args.local_rank != -1)))

    # Set seeds
-    set_seed(args)
+    set_seed(args.seed)

    # Prepare GLUE task
    args.task_name = args.task_name.lower()
-    if args.task_name not in processors:
+    if args.task_name not in glue_processors:
        raise ValueError("Task not found: %s" % (args.task_name))
-    processor = processors[args.task_name]()
-    args.output_mode = output_modes[args.task_name]
+    processor = glue_processors[args.task_name]()
+    args.output_mode = glue_output_modes[args.task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
-    if args.local_rank not in [-1, 0]:
-        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
+    #
+    # Distributed training:
+    # The .from_pretrained methods guarantee that only one local process can concurrently
+    # download model & vocab.

-    args.model_type = ""
-    for key in MODEL_CLASSES:
-        if key in args.model_name_or_path.lower():
-            args.model_type = key  # take the first match in model types
-            break
-    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
-    config = config_class.from_pretrained(
+    config = AutoConfig.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=args.task_name,
        output_attentions=True,
-        cache_dir=args.cache_dir if args.cache_dir else None,
+        cache_dir=args.cache_dir,
    )
-    tokenizer = tokenizer_class.from_pretrained(
-        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
-        cache_dir=args.cache_dir if args.cache_dir else None,
+    tokenizer = AutoTokenizer.from_pretrained(
+        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, cache_dir=args.cache_dir,
    )
-    model = model_class.from_pretrained(
+    model = AutoModelForSequenceClassification.from_pretrained(
        args.model_name_or_path,
        from_tf=bool(".ckpt" in args.model_name_or_path),
        config=config,
-        cache_dir=args.cache_dir if args.cache_dir else None,
+        cache_dir=args.cache_dir,
    )

-    if args.local_rank == 0:
-        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
-
    # Distributed and parallel training
    model.to(args.device)
    if args.local_rank != -1:
@@ -402,15 +414,18 @@ def main():
        model = torch.nn.DataParallel(model)

    # Print/save training arguments
+    os.makedirs(args.output_dir, exist_ok=True)
    torch.save(args, os.path.join(args.output_dir, "run_args.bin"))
    logger.info("Training/evaluation parameters %s", args)

    # Prepare dataset for the GLUE task
-    eval_data = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=True)
+    eval_dataset = GlueDataset(args, tokenizer=tokenizer, mode="dev")
    if args.data_subset > 0:
-        eval_data = Subset(eval_data, list(range(min(args.data_subset, len(eval_data)))))
-    eval_sampler = SequentialSampler(eval_data) if args.local_rank == -1 else DistributedSampler(eval_data)
-    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size)
+        eval_dataset = Subset(eval_dataset, list(range(min(args.data_subset, len(eval_dataset)))))
+    eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
+    eval_dataloader = DataLoader(
+        eval_dataset, sampler=eval_sampler, batch_size=args.batch_size, collate_fn=DefaultDataCollator().collate_batch
+    )

    # Compute head entropy and importance score
    compute_heads_importance(args, model, eval_dataloader)
--- a/examples/contrib/mm-imdb/README.md
+++ b/examples/contrib/mm-imdb/README.md
@@ -0,0 +1,23 @@
+## MM-IMDb
+
+Based on the script [`run_mmimdb.py`](https://github.com/huggingface/transformers/blob/master/examples/contrib/mm-imdb/run_mmimdb.py).
+
+[MM-IMDb](http://lisi1.unal.edu.co/mmimdb/) is a multimodal dataset of around 26,000 movies, including images, plots and other metadata.
+
+### Training on MM-IMDb
+
+```
+python run_mmimdb.py \
+    --data_dir /path/to/mmimdb/dataset/ \
+    --model_type bert \
+    --model_name_or_path bert-base-uncased \
+    --output_dir /path/to/save/dir/ \
+    --do_train \
+    --do_eval \
+    --max_seq_len 512 \
+    --gradient_accumulation_steps 20 \
+    --num_image_embeds 3 \
+    --num_train_epochs 100 \
+    --patience 5
+```
+
--- a/examples/contrib/mm-imdb/run_mmimdb.py
+++ b/examples/contrib/mm-imdb/run_mmimdb.py
@@ -34,26 +34,11 @@ from tqdm import tqdm, trange
 from transformers import (
    WEIGHTS_NAME,
    AdamW,
-    AlbertConfig,
-    AlbertModel,
-    AlbertTokenizer,
-    BertConfig,
-    BertModel,
-    BertTokenizer,
-    DistilBertConfig,
-    DistilBertModel,
-    DistilBertTokenizer,
+    AutoConfig,
+    AutoModel,
+    AutoTokenizer,
    MMBTConfig,
    MMBTForClassification,
-    RobertaConfig,
-    RobertaModel,
-    RobertaTokenizer,
-    XLMConfig,
-    XLMModel,
-    XLMTokenizer,
-    XLNetConfig,
-    XLNetModel,
-    XLNetTokenizer,
    get_linear_schedule_with_warmup,
 )
 from utils_mmimdb import ImageEncoder, JsonlDataset, collate_fn, get_image_transforms, get_mmimdb_labels
@@ -67,23 +52,6 @@ except ImportError:

 logger = logging.getLogger(__name__)

-ALL_MODELS = sum(
-    (
-        tuple(conf.pretrained_config_archive_map.keys())
-        for conf in (BertConfig, XLNetConfig, XLMConfig, RobertaConfig, DistilBertConfig)
-    ),
-    (),
-)
-
-MODEL_CLASSES = {
-    "bert": (BertConfig, BertModel, BertTokenizer),
-    "xlnet": (XLNetConfig, XLNetModel, XLNetTokenizer),
-    "xlm": (XLMConfig, XLMModel, XLMTokenizer),
-    "roberta": (RobertaConfig, RobertaModel, RobertaTokenizer),
-    "distilbert": (DistilBertConfig, DistilBertModel, DistilBertTokenizer),
-    "albert": (AlbertConfig, AlbertModel, AlbertTokenizer),
-}
-

 def set_seed(args):
    random.seed(args.seed)
@@ -278,7 +246,7 @@ def evaluate(args, model, tokenizer, criterion, prefix=""):
    )

    # multi-gpu eval
-    if args.n_gpu > 1:
+    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
@@ -351,19 +319,12 @@ def main():
        required=True,
        help="The input data dir. Should contain the .jsonl files for MMIMDB.",
    )
-    parser.add_argument(
-        "--model_type",
-        default=None,
-        type=str,
-        required=True,
-        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
-    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
-        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
+        help="Path to pretrained model or model identifier from huggingface.co/models",
    )
    parser.add_argument(
        "--output_dir",
@@ -385,7 +346,7 @@ def main():
    )
    parser.add_argument(
        "--cache_dir",
-        default="",
+        default=None,
        type=str,
        help="Where do you want to store the pre-trained models downloaded from s3",
    )
@@ -526,18 +487,14 @@ def main():
    # Setup model
    labels = get_mmimdb_labels()
    num_labels = len(labels)
-    args.model_type = args.model_type.lower()
-    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
-    transformer_config = config_class.from_pretrained(
-        args.config_name if args.config_name else args.model_name_or_path
-    )
-    tokenizer = tokenizer_class.from_pretrained(
+    transformer_config = AutoConfig.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
+    tokenizer = AutoTokenizer.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
-        cache_dir=args.cache_dir if args.cache_dir else None,
+        cache_dir=args.cache_dir,
    )
-    transformer = model_class.from_pretrained(
-        args.model_name_or_path, config=transformer_config, cache_dir=args.cache_dir if args.cache_dir else None
+    transformer = AutoModel.from_pretrained(
+        args.model_name_or_path, config=transformer_config, cache_dir=args.cache_dir
    )
    img_encoder = ImageEncoder(args)
    config = MMBTConfig(transformer_config, num_labels=num_labels)
@@ -583,13 +540,12 @@ def main():
        # Load a trained model and vocabulary that you have fine-tuned
        model = MMBTForClassification(config, transformer, img_encoder)
        model.load_state_dict(torch.load(os.path.join(args.output_dir, WEIGHTS_NAME)))
-        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
+        tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
        model.to(args.device)

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
-        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
--- a/examples/contrib/mm-imdb/utils_mmimdb.py
+++ b/examples/contrib/mm-imdb/utils_mmimdb.py
--- a/examples/contrib/run_swag.py
+++ b/examples/contrib/run_swag.py
@@ -31,14 +31,8 @@ from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, Tenso
 from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm, trange

-from transformers import (
-    WEIGHTS_NAME,
-    AdamW,
-    BertConfig,
-    BertForMultipleChoice,
-    BertTokenizer,
-    get_linear_schedule_with_warmup,
-)
+from transformers import WEIGHTS_NAME, AdamW, AutoConfig, AutoTokenizer, get_linear_schedule_with_warmup
+from transformers.modeling_auto import AutoModelForMultipleChoice


 try:
@@ -49,12 +43,6 @@ except ImportError:

 logger = logging.getLogger(__name__)

-ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in [BertConfig]), ())
-
-MODEL_CLASSES = {
-    "bert": (BertConfig, BertForMultipleChoice, BertTokenizer),
-}
-

 class SwagExample(object):
    """A single training/test example for the SWAG dataset."""
@@ -492,19 +480,12 @@ def main():
        required=True,
        help="SWAG csv for predictions. E.g., val.csv or test.csv",
    )
-    parser.add_argument(
-        "--model_type",
-        default=None,
-        type=str,
-        required=True,
-        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
-    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
-        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
+        help="Path to pretrained model or model identifier from huggingface.co/models",
    )
    parser.add_argument(
        "--output_dir",
@@ -536,9 +517,6 @@ def main():
    parser.add_argument(
        "--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step."
    )
-    parser.add_argument(
-        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
-    )

    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
    parser.add_argument(
@@ -652,13 +630,9 @@ def main():
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

-    args.model_type = args.model_type.lower()
-    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
-    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
-    tokenizer = tokenizer_class.from_pretrained(
-        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case
-    )
-    model = model_class.from_pretrained(
+    config = AutoConfig.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
+    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,)
+    model = AutoModelForMultipleChoice.from_pretrained(
        args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config
    )

@@ -694,8 +668,8 @@ def main():
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
-        model = model_class.from_pretrained(args.output_dir)
-        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
+        model = AutoModelForMultipleChoice.from_pretrained(args.output_dir)
+        tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
        model.to(args.device)

    # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
@@ -718,8 +692,8 @@ def main():
        for checkpoint in checkpoints:
            # Reload the model
            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
-            model = model_class.from_pretrained(checkpoint)
-            tokenizer = tokenizer_class.from_pretrained(checkpoint)
+            model = AutoModelForMultipleChoice.from_pretrained(checkpoint)
+            tokenizer = AutoTokenizer.from_pretrained(checkpoint)
            model.to(args.device)

            # Evaluate
--- a/examples/contrib/run_transfo_xl.py
+++ b/examples/contrib/run_transfo_xl.py
@@ -80,7 +80,7 @@ def main():

    # Load a pre-trained model
    model = TransfoXLLMHeadModel.from_pretrained(args.model_name)
-    model = model.to(device)
+    model.to(device)

    logger.info(
        "Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}".format(
--- a/examples/distillation/distiller.py
+++ b/examples/distillation/distiller.py
@@ -80,7 +80,7 @@ class Distiller:

        self.mlm = params.mlm
        if self.mlm:
-            logger.info(f"Using MLM loss for LM step.")
+            logger.info("Using MLM loss for LM step.")
            self.mlm_mask_prop = params.mlm_mask_prop
            assert 0.0 <= self.mlm_mask_prop <= 1.0
            assert params.word_mask + params.word_keep + params.word_rand == 1.0
@@ -91,7 +91,7 @@ class Distiller:
                self.pred_probs = self.pred_probs.half()
                self.token_probs = self.token_probs.half()
        else:
-            logger.info(f"Using CLM loss for LM step.")
+            logger.info("Using CLM loss for LM step.")

        self.epoch = 0
        self.n_iter = 0
@@ -365,8 +365,8 @@ class Distiller:
            self.end_epoch()

        if self.is_master:
-            logger.info(f"Save very last checkpoint as `pytorch_model.bin`.")
-            self.save_checkpoint(checkpoint_name=f"pytorch_model.bin")
+            logger.info("Save very last checkpoint as `pytorch_model.bin`.")
+            self.save_checkpoint(checkpoint_name="pytorch_model.bin")
            logger.info("Training is finished")

    def step(self, input_ids: torch.tensor, attention_mask: torch.tensor, lm_labels: torch.tensor):
--- a/examples/distillation/run_squad_w_distillation.py
+++ b/examples/distillation/run_squad_w_distillation.py
@@ -13,7 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" This is the exact same script as `examples/run_squad.py` (as of 2020, January 8th) with an additional and optional step of distillation."""
+""" This is the exact same script as `examples/question-answering/run_squad.py` (as of 2020, January 8th) with an additional and optional step of distillation."""

 import argparse
 import glob
@@ -67,9 +67,6 @@ except ImportError:

 logger = logging.getLogger(__name__)

-ALL_MODELS = sum(
-    (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, XLMConfig)), ()
-)

 MODEL_CLASSES = {
    "bert": (BertConfig, BertForQuestionAnswering, BertTokenizer),
@@ -505,7 +502,7 @@ def main():
        default=None,
        type=str,
        required=True,
-        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
+        help="Path to pretrained model or model identifier from huggingface.co/models",
    )
    parser.add_argument(
        "--output_dir",
--- a/examples/distillation/scripts/binarized_data.py
+++ b/examples/distillation/scripts/binarized_data.py
@@ -60,7 +60,7 @@ def main():
    with open(args.file_path, "r", encoding="utf8") as fp:
        data = fp.readlines()

-    logger.info(f"Start encoding")
+    logger.info("Start encoding")
    logger.info(f"{len(data)} examples to process.")

    rslt = []
--- a/examples/distillation/scripts/extract.py
+++ b/examples/distillation/scripts/extract.py
@@ -93,7 +93,7 @@ if __name__ == "__main__":
    elif args.model_type == "gpt2":
        for w in ["weight", "bias"]:
            compressed_sd[f"{prefix}.ln_f.{w}"] = state_dict[f"{prefix}.ln_f.{w}"]
-        compressed_sd[f"lm_head.weight"] = state_dict[f"lm_head.weight"]
+        compressed_sd["lm_head.weight"] = state_dict["lm_head.weight"]

    print(f"N layers selected for distillation: {std_idx}")
    print(f"Number of params transfered for distillation: {len(compressed_sd.keys())}")
--- a/examples/distillation/scripts/extract_distilbert.py
+++ b/examples/distillation/scripts/extract_distilbert.py
@@ -37,7 +37,7 @@ if __name__ == "__main__":
        model = BertForMaskedLM.from_pretrained(args.model_name)
        prefix = "bert"
    else:
-        raise ValueError(f'args.model_type should be "bert".')
+        raise ValueError('args.model_type should be "bert".')

    state_dict = model.state_dict()
    compressed_sd = {}
@@ -78,8 +78,8 @@ if __name__ == "__main__":
            ]
        std_idx += 1

-    compressed_sd[f"vocab_projector.weight"] = state_dict[f"cls.predictions.decoder.weight"]
-    compressed_sd[f"vocab_projector.bias"] = state_dict[f"cls.predictions.bias"]
+    compressed_sd["vocab_projector.weight"] = state_dict["cls.predictions.decoder.weight"]
+    compressed_sd["vocab_projector.bias"] = state_dict["cls.predictions.bias"]
    if args.vocab_transform:
        for w in ["weight", "bias"]:
            compressed_sd[f"vocab_transform.{w}"] = state_dict[f"cls.predictions.transform.dense.{w}"]
--- a/examples/distillation/train.py
+++ b/examples/distillation/train.py
@@ -273,7 +273,7 @@ def main():
        token_probs = None

    train_lm_seq_dataset = LmSeqsDataset(params=args, data=data)
-    logger.info(f"Data loader created.")
+    logger.info("Data loader created.")

    # STUDENT #
    logger.info(f"Loading student config from {args.student_config}")
@@ -288,7 +288,7 @@ def main():

    if args.n_gpu > 0:
        student.to(f"cuda:{args.local_rank}")
-    logger.info(f"Student loaded.")
+    logger.info("Student loaded.")

    # TEACHER #
    teacher = teacher_model_class.from_pretrained(args.teacher_name, output_hidden_states=True)
--- a/examples/glue/README.md
+++ b/examples/glue/README.md
@@ -1,9 +0,0 @@
-# GLUE Benchmark
-
-Based on the script [`run_glue.py`](https://github.com/huggingface/transformers/blob/master/examples/run_glue.py).
-
-#### Run PyTorch version using PyTorch-Lightning
-
-Run `bash run_pl.sh` from the `glue` directory. This will also install `pytorch-lightning` and the requirements in `examples/requirements.txt`. It is a shell pipeline that will automatically download, pre-process the data and run the specified models. Logs are saved in `lightning_logs` directory.
-
-Pass `--n_gpu` flag to change the number of GPUs. Default uses 1. At the end, the expected results are: `TEST RESULTS {'val_loss': tensor(0.0707), 'precision': 0.852427800698191, 'recall': 0.869537067011978, 'f1': 0.8608974358974358}`
--- a/examples/language-modeling/README.md
+++ b/examples/language-modeling/README.md
@@ -0,0 +1,62 @@
+
+## Language model training
+
+Based on the script [`run_language_modeling.py`](https://github.com/huggingface/transformers/blob/master/examples/language-modeling/run_language_modeling.py).
+
+Fine-tuning (or training from scratch) the library models for language modeling on a text dataset for GPT, GPT-2, BERT, DistilBERT and RoBERTa. GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT, DistilBERT and RoBERTa
+are fine-tuned using a masked language modeling (MLM) loss.
+
+Before running the following example, you should get a file that contains text on which the language model will be
+trained or fine-tuned. A good example of such text is the [WikiText-2 dataset](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/).
+
+We will refer to two different files: `$TRAIN_FILE`, which contains text for training, and `$TEST_FILE`, which contains
+text that will be used for evaluation.
+
+### GPT-2/GPT and causal language modeling
+
+The following example fine-tunes GPT-2 on WikiText-2. We're using the raw WikiText-2 (no tokens were replaced before
+the tokenization). The loss here is that of causal language modeling.
+
+```bash
+export TRAIN_FILE=/path/to/dataset/wiki.train.raw
+export TEST_FILE=/path/to/dataset/wiki.test.raw
+
+python run_language_modeling.py \
+    --output_dir=output \
+    --model_type=gpt2 \
+    --model_name_or_path=gpt2 \
+    --do_train \
+    --train_data_file=$TRAIN_FILE \
+    --do_eval \
+    --eval_data_file=$TEST_FILE
+```
+
+This takes about half an hour to train on a single K80 GPU and about one minute for the evaluation to run. It reaches
+a score of ~20 perplexity once fine-tuned on the dataset.
+
+### RoBERTa/BERT/DistilBERT and masked language modeling
+
+The following example fine-tunes RoBERTa on WikiText-2. Here too, we're using the raw WikiText-2. The loss is different
+as BERT/RoBERTa have a bidirectional mechanism; we're therefore using the same loss that was used during their
+pre-training: masked language modeling.
+
+In accordance with the RoBERTa paper, we use dynamic masking rather than static masking. The model may, therefore, converge
+slightly slower (over-fitting takes more epochs).
+
+We use the `--mlm` flag so that the script may change its loss function.
+
+```bash
+export TRAIN_FILE=/path/to/dataset/wiki.train.raw
+export TEST_FILE=/path/to/dataset/wiki.test.raw
+
+python run_language_modeling.py \
+    --output_dir=output \
+    --model_type=roberta \
+    --model_name_or_path=roberta-base \
+    --do_train \
+    --train_data_file=$TRAIN_FILE \
+    --do_eval \
+    --eval_data_file=$TEST_FILE \
+    --mlm
+```
+
--- a/examples/language-modeling/run_language_modeling.py
+++ b/examples/language-modeling/run_language_modeling.py
@@ -0,0 +1,281 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Fine-tuning the library models for language modeling on a text file (GPT, GPT-2, BERT, RoBERTa).
+GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa are fine-tuned
+using a masked language modeling (MLM) loss.
+"""
+
+
+import logging
+import math
+import os
+from dataclasses import dataclass, field
+from typing import Optional
+
+from transformers import (
+    CONFIG_MAPPING,
+    MODEL_WITH_LM_HEAD_MAPPING,
+    AutoConfig,
+    AutoModelWithLMHead,
+    AutoTokenizer,
+    DataCollatorForLanguageModeling,
+    HfArgumentParser,
+    LineByLineTextDataset,
+    PreTrainedTokenizer,
+    TextDataset,
+    Trainer,
+    TrainingArguments,
+    set_seed,
+)
+
+
+logger = logging.getLogger(__name__)
+
+
+MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys())
+MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
+
+
+@dataclass
+class ModelArguments:
+    """
+    Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
+    """
+
+    model_name_or_path: Optional[str] = field(
+        default=None,  # None => main() builds a fresh model with AutoModelWithLMHead.from_config
+        metadata={
+            "help": "The model checkpoint for weights initialization. Leave None if you want to train a model from scratch."
+        },
+    )
+    model_type: Optional[str] = field(
+        default=None,  # only read by main() when neither config_name nor model_name_or_path is set
+        metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
+    )
+    config_name: Optional[str] = field(  # when set, main() loads the config from here first
+        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
+    )
+    tokenizer_name: Optional[str] = field(  # main() raises ValueError if this and model_name_or_path are both unset
+        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
+    )
+    cache_dir: Optional[str] = field(  # passed as cache_dir to every from_pretrained call in main()
+        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
+    )
+
+
+@dataclass
+class DataTrainingArguments:
+    """
+    Arguments describing the data fed to the model for training and evaluation, and how it is cut into examples.
+    """
+
+    train_data_file: Optional[str] = field(
+        default=None, metadata={"help": "The input training data file (a text file)."}
+    )
+    eval_data_file: Optional[str] = field(
+        default=None,
+        metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
+    )
+    line_by_line: bool = field(
+        default=False,
+        metadata={"help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."},
+    )
+
+    mlm: bool = field(
+        default=False, metadata={"help": "Train with masked-language modeling loss instead of language modeling."}
+    )
+    mlm_probability: float = field(
+        default=0.15, metadata={"help": "Ratio of tokens to mask for masked language modeling loss"}
+    )
+
+    block_size: int = field(
+        default=-1,  # any value <= 0 is replaced by tokenizer.max_len in main()
+        metadata={
+            "help": "Optional input sequence length after tokenization. "
+            "The training dataset will be truncated in block of this size for training. "
+            "Default to the model max input length for single sentence inputs (take into account special tokens)."
+        },
+    )
+    overwrite_cache: bool = field(
+        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
+    )
+
+
+def get_dataset(args: DataTrainingArguments, tokenizer: PreTrainedTokenizer, evaluate=False):
+    # Choose the eval or train file, then wrap it in a per-line or fixed-block dataset.
+    source_path = args.eval_data_file if evaluate else args.train_data_file
+    if not args.line_by_line:
+        return TextDataset(
+            tokenizer=tokenizer, file_path=source_path, block_size=args.block_size, overwrite_cache=args.overwrite_cache
+        )
+    return LineByLineTextDataset(tokenizer=tokenizer, file_path=source_path, block_size=args.block_size)
+
+
+def main():
+    # See all possible arguments in src/transformers/training_args.py
+    # or by passing the --help flag to this script.
+    # We now keep distinct sets of args, for a cleaner separation of concerns.
+
+    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
+    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+
+    if data_args.eval_data_file is None and training_args.do_eval:
+        raise ValueError(
+            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
+            "or remove the --do_eval argument."
+        )
+
+    if (
+        os.path.exists(training_args.output_dir)
+        and os.listdir(training_args.output_dir)
+        and training_args.do_train
+        and not training_args.overwrite_output_dir
+    ):
+        raise ValueError(
+            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
+        )
+
+    # Setup logging
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
+    )
+    logger.warning(
+        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+        training_args.local_rank,
+        training_args.device,
+        training_args.n_gpu,
+        bool(training_args.local_rank != -1),
+        training_args.fp16,
+    )
+    logger.info("Training/evaluation parameters %s", training_args)
+
+    # Set seed
+    set_seed(training_args.seed)
+
+    # Load pretrained model and tokenizer
+    #
+    # Distributed training:
+    # The .from_pretrained methods guarantee that only one local process can concurrently
+    # download model & vocab.
+
+    if model_args.config_name:
+        config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
+    elif model_args.model_name_or_path:
+        config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
+    else:
+        config = CONFIG_MAPPING[model_args.model_type]()
+        logger.warning("You are instantiating a new config instance from scratch.")
+
+    if model_args.tokenizer_name:
+        tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, cache_dir=model_args.cache_dir)
+    elif model_args.model_name_or_path:
+        tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
+    else:
+        raise ValueError(
+            "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, save it, "
+            "and load it from here, using --tokenizer_name"
+        )
+
+    if model_args.model_name_or_path:
+        model = AutoModelWithLMHead.from_pretrained(
+            model_args.model_name_or_path,
+            from_tf=bool(".ckpt" in model_args.model_name_or_path),
+            config=config,
+            cache_dir=model_args.cache_dir,
+        )
+    else:
+        logger.info("Training new model from scratch")
+        model = AutoModelWithLMHead.from_config(config)
+
+    model.resize_token_embeddings(len(tokenizer))
+
+    if config.model_type in ["bert", "roberta", "distilbert", "camembert"] and not data_args.mlm:
+        raise ValueError(
+            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the --mlm "
+            "flag (masked language modeling)."
+        )
+
+    if data_args.block_size <= 0:
+        # Our input block size will be the max possible for the model
+        data_args.block_size = tokenizer.max_len
+    else:
+        data_args.block_size = min(data_args.block_size, tokenizer.max_len)
+
+    # Get datasets
+
+    train_dataset = get_dataset(data_args, tokenizer=tokenizer) if training_args.do_train else None
+    eval_dataset = get_dataset(data_args, tokenizer=tokenizer, evaluate=True) if training_args.do_eval else None
+    data_collator = DataCollatorForLanguageModeling(
+        tokenizer=tokenizer, mlm=data_args.mlm, mlm_probability=data_args.mlm_probability
+    )
+
+    # Initialize our Trainer
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        data_collator=data_collator,
+        train_dataset=train_dataset,
+        eval_dataset=eval_dataset,
+        prediction_loss_only=True,
+    )
+
+    # Training
+    if training_args.do_train:
+        model_path = (
+            model_args.model_name_or_path
+            if model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path)
+            else None
+        )
+        trainer.train(model_path=model_path)
+        trainer.save_model()
+        # For convenience, we also re-save the tokenizer to the same directory,
+        # so that you can share your model easily on huggingface.co/models =)
+        if trainer.is_world_master():
+            tokenizer.save_pretrained(training_args.output_dir)
+
+    # Evaluation
+    results = {}
+    if training_args.do_eval:
+        logger.info("*** Evaluate ***")
+
+        eval_output = trainer.evaluate()
+
+        perplexity = math.exp(eval_output["eval_loss"])
+        result = {"perplexity": perplexity}
+
+        output_eval_file = os.path.join(training_args.output_dir, "eval_results_lm.txt")
+        if trainer.is_world_master():
+            with open(output_eval_file, "w") as writer:
+                logger.info("***** Eval results *****")
+                for key in sorted(result.keys()):
+                    logger.info("  %s = %s", key, str(result[key]))
+                    writer.write("%s = %s\n" % (key, str(result[key])))
+
+        results.update(result)
+
+    return results
+
+
+def _mp_fn(index):
+    # Entry point for xla_spawn (TPUs); `index` is accepted but not used, we simply run main().
+    main()
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/transformer_base.py
+++ b/examples/transformer_base.py
@@ -1,3 +1,4 @@
+import argparse
 import logging
 import os
 import random
@@ -7,7 +8,6 @@ import pytorch_lightning as pl
 import torch

 from transformers import (
-    ALL_PRETRAINED_MODEL_ARCHIVE_MAP,
    AdamW,
    AutoConfig,
    AutoModel,
@@ -19,15 +19,11 @@ from transformers import (
    AutoTokenizer,
    get_linear_schedule_with_warmup,
 )
-from transformers.modeling_auto import MODEL_MAPPING


 logger = logging.getLogger(__name__)


-ALL_MODELS = tuple(ALL_PRETRAINED_MODEL_ARCHIVE_MAP)
-MODEL_CLASSES = tuple(m.model_type for m in MODEL_MAPPING)
-
 MODEL_MODES = {
    "base": AutoModel,
    "sequence-classification": AutoModelForSequenceClassification,
@@ -38,7 +34,7 @@ MODEL_MODES = {
 }


-def set_seed(args):
+def set_seed(args: argparse.Namespace):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
@@ -47,30 +43,28 @@ def set_seed(args):


 class BaseTransformer(pl.LightningModule):
-    def __init__(self, hparams, num_labels=None, mode="base"):
+    def __init__(self, hparams: argparse.Namespace, num_labels=None, mode="base", **config_kwargs):
        "Initialize a model."

-        super(BaseTransformer, self).__init__()
+        super().__init__()
        self.hparams = hparams
-        self.hparams.model_type = self.hparams.model_type.lower()
-
-        config = AutoConfig.from_pretrained(
+        cache_dir = self.hparams.cache_dir if self.hparams.cache_dir else None
+        self.config = AutoConfig.from_pretrained(
            self.hparams.config_name if self.hparams.config_name else self.hparams.model_name_or_path,
-            num_labels=num_labels,
-            cache_dir=self.hparams.cache_dir if self.hparams.cache_dir else None,
+            **({"num_labels": num_labels} if num_labels is not None else {}),
+            cache_dir=cache_dir,
+            **config_kwargs,
        )
-        tokenizer = AutoTokenizer.from_pretrained(
+        self.tokenizer = AutoTokenizer.from_pretrained(
            self.hparams.tokenizer_name if self.hparams.tokenizer_name else self.hparams.model_name_or_path,
-            do_lower_case=self.hparams.do_lower_case,
-            cache_dir=self.hparams.cache_dir if self.hparams.cache_dir else None,
+            cache_dir=cache_dir,
        )
-        model = MODEL_MODES[mode].from_pretrained(
+        self.model = MODEL_MODES[mode].from_pretrained(
            self.hparams.model_name_or_path,
            from_tf=bool(".ckpt" in self.hparams.model_name_or_path),
-            config=config,
-            cache_dir=self.hparams.cache_dir if self.hparams.cache_dir else None,
+            config=self.config,
+            cache_dir=cache_dir,
        )
-        self.config, self.tokenizer, self.model = config, tokenizer, model

    def is_logger(self):
        return self.trainer.proc_rank <= 0
@@ -103,8 +97,8 @@ class BaseTransformer(pl.LightningModule):
        self.lr_scheduler.step()

    def get_tqdm_dict(self):
-        tqdm_dict = {"loss": "{:.3f}".format(self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}
-
+        avg_loss = getattr(self.trainer, "avg_loss", 0.0)
+        tqdm_dict = {"loss": "{:.3f}".format(avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}
        return tqdm_dict

    def test_step(self, batch, batch_nb):
@@ -146,19 +140,12 @@ class BaseTransformer(pl.LightningModule):

    @staticmethod
    def add_model_specific_args(parser, root_dir):
-        parser.add_argument(
-            "--model_type",
-            default=None,
-            type=str,
-            required=True,
-            help="Model type selected in the list: " + ", ".join(MODEL_CLASSES),
-        )
        parser.add_argument(
            "--model_name_or_path",
            default=None,
            type=str,
            required=True,
-            help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
+            help="Path to pretrained model or model identifier from huggingface.co/models",
        )
        parser.add_argument(
            "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
@@ -175,9 +162,6 @@ class BaseTransformer(pl.LightningModule):
            type=str,
            help="Where do you want to store the pre-trained models downloaded from s3",
        )
-        parser.add_argument(
-            "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
-        )
        parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
        parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
        parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
@@ -191,7 +175,7 @@ class BaseTransformer(pl.LightningModule):


 class LoggingCallback(pl.Callback):
-    def on_validation_end(self, trainer, pl_module):
+    def on_validation_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule):
        logger.info("***** Validation results *****")
        if pl_module.is_logger():
            metrics = trainer.callback_metrics
@@ -200,7 +184,7 @@ class LoggingCallback(pl.Callback):
                if key not in ["log", "progress_bar"]:
                    logger.info("{} = {}\n".format(key, str(metrics[key])))

-    def on_test_end(self, trainer, pl_module):
+    def on_test_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule):
        logger.info("***** Test results *****")

        if pl_module.is_logger():
@@ -250,24 +234,13 @@ def add_generic_args(parser, root_dir):
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )

-    parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
-    parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")


-def generic_train(model, args):
+def generic_train(model: BaseTransformer, args: argparse.Namespace):
    # init model
    set_seed(args)

-    # Setup distant debugging if needed
-    if args.server_ip and args.server_port:
-        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
-        import ptvsd
-
-        print("Waiting for debugger attach")
-        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
-        ptvsd.wait_for_attach()
-
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))

--- a/examples/movement-pruning/README.md
+++ b/examples/movement-pruning/README.md
@@ -0,0 +1,183 @@
+# Movement Pruning: Adaptive Sparsity by Fine-Tuning
+
+*Magnitude pruning is a widely used strategy for reducing model size in pure supervised learning; however, it is less effective in the transfer learning regime that has become standard for state-of-the-art natural language processing applications. We propose the use of *movement pruning*, a simple, deterministic first-order weight pruning method that is more adaptive to pretrained model fine-tuning. Experiments show that when pruning large pretrained language models, movement pruning shows significant improvements in high-sparsity regimes. When combined with distillation, the approach achieves minimal accuracy loss with down to only 3% of the model parameters:*
+
+| Fine-pruning+Distillation<br>(Teacher=BERT-base fine-tuned) | BERT base<br>fine-tuned | Remaining<br>Weights (%) | Magnitude Pruning      | L0 Regularization      | Movement Pruning       | Soft Movement Pruning          |
+| :---:                                                       | :---:                   | :---:                    | :---:                  | :---:                  | :---:                  | :---:                          |
+| SQuAD - Dev<br>EM/F1                                        | 80.4/88.1               | 10%<br>3%                | 70.2/80.1<br>45.5/59.6 | 72.4/81.9<br>64.3/75.8 | 75.6/84.3<br>67.5/78.0 | **76.6/84.9**<br>**72.7/82.3** |
+| MNLI - Dev<br>acc/MM acc                                    | 84.5/84.9               | 10%<br>3%                | 78.3/79.3<br>69.4/70.6 | 78.7/79.7<br>76.0/76.2 | 80.1/80.4<br>76.5/77.4 | **81.2/81.8**<br>**79.5/80.1** |
+| QQP - Dev<br>acc/F1                                         | 91.4/88.4               | 10%<br>3%                | 79.8/65.0<br>72.4/57.8 | 88.1/82.8<br>87.0/81.9 | 89.7/86.2<br>86.1/81.5 | **90.2/86.8**<br>**89.1/85.5** |
+
+This page contains information on how to fine-prune pre-trained models such as `BERT` to obtain extremely sparse models with movement pruning. In contrast to magnitude pruning which selects weights that are far from 0, movement pruning retains weights that are moving away from 0.
+
+For more information, we invite you to check out [our paper](https://arxiv.org/abs/2005.07683).
+You can also have a look at this fun *Explain Like I'm Five* introductory [slide deck](https://www.slideshare.net/VictorSanh/movement-pruning-explain-like-im-five-234205241).
+
+<div align="center">
+<img src="https://www.seekpng.com/png/detail/166-1669328_how-to-make-emmental-cheese-at-home-icooker.png" width="400">
+</div>
+
+## Extreme sparsity and efficient storage
+
+One promise of extreme pruning is to obtain extremely small models that can be easily sent (and stored) on edge devices. By setting weights to 0., we reduce the amount of information we need to store, thus decreasing the memory size. We are able to obtain extremely sparse fine-pruned models with movement pruning: ~95% of the dense performance with ~5% of total remaining weights in the BERT encoder.
+
+In [this notebook](https://github.com/huggingface/transformers/blob/master/examples/movement-pruning/Saving_PruneBERT.ipynb), we showcase how we can leverage standard tools that exist out-of-the-box to efficiently store an extremely sparse question answering model (only 6% of total remaining weights in the encoder). We are able to reduce the memory size of the encoder **from the 340MB (the original dense BERT) to 11MB**, without any additional training of the model (every operation is performed *post fine-pruning*). It is sufficiently small to store it on a [91' floppy disk](https://en.wikipedia.org/wiki/Floptical) 📎!
+
+While movement pruning does not directly optimize for memory footprint (but rather the number of non-null weights), we hypothesize that further memory compression ratios can be achieved with specific quantization aware trainings (see for instance [Q8BERT](https://arxiv.org/abs/1910.06188), [And the Bit Goes Down](https://arxiv.org/abs/1907.05686) or [Quant-Noise](https://arxiv.org/abs/2004.07320)).
+
+## Fine-pruned models
+
+As examples, we release two English PruneBERT checkpoints (models fine-pruned from a pre-trained `BERT` checkpoint), one on SQuAD and the other on MNLI.
+
+- **`prunebert-base-uncased-6-finepruned-w-distil-squad`**<br/>
+Pre-trained `BERT-base-uncased` fine-pruned with soft movement pruning on SQuAD v1.1. We use an additional distillation signal from `BERT-base-uncased` finetuned on SQuAD. The encoder counts 6% of total non-null weights and reaches 83.8 F1 score. The model can be accessed with: `pruned_bert = BertForQuestionAnswering.from_pretrained("huggingface/prunebert-base-uncased-6-finepruned-w-distil-squad")`
+- **`prunebert-base-uncased-6-finepruned-w-distil-mnli`**<br/>
+Pre-trained `BERT-base-uncased` fine-pruned with soft movement pruning on MNLI. We use an additional distillation signal from `BERT-base-uncased` finetuned on MNLI. The encoder counts 6% of total non-null weights and reaches 80.7 (matched) accuracy. The model can be accessed with: `pruned_bert = BertForSequenceClassification.from_pretrained("huggingface/prunebert-base-uncased-6-finepruned-w-distil-mnli")`
+
+## How to fine-prune?
+
+### Setup
+
+The code relies on the 🤗 Transformers library. In addition to the dependencies listed in the [`examples`](https://github.com/huggingface/transformers/tree/master/examples) folder, you should install a few additional dependencies listed in the `requirements.txt` file: `pip install -r requirements.txt`.
+
+Note that we built our experiments on top of a stabilized version of the library (commit https://github.com/huggingface/transformers/commit/352d5472b0c1dec0f420d606d16747d851b4bda8): we do not guarantee that everything is still compatible with the latest version of the master branch.
+
+### Fine-pruning with movement pruning
+
+Below, we detail how to reproduce the results reported in the paper. We use SQuAD as a running example. Commands (and scripts) can be easily adapted for other tasks.
+
+The following command fine-prunes a pre-trained `BERT-base` on SQuAD using movement pruning towards 15% of remaining weights (85% sparsity). Note that we freeze all the embeddings modules (from their pre-trained value) and only prune the Fully Connected layers in the encoder (12 layers of Transformer Block).
+
+```bash
+SERIALIZATION_DIR=<OUTPUT_DIR>
+SQUAD_DATA=<SQUAD_DATA>
+
+python examples/movement-pruning/masked_run_squad.py \
+    --output_dir $SERIALIZATION_DIR \
+    --data_dir $SQUAD_DATA \
+    --train_file train-v1.1.json \
+    --predict_file dev-v1.1.json \
+    --do_train --do_eval --do_lower_case \
+    --model_type masked_bert \
+    --model_name_or_path bert-base-uncased \
+    --per_gpu_train_batch_size 16 \
+    --warmup_steps 5400 \
+    --num_train_epochs 10 \
+    --learning_rate 3e-5 --mask_scores_learning_rate 1e-2 \
+    --initial_threshold 1 --final_threshold 0.15 \
+    --initial_warmup 1 --final_warmup 2 \
+    --pruning_method topK --mask_init constant --mask_scale 0.
+```
+
+### Fine-pruning with other methods
+
+We can also explore other fine-pruning methods by changing the `pruning_method` parameter:
+
+Soft movement pruning
+```bash
+python examples/movement-pruning/masked_run_squad.py \
+    --output_dir $SERIALIZATION_DIR \
+    --data_dir $SQUAD_DATA \
+    --train_file train-v1.1.json \
+    --predict_file dev-v1.1.json \
+    --do_train --do_eval --do_lower_case \
+    --model_type masked_bert \
+    --model_name_or_path bert-base-uncased \
+    --per_gpu_train_batch_size 16 \
+    --warmup_steps 5400 \
+    --num_train_epochs 10 \
+    --learning_rate 3e-5 --mask_scores_learning_rate 1e-2 \
+    --initial_threshold 0 --final_threshold 0.1 \
+    --initial_warmup 1 --final_warmup 2 \
+    --pruning_method sigmoied_threshold --mask_init constant --mask_scale 0. \
+    --regularization l1 --final_lambda 400.
+```
+
+L0 regularization
+```bash
+python examples/movement-pruning/masked_run_squad.py \
+    --output_dir $SERIALIZATION_DIR \
+    --data_dir $SQUAD_DATA \
+    --train_file train-v1.1.json \
+    --predict_file dev-v1.1.json \
+    --do_train --do_eval --do_lower_case \
+    --model_type masked_bert \
+    --model_name_or_path bert-base-uncased \
+    --per_gpu_train_batch_size 16 \
+    --warmup_steps 5400 \
+    --num_train_epochs 10 \
+    --learning_rate 3e-5 --mask_scores_learning_rate 1e-1 \
+    --initial_threshold 1. --final_threshold 1. \
+    --initial_warmup 1 --final_warmup 1 \
+    --pruning_method l0 --mask_init constant --mask_scale 2.197 \
+    --regularization l0 --final_lambda 125.
+```
+
+Iterative Magnitude Pruning
+```bash
+python examples/movement-pruning/masked_run_squad.py \
+    --output_dir ./dbg \
+    --data_dir examples/distillation/data/squad_data \
+    --train_file train-v1.1.json \
+    --predict_file dev-v1.1.json \
+    --do_train --do_eval --do_lower_case \
+    --model_type masked_bert \
+    --model_name_or_path bert-base-uncased \
+    --per_gpu_train_batch_size 16 \
+    --warmup_steps 5400 \
+    --num_train_epochs 10 \
+    --learning_rate 3e-5 \
+    --initial_threshold 1 --final_threshold 0.15 \
+    --initial_warmup 1 --final_warmup 2 \
+    --pruning_method magnitude
+```
+
+### After fine-pruning
+
+**Counting parameters**
+
+Regularization based pruning methods (soft movement pruning and L0 regularization) rely on the penalty to induce sparsity. The multiplicative coefficient controls the sparsity level.
+To obtain the effective sparsity level in the encoder, we simply count the number of activated (non-null) weights:
+
+```bash
+python examples/movement-pruning/count_parameters.py \
+    --pruning_method sigmoied_threshold \
+    --threshold 0.1 \
+    --serialization_dir $SERIALIZATION_DIR
+```
+
+**Pruning once and for all**
+
+Once the model has been fine-pruned, the pruned weights can be set to 0. once and for all (reducing the amount of information to store). In our running experiments, we can convert a `MaskedBertForQuestionAnswering` (a BERT model augmented to enable on-the-fly pruning capabilities) to a standard `BertForQuestionAnswering`:
+
+```bash
+python examples/movement-pruning/bertarize.py \
+    --pruning_method sigmoied_threshold \
+    --threshold 0.1 \
+    --model_name_or_path $SERIALIZATION_DIR
+```
+
+## Hyper-parameters
+
+For reproducibility purposes, we share the detailed results presented in the paper. These [tables](https://docs.google.com/spreadsheets/d/17JgRq_OFFTniUrz6BZWW_87DjFkKXpI1kYDSsseT_7g/edit?usp=sharing) exhaustively describe the individual hyper-parameters used for each data point.
+
+## Inference speed
+
+Early experiments show that even though models fine-pruned with (soft) movement pruning are extremely sparse, they do not benefit from significant improvement in terms of inference speed when using the standard PyTorch inference.
+We are currently benchmarking and exploring inference setups specifically for sparse architectures.
+In particular, hardware manufacturers are announcing devices that will speed up inference for sparse networks considerably.
+
+## Citation
+
+If you find this resource useful, please consider citing the following paper:
+
+```
+@article{sanh2020movement,
+    title={Movement Pruning: Adaptive Sparsity by Fine-Tuning},
+    author={Victor Sanh and Thomas Wolf and Alexander M. Rush},
+    year={2020},
+    eprint={2005.07683},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL}
+}
+```
--- a/examples/movement-pruning/Saving_PruneBERT.ipynb
+++ b/examples/movement-pruning/Saving_PruneBERT.ipynb
@@ -0,0 +1,612 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Saving PruneBERT\n",
+    "\n",
+    "\n",
+    "This notebook aims at showcasing how we can leverage standard tools to save (and load) an extremely sparse model fine-pruned with [movement pruning](https://arxiv.org/abs/2005.07683) (or any other unstructured pruning method).\n",
+    "\n",
+    "In this example, we used BERT (base-uncased), but the procedure described here is not specific to BERT and can be applied to a large variety of models.\n",
+    "\n",
+    "We first obtain an extremely sparse model by fine-pruning with movement pruning on SQuAD v1.1. We then used the following combination of standard tools:\n",
+    "- We reduce the precision of the model with Int8 dynamic quantization using [PyTorch implementation](https://pytorch.org/tutorials/intermediate/dynamic_quantization_bert_tutorial.html). We only quantized the Fully Connected Layers.\n",
+    "- Sparse quantized matrices are converted into the [Compressed Sparse Row format](https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html).\n",
+    "- We use HDF5 with `gzip` compression to store the weights.\n",
+    "\n",
+    "We experiment with a question answering model with only 6% of total remaining weights in the encoder (previously obtained with movement pruning). **We are able to reduce the memory size of the encoder from 340MB (original dense BERT) to 11MB**, which fits on a [91' floppy disk](https://en.wikipedia.org/wiki/Floptical)!\n",
+    "\n",
+    "<img src=\"https://upload.wikimedia.org/wikipedia/commons/thumb/0/00/Floptical_disk_21MB.jpg/440px-Floptical_disk_21MB.jpg\" width=\"200\">"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Includes\n",
+    "\n",
+    "import h5py\n",
+    "import os\n",
+    "import json\n",
+    "from collections import OrderedDict\n",
+    "\n",
+    "from scipy import sparse\n",
+    "import numpy as np\n",
+    "\n",
+    "import torch\n",
+    "from torch import nn\n",
+    "\n",
+    "from transformers import *\n",
+    "\n",
+    "os.chdir('../../')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Saving"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Dynamic quantization induces little or no loss of performance while significantly reducing the memory footprint."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load fine-pruned model and quantize the model\n",
+    "\n",
+    "model_path = \"serialization_dir/bert-base-uncased/92/squad/l1\"\n",
+    "model_name = \"bertarized_l1_with_distil_0._0.1_1_2_l1_1100._3e-5_1e-2_sigmoied_threshold_constant_0._10_epochs\"\n",
+    "\n",
+    "model = BertForQuestionAnswering.from_pretrained(os.path.join(model_path, model_name))\n",
+    "model.to('cpu')\n",
+    "\n",
+    "quantized_model = torch.quantization.quantize_dynamic(\n",
+    "                    model=model,\n",
+    "                    qconfig_spec = {\n",
+    "                        torch.nn.Linear : torch.quantization.default_dynamic_qconfig,\n",
+    "                    },\n",
+    "                    dtype=torch.qint8,\n",
+    "                )\n",
+    "# print(quantized_model)\n",
+    "\n",
+    "qtz_st = quantized_model.state_dict()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Saving the original (encoder + classifier) in the standard torch.save format\n",
+    "\n",
+    "dense_st = {name: param for name, param in model.state_dict().items() \n",
+    "                            if \"embedding\" not in name and \"pooler\" not in name}\n",
+    "torch.save(dense_st, 'dbg/dense_squad.pt',)\n",
+    "dense_mb_size = os.path.getsize(\"dbg/dense_squad.pt\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Decompose quantization for bert.encoder.layer.0.attention.self.query._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.0.attention.self.key._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.0.attention.self.value._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.0.attention.output.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.0.intermediate.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.0.output.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.1.attention.self.query._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.1.attention.self.key._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.1.attention.self.value._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.1.attention.output.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.1.intermediate.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.1.output.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.2.attention.self.query._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.2.attention.self.key._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.2.attention.self.value._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.2.attention.output.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.2.intermediate.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.2.output.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.3.attention.self.query._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.3.attention.self.key._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.3.attention.self.value._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.3.attention.output.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.3.intermediate.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.3.output.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.4.attention.self.query._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.4.attention.self.key._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.4.attention.self.value._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.4.attention.output.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.4.intermediate.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.4.output.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.5.attention.self.query._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.5.attention.self.key._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.5.attention.self.value._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.5.attention.output.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.5.intermediate.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.5.output.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.6.attention.self.query._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.6.attention.self.key._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.6.attention.self.value._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.6.attention.output.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.6.intermediate.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.6.output.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.7.attention.self.query._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.7.attention.self.key._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.7.attention.self.value._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.7.attention.output.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.7.intermediate.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.7.output.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.8.attention.self.query._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.8.attention.self.key._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.8.attention.self.value._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.8.attention.output.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.8.intermediate.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.8.output.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.9.attention.self.query._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.9.attention.self.key._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.9.attention.self.value._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.9.attention.output.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.9.intermediate.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.9.output.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.10.attention.self.query._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.10.attention.self.key._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.10.attention.self.value._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.10.attention.output.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.10.intermediate.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.10.output.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.11.attention.self.query._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.11.attention.self.key._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.11.attention.self.value._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.11.attention.output.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.11.intermediate.dense._packed_params.weight\n",
+      "Decompose quantization for bert.encoder.layer.11.output.dense._packed_params.weight\n",
+      "Decompose quantization for bert.pooler.dense._packed_params.weight\n",
+      "Decompose quantization for qa_outputs._packed_params.weight\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Elementary representation: we decompose the quantized tensors into (scale, zero_point, int_repr).\n",
+    "# See https://pytorch.org/docs/stable/quantization.html\n",
+    "\n",
+    "# We further leverage the fact that int_repr is sparse matrix to optimize the storage: we decompose int_repr into\n",
+    "# its CSR representation (data, indptr, indices).\n",
+    "\n",
+    "elementary_qtz_st = {}\n",
+    "for name, param in qtz_st.items():\n",
+    "    if param.is_quantized:\n",
+    "        print(\"Decompose quantization for\", name)\n",
+    "        # We need to extract the scale, the zero_point and the int_repr for the quantized tensor and modules\n",
+    "        scale = param.q_scale()                                # torch.tensor(1,) - float32\n",
+    "        zero_point = param.q_zero_point()                      # torch.tensor(1,) - int32\n",
+    "        elementary_qtz_st[f\"{name}.scale\"] = scale\n",
+    "        elementary_qtz_st[f\"{name}.zero_point\"] = zero_point\n",
+    "\n",
+    "        # We assume the int_repr is sparse and compute its CSR representation\n",
+    "        # Only the FCs in the encoder are actually sparse\n",
+    "        int_repr = param.int_repr()                         # torch.tensor(nb_rows, nb_columns) - int8\n",
+    "        int_repr_cs = sparse.csr_matrix(int_repr)           # scipy.sparse.csr.csr_matrix\n",
+    "\n",
+    "        elementary_qtz_st[f\"{name}.int_repr.data\"] = int_repr_cs.data                  # np.array int8\n",
+    "        elementary_qtz_st[f\"{name}.int_repr.indptr\"] = int_repr_cs.indptr              # np.array int32\n",
+    "        assert max(int_repr_cs.indices) < 65535 # If not, we shall fall back to int32\n",
+    "        elementary_qtz_st[f\"{name}.int_repr.indices\"] = np.uint16(int_repr_cs.indices) # np.array uint16\n",
+    "        elementary_qtz_st[f\"{name}.int_repr.shape\"] = int_repr_cs.shape                # tuple(int, int)\n",
+    "    else:\n",
+    "        elementary_qtz_st[name] = param\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Encoder Size (MB) - Sparse & Quantized - `torch.save`: 21.29\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Saving the pruned (encoder + classifier) in the standard torch.save format\n",
+    "\n",
+    "dense_optimized_st = {name: param for name, param in elementary_qtz_st.items() \n",
+    "                                    if \"embedding\" not in name and \"pooler\" not in name}\n",
+    "torch.save(dense_optimized_st, 'dbg/dense_squad_optimized.pt',)\n",
+    "print(\"Encoder Size (MB) - Sparse & Quantized - `torch.save`:\",\n",
+    "      round(os.path.getsize(\"dbg/dense_squad_optimized.pt\")/1e6, 2))\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Skip bert.embeddings.word_embeddings.weight\n",
+      "Skip bert.embeddings.position_embeddings.weight\n",
+      "Skip bert.embeddings.token_type_embeddings.weight\n",
+      "Skip bert.embeddings.LayerNorm.weight\n",
+      "Skip bert.embeddings.LayerNorm.bias\n",
+      "Skip bert.pooler.dense.scale\n",
+      "Skip bert.pooler.dense.zero_point\n",
+      "Skip bert.pooler.dense._packed_params.weight.scale\n",
+      "Skip bert.pooler.dense._packed_params.weight.zero_point\n",
+      "Skip bert.pooler.dense._packed_params.weight.int_repr.data\n",
+      "Skip bert.pooler.dense._packed_params.weight.int_repr.indptr\n",
+      "Skip bert.pooler.dense._packed_params.weight.int_repr.indices\n",
+      "Skip bert.pooler.dense._packed_params.weight.int_repr.shape\n",
+      "Skip bert.pooler.dense._packed_params.bias\n",
+      "\n",
+      "Encoder Size (MB) - Dense:              340.25\n",
+      "Encoder Size (MB) - Sparse & Quantized: 11.27\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Save the decomposed state_dict with an HDF5 file\n",
+    "# Saving only the encoder + QA Head\n",
+    "\n",
+    "with h5py.File('dbg/squad_sparse.h5','w') as hf:\n",
+    "    for name, param in elementary_qtz_st.items():\n",
+    "        if \"embedding\" in name:\n",
+    "            print(f\"Skip {name}\")\n",
+    "            continue\n",
+    "\n",
+    "        if \"pooler\" in name:\n",
+    "            print(f\"Skip {name}\")\n",
+    "            continue\n",
+    "\n",
+    "        if type(param) == torch.Tensor:\n",
+    "            if param.numel() == 1:\n",
+    "                # module scale\n",
+    "                # module zero_point\n",
+    "                hf.attrs[name] = param\n",
+    "                continue\n",
+    "\n",
+    "            if param.requires_grad:\n",
+    "                # LayerNorm\n",
+    "                param = param.detach().numpy()\n",
+    "            hf.create_dataset(name, data=param, compression=\"gzip\", compression_opts=9)\n",
+    "\n",
+    "        elif type(param) == float or type(param) == int or type(param) == tuple:\n",
+    "            # float - tensor _packed_params.weight.scale\n",
+    "            # int   - tensor_packed_params.weight.zero_point\n",
+    "            # tuple - tensor _packed_params.weight.shape\n",
+    "            hf.attrs[name] = param\n",
+    "\n",
+    "        else:\n",
+    "            hf.create_dataset(name, data=param, compression=\"gzip\", compression_opts=9)\n",
+    "\n",
+    "\n",
+    "with open('dbg/metadata.json', 'w') as f:\n",
+    "    f.write(json.dumps(qtz_st._metadata))  \n",
+    "\n",
+    "size = os.path.getsize(\"dbg/squad_sparse.h5\") + os.path.getsize(\"dbg/metadata.json\")\n",
+    "print(\"\")\n",
+    "print(\"Encoder Size (MB) - Dense:             \", round(dense_mb_size/1e6, 2))\n",
+    "print(\"Encoder Size (MB) - Sparse & Quantized:\", round(size/1e6, 2))\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Size (MB): 99.39\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Save the decomposed state_dict to HDF5 storage\n",
+    "# Save everything in the architecture (embedding + encoder + QA Head)\n",
+    "\n",
+    "with h5py.File('dbg/squad_sparse_with_embs.h5','w') as hf:\n",
+    "    for name, param in elementary_qtz_st.items():\n",
+    "#         if \"embedding\" in name:\n",
+    "#             print(f\"Skip {name}\")\n",
+    "#             continue\n",
+    "\n",
+    "#         if \"pooler\" in name:\n",
+    "#             print(f\"Skip {name}\")\n",
+    "#             continue\n",
+    "\n",
+    "        if type(param) == torch.Tensor:\n",
+    "            if param.numel() == 1:\n",
+    "                # module scale\n",
+    "                # module zero_point\n",
+    "                hf.attrs[name] = param\n",
+    "                continue\n",
+    "\n",
+    "            if param.requires_grad:\n",
+    "                # LayerNorm\n",
+    "                param = param.detach().numpy()\n",
+    "            hf.create_dataset(name, data=param, compression=\"gzip\", compression_opts=9)\n",
+    "\n",
+    "        elif type(param) == float or type(param) == int or type(param) == tuple:\n",
+    "            # float - tensor _packed_params.weight.scale\n",
+    "            # int   - tensor _packed_params.weight.zero_point\n",
+    "            # tuple - tensor _packed_params.weight.shape\n",
+    "            hf.attrs[name] = param\n",
+    "\n",
+    "        else:\n",
+    "            hf.create_dataset(name, data=param, compression=\"gzip\", compression_opts=9)\n",
+    "\n",
+    "\n",
+    "with open('dbg/metadata.json', 'w') as f:\n",
+    "    f.write(json.dumps(qtz_st._metadata))   \n",
+    "\n",
+    "size = os.path.getsize(\"dbg/squad_sparse_with_embs.h5\") + os.path.getsize(\"dbg/metadata.json\")\n",
+    "print('\\nSize (MB):', round(size/1e6, 2))\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Loading"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Reconstruct the elementary state dict\n",
+    "\n",
+    "reconstructed_elementary_qtz_st = {}\n",
+    "\n",
+    "hf = h5py.File('dbg/squad_sparse_with_embs.h5','r')\n",
+    "\n",
+    "for attr_name, attr_param in hf.attrs.items():\n",
+    "    if 'shape' in attr_name:\n",
+    "        attr_param = tuple(attr_param)\n",
+    "    elif \".scale\" in attr_name:\n",
+    "        if \"_packed_params\" in attr_name:\n",
+    "            attr_param = float(attr_param)\n",
+    "        else:\n",
+    "            attr_param = torch.tensor(attr_param)\n",
+    "    elif \".zero_point\" in attr_name:\n",
+    "        if \"_packed_params\" in attr_name:\n",
+    "            attr_param = int(attr_param)\n",
+    "        else:\n",
+    "            attr_param = torch.tensor(attr_param)\n",
+    "    reconstructed_elementary_qtz_st[attr_name] = attr_param\n",
+    "    # print(f\"Unpack {attr_name}\")\n",
+    "    \n",
+    "# Get the tensors/arrays\n",
+    "for data_name, data_param in hf.items():\n",
+    "    if \"LayerNorm\" in data_name or \"_packed_params.bias\" in data_name:\n",
+    "        reconstructed_elementary_qtz_st[data_name] = torch.from_numpy(np.array(data_param))\n",
+    "    elif \"embedding\" in data_name:\n",
+    "        reconstructed_elementary_qtz_st[data_name] = torch.from_numpy(np.array(data_param))\n",
+    "    else: # _packed_params.weight.int_repr.data, _packed_params.weight.int_repr.indices and _packed_params.weight.int_repr.indptr\n",
+    "        data_param = np.array(data_param)\n",
+    "        if \"indices\" in data_name:\n",
+    "            data_param = np.array(data_param, dtype=np.int32)\n",
+    "        reconstructed_elementary_qtz_st[data_name] = data_param\n",
+    "    # print(f\"Unpack {data_name}\")\n",
+    "    \n",
+    "\n",
+    "hf.close()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sanity checks\n",
+    "\n",
+    "for name, param in reconstructed_elementary_qtz_st.items():\n",
+    "    assert name in elementary_qtz_st\n",
+    "for name, param in elementary_qtz_st.items():\n",
+    "    assert name in reconstructed_elementary_qtz_st, name\n",
+    "\n",
+    "for name, param in reconstructed_elementary_qtz_st.items():\n",
+    "    assert type(param) == type(elementary_qtz_st[name]), name\n",
+    "    if type(param) == torch.Tensor:\n",
+    "        assert torch.all(torch.eq(param, elementary_qtz_st[name])), name\n",
+    "    elif type(param) == np.ndarray:\n",
+    "        assert (param == elementary_qtz_st[name]).all(), name\n",
+    "    else:\n",
+    "        assert param == elementary_qtz_st[name], name"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Re-assemble the sparse int_repr from the CSR format\n",
+    "\n",
+    "reconstructed_qtz_st = {}\n",
+    "\n",
+    "for name, param in reconstructed_elementary_qtz_st.items():\n",
+    "    if \"weight.int_repr.indptr\" in name:\n",
+    "        prefix_ = name[:-16]\n",
+    "        data    = reconstructed_elementary_qtz_st[f\"{prefix_}.int_repr.data\"]\n",
+    "        indptr  = reconstructed_elementary_qtz_st[f\"{prefix_}.int_repr.indptr\"]\n",
+    "        indices = reconstructed_elementary_qtz_st[f\"{prefix_}.int_repr.indices\"]\n",
+    "        shape   = reconstructed_elementary_qtz_st[f\"{prefix_}.int_repr.shape\"]\n",
+    "\n",
+    "        int_repr = sparse.csr_matrix(arg1=(data, indices, indptr),\n",
+    "                                     shape=shape)\n",
+    "        int_repr = torch.tensor(int_repr.todense())\n",
+    "\n",
+    "        scale = reconstructed_elementary_qtz_st[f\"{prefix_}.scale\"]\n",
+    "        zero_point = reconstructed_elementary_qtz_st[f\"{prefix_}.zero_point\"]\n",
+    "        weight = torch._make_per_tensor_quantized_tensor(int_repr,\n",
+    "                                                         scale,\n",
+    "                                                         zero_point)\n",
+    "\n",
+    "        reconstructed_qtz_st[f\"{prefix_}\"] = weight\n",
+    "    elif \"int_repr.data\" in name or \"int_repr.shape\" in name or \"int_repr.indices\" in name or \\\n",
+    "         \"weight.scale\" in name or \"weight.zero_point\" in name:\n",
+    "        continue\n",
+    "    else:\n",
+    "        reconstructed_qtz_st[name] = param\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sanity checks\n",
+    "\n",
+    "for name, param in reconstructed_qtz_st.items():\n",
+    "    assert name in qtz_st\n",
+    "for name, param in qtz_st.items():\n",
+    "    assert name in reconstructed_qtz_st, name\n",
+    "\n",
+    "for name, param in reconstructed_qtz_st.items():\n",
+    "    assert type(param) == type(qtz_st[name]), name\n",
+    "    if type(param) == torch.Tensor:\n",
+    "        assert torch.all(torch.eq(param, qtz_st[name])), name\n",
+    "    elif type(param) == np.ndarray:\n",
+    "        assert (param == qtz_st[name]).all(), name\n",
+    "    else:\n",
+    "        assert param == qtz_st[name], name"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Sanity checks"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<All keys matched successfully>"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Load the re-constructed state dict into a model\n",
+    "\n",
+    "dummy_model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')\n",
+    "dummy_model.to('cpu')\n",
+    "\n",
+    "reconstructed_qtz_model = torch.quantization.quantize_dynamic(\n",
+    "                            model=dummy_model,\n",
+    "                            qconfig_spec = None,\n",
+    "                            dtype=torch.qint8,\n",
+    "                          )\n",
+    "\n",
+    "reconstructed_qtz_st = OrderedDict(reconstructed_qtz_st)\n",
+    "with open('dbg/metadata.json', 'r') as read_file:\n",
+    "    metadata = json.loads(read_file.read())\n",
+    "reconstructed_qtz_st._metadata = metadata\n",
+    "\n",
+    "reconstructed_qtz_model.load_state_dict(reconstructed_qtz_st)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Sanity check passed\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Sanity checks on the inference\n",
+    "\n",
+    "N = 32\n",
+    "\n",
+    "for _ in range(25):\n",
+    "    inputs = torch.randint(low=0, high=30000, size=(N, 128))\n",
+    "    mask = torch.ones(size=(N, 128))\n",
+    "\n",
+    "    y_reconstructed = reconstructed_qtz_model(input_ids=inputs, attention_mask=mask)[0]\n",
+    "    y               = quantized_model(input_ids=inputs, attention_mask=mask)[0]\n",
+    "    \n",
+    "    assert torch.all(torch.eq(y, y_reconstructed))\n",
+    "print(\"Sanity check passed\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
--- a/examples/movement-pruning/bertarize.py
+++ b/examples/movement-pruning/bertarize.py
@@ -0,0 +1,132 @@
+# Copyright 2020-present, the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Once a model has been fine-pruned, the weights that are masked during the forward pass can be pruned once for all.
+For instance, once a model from the :class:`~emmental.MaskedBertForSequenceClassification` is trained, it can be saved (and then loaded)
+as a standard :class:`~transformers.BertForSequenceClassification`.
+"""
+
+import argparse
+import os
+import shutil
+
+import torch
+
+from emmental.modules import MagnitudeBinarizer, ThresholdBinarizer, TopKBinarizer
+
+
+def main(args):
+    pruning_method = args.pruning_method
+    threshold = args.threshold
+
+    model_name_or_path = args.model_name_or_path.rstrip("/")
+    target_model_path = args.target_model_path
+
+    print(f"Load fine-pruned model from {model_name_or_path}")
+    model = torch.load(os.path.join(model_name_or_path, "pytorch_model.bin"))
+    pruned_model = {}
+
+    for name, tensor in model.items():
+        if "embeddings" in name or "LayerNorm" in name or "pooler" in name:
+            pruned_model[name] = tensor
+            print(f"Copied layer {name}")
+        elif "classifier" in name or "qa_output" in name:
+            pruned_model[name] = tensor
+            print(f"Copied layer {name}")
+        elif "bias" in name:
+            pruned_model[name] = tensor
+            print(f"Copied layer {name}")
+        else:
+            if pruning_method == "magnitude":
+                mask = MagnitudeBinarizer.apply(inputs=tensor, threshold=threshold)
+                pruned_model[name] = tensor * mask
+                print(f"Pruned layer {name}")
+            elif pruning_method == "topK":
+                if "mask_scores" in name:
+                    continue
+                prefix_ = name[:-6]
+                scores = model[f"{prefix_}mask_scores"]
+                mask = TopKBinarizer.apply(scores, threshold)
+                pruned_model[name] = tensor * mask
+                print(f"Pruned layer {name}")
+            elif pruning_method == "sigmoied_threshold":
+                if "mask_scores" in name:
+                    continue
+                prefix_ = name[:-6]
+                scores = model[f"{prefix_}mask_scores"]
+                mask = ThresholdBinarizer.apply(scores, threshold, True)
+                pruned_model[name] = tensor * mask
+                print(f"Pruned layer {name}")
+            elif pruning_method == "l0":
+                if "mask_scores" in name:
+                    continue
+                prefix_ = name[:-6]
+                scores = model[f"{prefix_}mask_scores"]
+                l, r = -0.1, 1.1
+                s = torch.sigmoid(scores)
+                s_bar = s * (r - l) + l
+                mask = s_bar.clamp(min=0.0, max=1.0)
+                pruned_model[name] = tensor * mask
+                print(f"Pruned layer {name}")
+            else:
+                raise ValueError("Unknown pruning method")
+
+    if target_model_path is None:
+        target_model_path = os.path.join(
+            os.path.dirname(model_name_or_path), f"bertarized_{os.path.basename(model_name_or_path)}"
+        )
+
+    if not os.path.isdir(target_model_path):
+        shutil.copytree(model_name_or_path, target_model_path)
+        print(f"\nCreated folder {target_model_path}")
+
+    torch.save(pruned_model, os.path.join(target_model_path, "pytorch_model.bin"))
+    print("\nPruned model saved! See you later!")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--pruning_method",
+        choices=["l0", "magnitude", "topK", "sigmoied_threshold"],
+        type=str,
+        required=True,
+        help="Pruning Method (l0 = L0 regularization, magnitude = Magnitude pruning, topK = Movement pruning, sigmoied_threshold = Soft movement pruning)",
+    )
+    parser.add_argument(
+        "--threshold",
+        type=float,
+        required=False,
+        help="For `magnitude` and `topK`, it is the level of remaining weights (in %) in the fine-pruned model."
+        "For `sigmoied_threshold`, it is the threshold \tau against which the (sigmoied) scores are compared."
+        "Not needed for `l0`",
+    )
+    parser.add_argument(
+        "--model_name_or_path",
+        type=str,
+        required=True,
+        help="Folder containing the model that was previously fine-pruned",
+    )
+    parser.add_argument(
+        "--target_model_path",
+        default=None,
+        type=str,
+        required=False,
+        help="Folder containing the model that was previously fine-pruned",
+    )
+
+    args = parser.parse_args()
+
+    main(args)
--- a/examples/movement-pruning/counts_parameters.py
+++ b/examples/movement-pruning/counts_parameters.py
@@ -0,0 +1,92 @@
+# Copyright 2020-present, the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Count remaining (non-zero) weights in the encoder (i.e. the transformer layers).
+Sparsity and remaining weights levels are equivalent: sparsity % = 100 - remaining weights %.
+"""
+import argparse
+import os
+
+import torch
+
+from emmental.modules import ThresholdBinarizer, TopKBinarizer
+
+
+def main(args):
+    """
+    Print the share of remaining (non-pruned) weights of a fine-pruned checkpoint.
+
+    Only state-dict entries whose name contains ``encoder`` are considered. One line is printed
+    per ``mask_scores`` entry, followed by the global percentage of remaining encoder weights.
+
+    Args:
+        args: parsed command-line namespace providing
+            `serialization_dir` (folder holding ``pytorch_model.bin``),
+            `pruning_method` (``"topK"``, ``"sigmoied_threshold"`` or ``"l0"``) and
+            `threshold` (ignored for ``"l0"``).
+
+    Raises:
+        ValueError: if the pruning method is unknown, if `threshold` is missing for a method that
+            needs it, or if the checkpoint holds no encoder parameters.
+    """
+    serialization_dir = args.serialization_dir
+    pruning_method = args.pruning_method
+    threshold = args.threshold
+
+    # Fail early with a readable message instead of a TypeError deep inside the binarizer.
+    if pruning_method in ("topK", "sigmoied_threshold") and threshold is None:
+        raise ValueError("`--threshold` is required for pruning method `{}`".format(pruning_method))
+
+    st = torch.load(os.path.join(serialization_dir, "pytorch_model.bin"), map_location="cpu")
+
+    remaining_count = 0  # Number of remaining (not pruned) params in the encoder
+    encoder_count = 0  # Number of params in the encoder
+
+    print("name".ljust(60, " "), "Remaining Weights %".ljust(20, " "), "Remaining Weight")
+    for name, param in st.items():
+        if "encoder" not in name:
+            continue
+
+        if "mask_scores" in name:
+            # Rebuild the mask from the learned scores and count the weights it keeps.
+            if pruning_method == "topK":
+                mask_ones = TopKBinarizer.apply(param, threshold).sum().item()
+            elif pruning_method == "sigmoied_threshold":
+                mask_ones = ThresholdBinarizer.apply(param, threshold, True).sum().item()
+            elif pruning_method == "l0":
+                lower, upper = -0.1, 1.1
+                s = torch.sigmoid(param)
+                s_bar = s * (upper - lower) + lower
+                mask = s_bar.clamp(min=0.0, max=1.0)
+                mask_ones = (mask > 0.0).sum().item()
+            else:
+                raise ValueError("Unknown pruning method")
+            remaining_count += mask_ones
+            print(name.ljust(60, " "), str(round(100 * mask_ones / param.numel(), 3)).ljust(20, " "), str(mask_ones))
+        else:
+            # Score matrices are not model weights, so only the other entries count toward the total.
+            encoder_count += param.numel()
+            # Biases and LayerNorm parameters are always counted as remaining.
+            if "bias" in name or "LayerNorm" in name:
+                remaining_count += param.numel()
+
+    if encoder_count == 0:
+        raise ValueError("No encoder parameters found in {}".format(serialization_dir))
+
+    print("")
+    print("Remaining Weights (global) %: ", 100 * remaining_count / encoder_count)
+
+
+if __name__ == "__main__":
+    # Command-line entry point: parse the checkpoint location and pruning settings, then run `main`.
+    # NOTE: argparse %-formats help strings, so a literal percent sign must be written `%%`,
+    # otherwise `--help` fails with "unsupported format character".
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--pruning_method",
+        choices=["l0", "topK", "sigmoied_threshold"],
+        type=str,
+        required=True,
+        help="Pruning Method (l0 = L0 regularization, topK = Movement pruning, sigmoied_threshold = Soft movement pruning)",
+    )
+    parser.add_argument(
+        "--threshold",
+        type=float,
+        required=False,
+        # Each fragment ends with a space so the implicitly concatenated sentences do not run together.
+        help="For `topK`, it is the level of remaining weights (in %%) in the fine-pruned model. "
+        "For `sigmoied_threshold`, it is the threshold tau against which the (sigmoied) scores are compared. "
+        "Not needed for `l0`.",
+    )
+    parser.add_argument(
+        "--serialization_dir",
+        type=str,
+        required=True,
+        help="Folder containing the model that was previously fine-pruned",
+    )
+
+    args = parser.parse_args()
+
+    main(args)
--- a/examples/movement-pruning/emmental/init.py
+++ b/examples/movement-pruning/emmental/init.py
@@ -0,0 +1,10 @@
+# flake8: noqa
+from .configuration_bert_masked import MaskedBertConfig
+from .modeling_bert_masked import (
+    MaskedBertForMultipleChoice,
+    MaskedBertForQuestionAnswering,
+    MaskedBertForSequenceClassification,
+    MaskedBertForTokenClassification,
+    MaskedBertModel,
+)
+from .modules import *
--- a/examples/movement-pruning/emmental/configuration_bert_masked.py
+++ b/examples/movement-pruning/emmental/configuration_bert_masked.py
@@ -0,0 +1,71 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Masked BERT model configuration. It replicates the class `~transformers.BertConfig`
+and adapts it to the specificities of MaskedBert (`pruning_method`, `mask_init` and `mask_scale`)."""
+
+
+import logging
+
+from transformers.configuration_utils import PretrainedConfig
+
+
+logger = logging.getLogger(__name__)
+
+
+class MaskedBertConfig(PretrainedConfig):
+    """
+    A class replicating the `~transformers.BertConfig` with additional parameters for pruning/masking configuration.
+
+    Every constructor argument is stored unchanged as an attribute of the same name, except
+    `pad_token_id` and any extra `**kwargs`, which are forwarded to `PretrainedConfig.__init__`.
+
+    Pruning-specific arguments (all other arguments mirror `~transformers.BertConfig`):
+        pruning_method (`str`, default ``"topK"``)
+            Name of the method used to compute the masks.
+            NOTE(review): presumably forwarded to `MaskedLinear` by the model code - confirm.
+        mask_init (`str`, default ``"constant"``)
+            Name of the initialization method for the mask score matrices.
+        mask_scale (`float`, default ``0.0``)
+            Parameter of the chosen `mask_init` initialization.
+    """
+
+    model_type = "masked_bert"
+
+    def __init__(
+        self,
+        vocab_size=30522,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        intermediate_size=3072,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=512,
+        type_vocab_size=2,
+        initializer_range=0.02,
+        layer_norm_eps=1e-12,
+        pad_token_id=0,
+        pruning_method="topK",
+        mask_init="constant",
+        mask_scale=0.0,
+        **kwargs
+    ):
+        # The base class handles `pad_token_id` and all generic configuration kwargs.
+        super().__init__(pad_token_id=pad_token_id, **kwargs)
+
+        # Architecture hyper-parameters (same names as in `BertConfig`).
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.type_vocab_size = type_vocab_size
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        # Pruning/masking hyper-parameters specific to MaskedBert.
+        self.pruning_method = pruning_method
+        self.mask_init = mask_init
+        self.mask_scale = mask_scale
--- a/examples/movement-pruning/emmental/modeling_bert_masked.py
+++ b/examples/movement-pruning/emmental/modeling_bert_masked.py
--- a/examples/movement-pruning/emmental/modules/init.py
+++ b/examples/movement-pruning/emmental/modules/init.py
@@ -0,0 +1,3 @@
+# flake8: noqa
+from .binarizer import MagnitudeBinarizer, ThresholdBinarizer, TopKBinarizer
+from .masked_nn import MaskedLinear
--- a/examples/movement-pruning/emmental/modules/binarizer.py
+++ b/examples/movement-pruning/emmental/modules/binarizer.py
@@ -0,0 +1,144 @@
+# coding=utf-8
+# Copyright 2020-present, AllenAI Authors, University of Illinois Urbana-Champaign,
+# Intel Nervana Systems and the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Binarizers take a (real-valued) matrix as input and produce a binary (values in {0,1}) mask of the same shape.
+"""
+
+import torch
+from torch import autograd
+
+
+class ThresholdBinarizer(autograd.Function):
+    r"""
+    Threshold binarizer.
+    Builds a binary mask M from a real-valued matrix S where `M_{i,j} = 1` exactly when `S_{i,j} > \tau`,
+    `\tau` being a real-valued threshold. The backward pass hands the incoming gradient straight
+    back to the scores.
+
+    Implementation is inspired from:
+        https://github.com/arunmallya/piggyback
+        Piggyback: Adapting a Single Network to Multiple Tasks by Learning to Mask Weights
+        Arun Mallya, Dillon Davis, Svetlana Lazebnik
+    """
+
+    @staticmethod
+    def forward(ctx, inputs: torch.tensor, threshold: float, sigmoid: bool):
+        """
+        Args:
+            inputs (`torch.FloatTensor`)
+                Score matrix the mask is computed from.
+            threshold (`float`)
+                Real-valued threshold the scores are compared against.
+            sigmoid (`bool`)
+                When ``True`` the scores go through a sigmoid before the comparison, in which case
+                `threshold` is expected to lie between 0 and 1.
+        Returns:
+            mask (`torch.FloatTensor`)
+                Binary matrix shaped like `inputs` (1 - weight retained, 0 - weight pruned).
+        """
+        scores = torch.sigmoid(inputs) if sigmoid else inputs
+        mask = (scores > threshold).type(inputs.type())
+
+        total = inputs.numel()
+        min_kept = int(0.005 * total) + 1
+        if mask.sum() >= min_kept:
+            return mask
+
+        # Too few survivors: cap the pruning so that roughly 0.5% of the weights remain,
+        # keeping the entries that rank above the corresponding k-th smallest raw score.
+        cutoff = inputs.flatten().kthvalue(max(total - min_kept, 1)).values
+        return (inputs > cutoff).type(inputs.type())
+
+    @staticmethod
+    def backward(ctx, gradOutput):
+        # Gradient goes to `inputs` only; `threshold` and `sigmoid` receive none.
+        return gradOutput, None, None
+
+
+class TopKBinarizer(autograd.Function):
+    """
+    Top-k Binarizer.
+    Computes a binary mask M from a real value matrix S such that `M_{i,j} = 1` if and only if `S_{i,j}`
+    is among the k% highest values of S.
+
+    Implementation is inspired from:
+        https://github.com/allenai/hidden-networks
+        What's hidden in a randomly weighted neural network?
+        Vivek Ramanujan*, Mitchell Wortsman*, Aniruddha Kembhavi, Ali Farhadi, Mohammad Rastegari
+    """
+
+    @staticmethod
+    def forward(ctx, inputs: torch.tensor, threshold: float):
+        """
+        Args:
+            inputs (`torch.FloatTensor`)
+                The input matrix from which the binarizer computes the binary mask.
+            threshold (`float`)
+                The percentage of weights to keep (the rest is pruned).
+                `threshold` is a float between 0 and 1.
+        Returns:
+            mask (`torch.FloatTensor`)
+                Binary matrix of the same size as `inputs` acting as a mask (1 - the associated weight is
+                retained, 0 - the associated weight is pruned).
+        """
+        # Rank the scores in row-major order and keep the top `threshold` fraction.
+        flat_scores = inputs.flatten()
+        _, idx = flat_scores.sort(descending=True)
+        j = int(threshold * inputs.numel())
+
+        # Build the mask in a fresh contiguous buffer. Writing through `inputs.clone().flatten()`
+        # is only correct when the clone is contiguous: for a strided input (e.g. a transpose)
+        # `flatten()` returns a copy and the writes never reach the returned mask.
+        flat_mask = torch.zeros_like(flat_scores)
+        flat_mask[idx[:j]] = 1
+        return flat_mask.view(inputs.size())
+
+    @staticmethod
+    def backward(ctx, gradOutput):
+        # Gradient goes to `inputs` only; `threshold` receives none.
+        return gradOutput, None
+
+
+class MagnitudeBinarizer(object):
+    """
+    Magnitude Binarizer.
+    Computes a binary mask M from a real value matrix S such that `M_{i,j} = 1` if and only if `S_{i,j}`
+    is among the k% highest values of |S| (absolute value).
+
+    Implementation is inspired from https://github.com/NervanaSystems/distiller/blob/2291fdcc2ea642a98d4e20629acb5a9e2e04b4e6/distiller/pruning/automated_gradual_pruner.py#L24
+    """
+
+    @staticmethod
+    def apply(inputs: torch.tensor, threshold: float):
+        """
+        Args:
+            inputs (`torch.FloatTensor`)
+                The input matrix from which the binarizer computes the binary mask.
+                This input matrix is typically the weight matrix.
+            threshold (`float`)
+                The percentage of weights to keep (the rest is pruned).
+                `threshold` is a float between 0 and 1.
+        Returns:
+            mask (`torch.FloatTensor`)
+                Binary matrix of the same size as `inputs` acting as a mask (1 - the associated weight is
+                retained, 0 - the associated weight is pruned).
+        """
+        # Rank the absolute values in row-major order and keep the top `threshold` fraction.
+        flat_magnitudes = inputs.abs().flatten()
+        _, idx = flat_magnitudes.sort(descending=True)
+        j = int(threshold * inputs.numel())
+
+        # Build the mask in a fresh contiguous buffer. Writing through `inputs.clone().flatten()`
+        # is only correct when the clone is contiguous: for a strided input (e.g. a transpose)
+        # `flatten()` returns a copy and the writes never reach the returned mask.
+        # The mask is a constant: no gradient flows from it back to `inputs`.
+        flat_mask = torch.zeros_like(flat_magnitudes)
+        flat_mask[idx[:j]] = 1
+        return flat_mask.view(inputs.size())
--- a/examples/movement-pruning/emmental/modules/masked_nn.py
+++ b/examples/movement-pruning/emmental/modules/masked_nn.py
@@ -0,0 +1,107 @@
+# coding=utf-8
+# Copyright 2020-present, the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Masked Linear module: A fully connected layer that computes an adaptive binary mask on the fly.
+The mask (binary or not) is computed at each forward pass and multiplied against
+the weight matrix to prune a portion of the weights.
+The pruned weight matrix is then multiplied against the inputs (and if necessary, the bias is added).
+"""
+
+import math
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+from torch.nn import init
+
+from .binarizer import MagnitudeBinarizer, ThresholdBinarizer, TopKBinarizer
+
+
+class MaskedLinear(nn.Linear):
+    """
+    Fully Connected layer with on the fly adaptive mask.
+    If needed, a score matrix is created to store the importance of each associated weight.
+    """
+
+    def __init__(
+        self,
+        in_features: int,
+        out_features: int,
+        bias: bool = True,
+        mask_init: str = "constant",
+        mask_scale: float = 0.0,
+        pruning_method: str = "topK",
+    ):
+        """
+        Args:
+            in_features (`int`)
+                Size of each input sample
+            out_features (`int`)
+                Size of each output sample
+            bias (`bool`)
+                If set to ``False``, the layer will not learn an additive bias.
+                Default: ``True``
+            mask_init (`str`)
+                The initialization method for the score matrix if a score matrix is needed.
+                Choices: ["constant", "uniform", "kaiming"]
+                Default: ``constant``
+            mask_scale (`float`)
+                The initialization parameter for the chosen initialization method `mask_init`.
+                Default: ``0.``
+            pruning_method (`str`)
+                Method to compute the mask.
+                Choices: ["topK", "threshold", "sigmoied_threshold", "magnitude", "l0"]
+                Default: ``topK``
+        Raises:
+            ValueError: if a score matrix is needed and `mask_init` is not one of the supported choices.
+        """
+        super(MaskedLinear, self).__init__(in_features=in_features, out_features=out_features, bias=bias)
+        assert pruning_method in ["topK", "threshold", "sigmoied_threshold", "magnitude", "l0"]
+        self.pruning_method = pruning_method
+
+        # `magnitude` derives its mask from the weights themselves, so it needs no score matrix.
+        if self.pruning_method in ["topK", "threshold", "sigmoied_threshold", "l0"]:
+            self.mask_scale = mask_scale
+            self.mask_init = mask_init
+            self.mask_scores = nn.Parameter(torch.Tensor(self.weight.size()))
+            self.init_mask()
+
+    def init_mask(self):
+        """
+        Initialize `self.mask_scores` in place according to `self.mask_init`.
+
+        Raises:
+            ValueError: if `self.mask_init` is not "constant", "uniform" or "kaiming".
+        """
+        if self.mask_init == "constant":
+            init.constant_(self.mask_scores, val=self.mask_scale)
+        elif self.mask_init == "uniform":
+            init.uniform_(self.mask_scores, a=-self.mask_scale, b=self.mask_scale)
+        elif self.mask_init == "kaiming":
+            init.kaiming_uniform_(self.mask_scores, a=math.sqrt(5))
+        else:
+            # Without this branch the scores would silently keep their uninitialized memory.
+            raise ValueError("Unknown mask initialization method: {}".format(self.mask_init))
+
+    def forward(self, input: torch.tensor, threshold: float):
+        """
+        Compute the mask for the current step, apply it to the weights and run the linear layer.
+
+        Args:
+            input (`torch.FloatTensor`)
+                Input passed to the linear transformation.
+            threshold (`float`)
+                Value handed to the binarizer of the chosen pruning method (unused for ``l0``).
+        Returns:
+            The result of `F.linear` applied to `input` with the masked weights and the layer's bias.
+        """
+        # Get the mask
+        if self.pruning_method == "topK":
+            mask = TopKBinarizer.apply(self.mask_scores, threshold)
+        elif self.pruning_method in ["threshold", "sigmoied_threshold"]:
+            sig = "sigmoied" in self.pruning_method
+            mask = ThresholdBinarizer.apply(self.mask_scores, threshold, sig)
+        elif self.pruning_method == "magnitude":
+            mask = MagnitudeBinarizer.apply(self.weight, threshold)
+        elif self.pruning_method == "l0":
+            # NOTE(review): these constants look like the stretch interval and temperature of the
+            # hard-concrete relaxation used for L0 regularization - confirm against the paper.
+            l, r, b = -0.1, 1.1, 2 / 3
+            if self.training:
+                # Training uses a noisy (sampled) gate; the uniform noise is clamped away from 0 and 1
+                # so both logarithms stay finite.
+                u = torch.zeros_like(self.mask_scores).uniform_().clamp(0.0001, 0.9999)
+                s = torch.sigmoid((u.log() - (1 - u).log() + self.mask_scores) / b)
+            else:
+                # Evaluation uses the deterministic gate.
+                s = torch.sigmoid(self.mask_scores)
+            s_bar = s * (r - l) + l
+            mask = s_bar.clamp(min=0.0, max=1.0)
+        # Mask weights with computed mask
+        weight_thresholded = mask * self.weight
+        # Compute output (linear layer) with masked weights
+        return F.linear(input, weight_thresholded, self.bias)
--- a/examples/movement-pruning/masked_run_glue.py
+++ b/examples/movement-pruning/masked_run_glue.py
@@ -13,8 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" Finetuning the library models for sequence classification on GLUE (Bert, XLM, XLNet, RoBERTa, Albert, XLM-RoBERTa)."""
-
+""" Fine-pruning Masked BERT on sequence classification on GLUE."""

 import argparse
 import glob
@@ -25,17 +24,19 @@ import random

 import numpy as np
 import torch
+import torch.nn as nn
+import torch.nn.functional as F
 from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
 from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm, trange

+from emmental import MaskedBertConfig, MaskedBertForSequenceClassification
 from transformers import (
-    MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
    WEIGHTS_NAME,
    AdamW,
-    AutoConfig,
-    AutoModelForSequenceClassification,
-    AutoTokenizer,
+    BertConfig,
+    BertForSequenceClassification,
+    BertTokenizer,
    get_linear_schedule_with_warmup,
 )
 from transformers import glue_compute_metrics as compute_metrics
@@ -52,10 +53,10 @@ except ImportError:

 logger = logging.getLogger(__name__)

-MODEL_CONFIG_CLASSES = list(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING.keys())
-MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
-
-ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in MODEL_CONFIG_CLASSES), (),)
+MODEL_CLASSES = {
+    "bert": (BertConfig, BertForSequenceClassification, BertTokenizer),
+    "masked_bert": (MaskedBertConfig, MaskedBertForSequenceClassification, BertTokenizer),
+}


 def set_seed(args):
@@ -66,10 +67,47 @@ def set_seed(args):
        torch.cuda.manual_seed_all(args.seed)


-def train(args, train_dataset, model, tokenizer):
+def schedule_threshold(
+    step: int,
+    total_step: int,
+    warmup_steps: int,
+    initial_threshold: float,
+    final_threshold: float,
+    initial_warmup: int,
+    final_warmup: int,
+    final_lambda: float,
+):
+    """
+    Return the pruning threshold and the regularization weight to use at `step`.
+
+    The threshold stays at `initial_threshold` for the first `initial_warmup * warmup_steps` steps,
+    sits at `final_threshold` for the last `final_warmup * warmup_steps` steps, and follows a cubic
+    interpolation between the two in between. The regularization weight is `final_lambda` scaled by
+    the ratio of the current threshold to `final_threshold`.
+
+    Returns:
+        A `(threshold, regu_lambda)` tuple.
+    """
+    hold_initial_until = initial_warmup * warmup_steps
+    hold_final_from = total_step - final_warmup * warmup_steps
+
+    if step <= hold_initial_until:
+        threshold = initial_threshold
+    elif step > hold_final_from:
+        threshold = final_threshold
+    else:
+        # Number of steps over which the threshold actually moves.
+        scheduled_span = total_step - (final_warmup + initial_warmup) * warmup_steps
+        remaining = 1 - (step - hold_initial_until) / scheduled_span
+        threshold = final_threshold + (initial_threshold - final_threshold) * (remaining ** 3)
+
+    return threshold, final_lambda * threshold / final_threshold
+
+
+def regularization(model: nn.Module, mode: str):
+    """
+    Average regularization term over all `mask_scores` parameters of `model`.
+
+    Args:
+        model: module whose parameters named `*mask_scores*` are regularized.
+        mode: ``"l1"`` (mean of the sigmoied scores) or ``"l0"`` (mean of the sigmoid of the
+            scores shifted by ``2 / 3 * log(0.1 / 1.1)``).
+
+    Returns:
+        The per-matrix terms averaged over the number of score matrices.
+
+    Raises:
+        ValueError: if `mode` is neither ``"l1"`` nor ``"l0"``.
+    """
+    regu, counter = 0, 0
+    for name, param in model.named_parameters():
+        if "mask_scores" in name:
+            if mode == "l1":
+                regu += torch.norm(torch.sigmoid(param), p=1) / param.numel()
+            elif mode == "l0":
+                regu += torch.sigmoid(param - 2 / 3 * np.log(0.1 / 1.1)).sum() / param.numel()
+            else:
+                # The exception used to be built but never raised, so an unknown mode
+                # silently produced a zero penalty.
+                raise ValueError("Don't know this mode.")
+            counter += 1
+    return regu / counter
+
+
+def train(args, train_dataset, model, tokenizer, teacher=None):
    """ Train the model """
    if args.local_rank in [-1, 0]:
-        tb_writer = SummaryWriter()
+        tb_writer = SummaryWriter(log_dir=args.output_dir)

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
@@ -85,10 +123,27 @@ def train(args, train_dataset, model, tokenizer):
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
-            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+            "params": [p for n, p in model.named_parameters() if "mask_score" in n and p.requires_grad],
+            "lr": args.mask_scores_learning_rate,
+        },
+        {
+            "params": [
+                p
+                for n, p in model.named_parameters()
+                if "mask_score" not in n and p.requires_grad and not any(nd in n for nd in no_decay)
+            ],
+            "lr": args.learning_rate,
            "weight_decay": args.weight_decay,
        },
-        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
+        {
+            "params": [
+                p
+                for n, p in model.named_parameters()
+                if "mask_score" not in n and p.requires_grad and any(nd in n for nd in no_decay)
+            ],
+            "lr": args.learning_rate,
+            "weight_decay": 0.0,
+        },
    ]

    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
@@ -134,8 +189,14 @@ def train(args, train_dataset, model, tokenizer):
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)
+    # Distillation
+    if teacher is not None:
+        logger.info("  Training with distillation")

    global_step = 0
+    # Global TopK
+    if args.global_topk:
+        threshold_mem = None
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
@@ -170,13 +231,67 @@ def train(args, train_dataset, model, tokenizer):

            model.train()
            batch = tuple(t.to(args.device) for t in batch)
+            threshold, regu_lambda = schedule_threshold(
+                step=global_step,
+                total_step=t_total,
+                warmup_steps=args.warmup_steps,
+                final_threshold=args.final_threshold,
+                initial_threshold=args.initial_threshold,
+                final_warmup=args.final_warmup,
+                initial_warmup=args.initial_warmup,
+                final_lambda=args.final_lambda,
+            )
+            # Global TopK
+            if args.global_topk:
+                if threshold == 1.0:
+                    threshold = -1e2  # Or an indefinitely low quantity
+                else:
+                    if (threshold_mem is None) or (global_step % args.global_topk_frequency_compute == 0):
+                        # Sort all the values to get the global topK
+                        concat = torch.cat(
+                            [param.view(-1) for name, param in model.named_parameters() if "mask_scores" in name]
+                        )
+                        n = concat.numel()
+                        kth = max(n - (int(n * threshold) + 1), 1)
+                        threshold_mem = concat.kthvalue(kth).values.item()
+                        threshold = threshold_mem
+                    else:
+                        threshold = threshold_mem
            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
            if args.model_type != "distilbert":
                inputs["token_type_ids"] = (
-                    batch[2] if args.model_type in ["bert", "xlnet", "albert"] else None
+                    batch[2] if args.model_type in ["bert", "masked_bert", "xlnet", "albert"] else None
                )  # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids
+
+            if "masked" in args.model_type:
+                inputs["threshold"] = threshold
+
            outputs = model(**inputs)
-            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
+            loss, logits_stu = outputs  # model outputs are always tuple in transformers (see doc)
+
+            # Distillation loss
+            if teacher is not None:
+                if "token_type_ids" not in inputs:
+                    inputs["token_type_ids"] = None if args.teacher_type == "xlm" else batch[2]
+                with torch.no_grad():
+                    (logits_tea,) = teacher(
+                        input_ids=inputs["input_ids"],
+                        token_type_ids=inputs["token_type_ids"],
+                        attention_mask=inputs["attention_mask"],
+                    )
+
+                loss_logits = F.kl_div(
+                    input=F.log_softmax(logits_stu / args.temperature, dim=-1),
+                    target=F.softmax(logits_tea / args.temperature, dim=-1),
+                    reduction="batchmean",
+                ) * (args.temperature ** 2)
+
+                loss = args.alpha_distil * loss_logits + args.alpha_ce * loss
+
+            # Regularization
+            if args.regularization is not None:
+                regu_ = regularization(model=model, mode=args.regularization)
+                loss = loss + regu_lambda * regu_

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
@@ -200,6 +315,24 @@ def train(args, train_dataset, model, tokenizer):
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

+                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
+                    tb_writer.add_scalar("threshold", threshold, global_step)
+                    for name, param in model.named_parameters():
+                        if not param.requires_grad:
+                            continue
+                        tb_writer.add_scalar("parameter_mean/" + name, param.data.mean(), global_step)
+                        tb_writer.add_scalar("parameter_std/" + name, param.data.std(), global_step)
+                        tb_writer.add_scalar("parameter_min/" + name, param.data.min(), global_step)
+                        tb_writer.add_scalar("parameter_max/" + name, param.data.max(), global_step)
+                        tb_writer.add_scalar("grad_mean/" + name, param.grad.data.mean(), global_step)
+                        tb_writer.add_scalar("grad_std/" + name, param.grad.data.std(), global_step)
+                        if args.regularization is not None and "mask_scores" in name:
+                            if args.regularization == "l1":
+                                perc = (torch.sigmoid(param) > threshold).sum().item() / param.numel()
+                            elif args.regularization == "l0":
+                                perc = (torch.sigmoid(param - 2 / 3 * np.log(0.1 / 1.1))).sum().item() / param.numel()
+                            tb_writer.add_scalar("retained_weights_perc/" + name, perc, global_step)
+
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
@@ -216,9 +349,29 @@ def train(args, train_dataset, model, tokenizer):
                            logs[eval_key] = value

                    loss_scalar = (tr_loss - logging_loss) / args.logging_steps
-                    learning_rate_scalar = scheduler.get_lr()[0]
-                    logs["learning_rate"] = learning_rate_scalar
+                    learning_rate_scalar = scheduler.get_lr()
+                    logs["learning_rate"] = learning_rate_scalar[0]
+                    if len(learning_rate_scalar) > 1:
+                        for idx, lr in enumerate(learning_rate_scalar[1:]):
+                            logs[f"learning_rate/{idx+1}"] = lr
                    logs["loss"] = loss_scalar
+                    if teacher is not None:
+                        logs["loss/distil"] = loss_logits.item()
+                    if args.regularization is not None:
+                        logs["loss/regularization"] = regu_.item()
+                    if (teacher is not None) or (args.regularization is not None):
+                        if (teacher is not None) and (args.regularization is not None):
+                            logs["loss/instant_ce"] = (
+                                loss.item()
+                                - regu_lambda * logs["loss/regularization"]
+                                - args.alpha_distil * logs["loss/distil"]
+                            ) / args.alpha_ce
+                        elif teacher is not None:
+                            logs["loss/instant_ce"] = (
+                                loss.item() - args.alpha_distil * logs["loss/distil"]
+                            ) / args.alpha_ce
+                        else:
+                            logs["loss/instant_ce"] = loss.item() - regu_lambda * logs["loss/regularization"]
                    logging_loss = tr_loss

                    for key, value in logs.items():
@@ -259,7 +412,7 @@ def train(args, train_dataset, model, tokenizer):
 def evaluate(args, model, tokenizer, prefix=""):
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
-    eval_outputs_dirs = (args.output_dir, args.output_dir + "-MM") if args.task_name == "mnli" else (args.output_dir,)
+    eval_outputs_dirs = (args.output_dir, args.output_dir + "/MM") if args.task_name == "mnli" else (args.output_dir,)

    results = {}
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
@@ -285,6 +438,11 @@ def evaluate(args, model, tokenizer, prefix=""):
        nb_eval_steps = 0
        preds = None
        out_label_ids = None
+
+        # Global TopK
+        if args.global_topk:
+            threshold_mem = None
+
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)
@@ -293,8 +451,19 @@ def evaluate(args, model, tokenizer, prefix=""):
                inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
                if args.model_type != "distilbert":
                    inputs["token_type_ids"] = (
-                        batch[2] if args.model_type in ["bert", "xlnet", "albert"] else None
+                        batch[2] if args.model_type in ["bert", "masked_bert", "xlnet", "albert"] else None
                    )  # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids
+                if "masked" in args.model_type:
+                    inputs["threshold"] = args.final_threshold
+                    if args.global_topk:
+                        if threshold_mem is None:
+                            concat = torch.cat(
+                                [param.view(-1) for name, param in model.named_parameters() if "mask_scores" in name]
+                            )
+                            n = concat.numel()
+                            kth = max(n - (int(n * args.final_threshold) + 1), 1)
+                            threshold_mem = concat.kthvalue(kth).values.item()
+                        inputs["threshold"] = threshold_mem
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]

@@ -309,11 +478,17 @@ def evaluate(args, model, tokenizer, prefix=""):

        eval_loss = eval_loss / nb_eval_steps
        if args.output_mode == "classification":
+            from scipy.special import softmax
+
+            probs = softmax(preds, axis=-1)
+            entropy = np.exp((-probs * np.log(probs)).sum(axis=-1).mean())
            preds = np.argmax(preds, axis=1)
        elif args.output_mode == "regression":
            preds = np.squeeze(preds)
        result = compute_metrics(eval_task, preds, out_label_ids)
        results.update(result)
+        if entropy is not None:
+            result["eval_avg_entropy"] = entropy

        output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
@@ -354,14 +529,7 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
            processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
        )
        features = convert_examples_to_features(
-            examples,
-            tokenizer,
-            label_list=label_list,
-            max_length=args.max_seq_length,
-            output_mode=output_mode,
-            pad_on_left=bool(args.model_type in ["xlnet"]),  # pad on the left for xlnet
-            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
-            pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
+            examples, tokenizer, max_length=args.max_seq_length, label_list=label_list, output_mode=output_mode,
        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
@@ -399,14 +567,14 @@ def main():
        default=None,
        type=str,
        required=True,
-        help="Model type selected in the list: " + ", ".join(MODEL_TYPES),
+        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
-        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
+        help="Path to pretrained model or model identifier from huggingface.co/models",
    )
    parser.add_argument(
        "--task_name",
@@ -422,7 +590,6 @@ def main():
        required=True,
        help="The output directory where the model predictions and checkpoints will be written.",
    )
-
    # Other parameters
    parser.add_argument(
        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name",
@@ -461,13 +628,97 @@ def main():
    parser.add_argument(
        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation.",
    )
+    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
+
+    # Pruning parameters
+    parser.add_argument(
+        "--mask_scores_learning_rate",
+        default=1e-2,
+        type=float,
+        help="The Adam initial learning rate of the mask scores.",
+    )
+    parser.add_argument(
+        "--initial_threshold", default=1.0, type=float, help="Initial value of the threshold (for scheduling)."
+    )
+    parser.add_argument(
+        "--final_threshold", default=0.7, type=float, help="Final value of the threshold (for scheduling)."
+    )
+    parser.add_argument(
+        "--initial_warmup",
+        default=1,
+        type=int,
+        help="Run `initial_warmup` * `warmup_steps` steps of threshold warmup during which threshold stays"
+        "at its `initial_threshold` value (sparsity schedule).",
+    )
+    parser.add_argument(
+        "--final_warmup",
+        default=2,
+        type=int,
+        help="Run `final_warmup` * `warmup_steps` steps of threshold cool-down during which threshold stays"
+        "at its final_threshold value (sparsity schedule).",
+    )
+
+    parser.add_argument(
+        "--pruning_method",
+        default="topK",
+        type=str,
+        help="Pruning Method (l0 = L0 regularization, magnitude = Magnitude pruning, topK = Movement pruning, sigmoied_threshold = Soft movement pruning).",
+    )
+    parser.add_argument(
+        "--mask_init",
+        default="constant",
+        type=str,
+        help="Initialization method for the mask scores. Choices: constant, uniform, kaiming.",
+    )
+    parser.add_argument(
+        "--mask_scale", default=0.0, type=float, help="Initialization parameter for the chosen initialization method."
+    )
+
+    parser.add_argument("--regularization", default=None, help="Add L0 or L1 regularization to the mask scores.")
+    parser.add_argument(
+        "--final_lambda",
+        default=0.0,
+        type=float,
+        help="Regularization intensity (used in conjunction with `regulariation`.",
+    )
+
+    parser.add_argument("--global_topk", action="store_true", help="Global TopK on the Scores.")
+    parser.add_argument(
+        "--global_topk_frequency_compute",
+        default=25,
+        type=int,
+        help="Frequency at which we compute the TopK global threshold.",
+    )
+
+    # Distillation parameters (optional)
+    parser.add_argument(
+        "--teacher_type",
+        default=None,
+        type=str,
+        help="Teacher type. Teacher tokenizer and student (model) tokenizer must output the same tokenization. Only for distillation.",
+    )
+    parser.add_argument(
+        "--teacher_name_or_path",
+        default=None,
+        type=str,
+        help="Path to the already fine-tuned teacher model. Only for distillation.",
+    )
+    parser.add_argument(
+        "--alpha_ce", default=0.5, type=float, help="Cross entropy loss linear weight. Only for distillation."
+    )
+    parser.add_argument(
+        "--alpha_distil", default=0.5, type=float, help="Distillation loss linear weight. Only for distillation."
+    )
+    parser.add_argument(
+        "--temperature", default=2.0, type=float, help="Distillation temperature. Only for distillation."
+    )
+
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )
-    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
@@ -482,8 +733,8 @@ def main():
    )
    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")

-    parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.")
-    parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.")
+    parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.")
+    parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action="store_true",
@@ -511,10 +762,13 @@ def main():
        "See details at https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
-    parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
-    parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
+
    args = parser.parse_args()

+    # Regularization
+    if args.regularization == "null":
+        args.regularization = None
+
    if (
        os.path.exists(args.output_dir)
        and os.listdir(args.output_dir)
@@ -522,20 +776,9 @@ def main():
        and not args.overwrite_output_dir
    ):
        raise ValueError(
-            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
-                args.output_dir
-            )
+            f"Output directory ({args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

-    # Setup distant debugging if needed
-    if args.server_ip and args.server_port:
-        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
-        import ptvsd
-
-        print("Waiting for debugger attach")
-        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
-        ptvsd.wait_for_attach()
-
    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
@@ -579,24 +822,44 @@ def main():
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    args.model_type = args.model_type.lower()
-    config = AutoConfig.from_pretrained(
+    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
+    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=args.task_name,
        cache_dir=args.cache_dir if args.cache_dir else None,
+        pruning_method=args.pruning_method,
+        mask_init=args.mask_init,
+        mask_scale=args.mask_scale,
    )
-    tokenizer = AutoTokenizer.from_pretrained(
+    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
-        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None,
+        do_lower_case=args.do_lower_case,
    )
-    model = AutoModelForSequenceClassification.from_pretrained(
+    model = model_class.from_pretrained(
        args.model_name_or_path,
        from_tf=bool(".ckpt" in args.model_name_or_path),
        config=config,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )

+    if args.teacher_type is not None:
+        assert args.teacher_name_or_path is not None
+        assert args.alpha_distil > 0.0
+        assert args.alpha_distil + args.alpha_ce > 0.0
+        teacher_config_class, teacher_model_class, _ = MODEL_CLASSES[args.teacher_type]
+        teacher_config = teacher_config_class.from_pretrained(args.teacher_name_or_path)
+        teacher = teacher_model_class.from_pretrained(
+            args.teacher_name_or_path,
+            from_tf=False,
+            config=teacher_config,
+            cache_dir=args.cache_dir if args.cache_dir else None,
+        )
+        teacher.to(args.device)
+    else:
+        teacher = None
+
    if args.local_rank == 0:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

@@ -607,7 +870,7 @@ def main():
    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
-        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
+        global_step, tr_loss = train(args, train_dataset, model, tokenizer, teacher=teacher)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
@@ -629,14 +892,14 @@ def main():
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
-        model = AutoModelForSequenceClassification.from_pretrained(args.output_dir)
-        tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
+        model = model_class.from_pretrained(args.output_dir)
+        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
        model.to(args.device)

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
-        tokenizer = AutoTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
+        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
@@ -648,7 +911,7 @@ def main():
            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""

-            model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
+            model = model_class.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, tokenizer, prefix=prefix)
            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
--- a/examples/movement-pruning/masked_run_squad.py
+++ b/examples/movement-pruning/masked_run_squad.py
--- a/examples/movement-pruning/requirements.txt
+++ b/examples/movement-pruning/requirements.txt
@@ -0,0 +1,6 @@
+torch>=1.4.0
+-e git+https://github.com/huggingface/transformers.git@352d5472b0c1dec0f420d606d16747d851b4bda8#egg=transformers
+knockknock>=0.1.8.1
+h5py>=2.10.0
+numpy>=1.18.2
+scipy>=1.4.1
--- a/examples/multiple-choice/README.md
+++ b/examples/multiple-choice/README.md
@@ -0,0 +1,56 @@
+## Multiple Choice
+
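+Based on the script [`run_multiple_choice.py`](https://github.com/huggingface/transformers/blob/master/examples/multiple-choice/run_multiple_choice.py).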
+
+#### Fine-tuning on SWAG
+Download [swag](https://github.com/rowanz/swagaf/tree/master/data) data
+
+```bash
+# Training on 4 Tesla V100 (16GB) GPUs
+export SWAG_DIR=/path/to/swag_data_dir
+python ./examples/multiple-choice/run_multiple_choice.py \
+--task_name swag \
+--model_name_or_path roberta-base \
+--do_train \
+--do_eval \
+--data_dir $SWAG_DIR \
+--learning_rate 5e-5 \
+--num_train_epochs 3 \
+--max_seq_length 80 \
+--output_dir models_bert/swag_base \
+--per_gpu_eval_batch_size=16 \
+--per_device_train_batch_size=16 \
+--gradient_accumulation_steps 2 \
+--overwrite_output_dir
+```
+Training with the defined hyper-parameters yields the following results:
+```
+***** Eval results *****
+eval_acc = 0.8338998300509847
+eval_loss = 0.44457291918821606
+```
+
+
+## TensorFlow
+
+```bash
+export SWAG_DIR=/path/to/swag_data_dir
+python ./examples/multiple-choice/run_tf_multiple_choice.py \
+--task_name swag \
+--model_name_or_path bert-base-cased \
+--do_train \
+--do_eval \
+--data_dir $SWAG_DIR \
+--learning_rate 5e-5 \
+--num_train_epochs 3 \
+--max_seq_length 80 \
+--output_dir models_bert/swag_base \
+--per_gpu_eval_batch_size=16 \
+--per_device_train_batch_size=16 \
+--logging_dir logs \
+--gradient_accumulation_steps 2 \
+--overwrite_output_dir
+```
+
+## Run it in Colab
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ViktorAlm/notebooks/blob/master/MPC_GPU_Demo_for_TF_and_PT.ipynb)
--- a/examples/multiple-choice/run_multiple_choice.py
+++ b/examples/multiple-choice/run_multiple_choice.py
@@ -0,0 +1,229 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Finetuning the library models for multiple choice (Bert, Roberta, XLNet)."""
+
+
+import logging
+import os
+from dataclasses import dataclass, field
+from typing import Dict, Optional
+
+import numpy as np
+
+from transformers import (
+    AutoConfig,
+    AutoModelForMultipleChoice,
+    AutoTokenizer,
+    EvalPrediction,
+    HfArgumentParser,
+    Trainer,
+    TrainingArguments,
+    set_seed,
+)
+from utils_multiple_choice import MultipleChoiceDataset, Split, processors
+
+
+logger = logging.getLogger(__name__)
+
+
+def simple_accuracy(preds, labels):  # accuracy = mean of the elementwise `preds == labels` comparison
+    return (preds == labels).mean()  # `==` must yield an object with .mean(); compute_metrics passes np.argmax output
+
+
+@dataclass
+class ModelArguments:
+    """
+    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
+    """
+
+    model_name_or_path: str = field(  # the only field declared without a default
+        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
+    )
+    config_name: Optional[str] = field(  # main() falls back to model_name_or_path when this is falsy
+        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
+    )
+    tokenizer_name: Optional[str] = field(  # main() falls back to model_name_or_path when this is falsy
+        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
+    )
+    cache_dir: Optional[str] = field(  # forwarded as cache_dir= to each from_pretrained() call in main()
+        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
+    )
+
+
+@dataclass
+class DataTrainingArguments:
+    """
+    Arguments pertaining to what data we are going to input our model for training and eval.
+    """
+
+    task_name: str = field(metadata={"help": "The name of the task to train on: " + ", ".join(processors.keys())})
+    data_dir: str = field(metadata={"help": "Should contain the data files for the task."})
+    max_seq_length: int = field(  # main() passes this to MultipleChoiceDataset as max_seq_length
+        default=128,
+        metadata={
+            "help": "The maximum total input sequence length after tokenization. Sequences longer "
+            "than this will be truncated, sequences shorter will be padded."
+        },
+    )
+    overwrite_cache: bool = field(  # main() passes this to MultipleChoiceDataset as overwrite_cache
+        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
+    )
+
+
+def main():  # CLI entry point: parse args, build model/data/Trainer, optionally train/evaluate; returns `results`
+    # See all possible arguments in src/transformers/training_args.py
+    # or by passing the --help flag to this script.
+    # We now keep distinct sets of args, for a cleaner separation of concerns.
+
+    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
+    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+
+    if (  # refuse to train into an existing, non-empty output dir unless overwriting was requested
+        os.path.exists(training_args.output_dir)
+        and os.listdir(training_args.output_dir)
+        and training_args.do_train
+        and not training_args.overwrite_output_dir
+    ):
+        raise ValueError(
+            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
+        )
+
+    # Setup logging
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,  # INFO only for rank -1 or 0
+    )
+    logger.warning(
+        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+        training_args.local_rank,
+        training_args.device,
+        training_args.n_gpu,
+        bool(training_args.local_rank != -1),
+        training_args.fp16,
+    )
+    logger.info("Training/evaluation parameters %s", training_args)
+
+    # Set seed
+    set_seed(training_args.seed)
+
+    try:
+        processor = processors[data_args.task_name]()
+        label_list = processor.get_labels()
+        num_labels = len(label_list)
+    except KeyError:  # e.g. task_name is not a key of `processors`
+        raise ValueError("Task not found: %s" % (data_args.task_name))
+
+    # Load pretrained model and tokenizer
+    #
+    # Distributed training:
+    # The .from_pretrained methods guarantee that only one local process can concurrently
+    # download model & vocab.
+
+    config = AutoConfig.from_pretrained(
+        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
+        num_labels=num_labels,
+        finetuning_task=data_args.task_name,
+        cache_dir=model_args.cache_dir,
+    )
+    tokenizer = AutoTokenizer.from_pretrained(
+        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
+        cache_dir=model_args.cache_dir,
+    )
+    model = AutoModelForMultipleChoice.from_pretrained(
+        model_args.model_name_or_path,
+        from_tf=bool(".ckpt" in model_args.model_name_or_path),  # from_tf is set when the path contains ".ckpt"
+        config=config,
+        cache_dir=model_args.cache_dir,
+    )
+
+    # Get datasets (each one is built only when its do_train / do_eval flag is set, otherwise it is None)
+    train_dataset = (
+        MultipleChoiceDataset(
+            data_dir=data_args.data_dir,
+            tokenizer=tokenizer,
+            task=data_args.task_name,
+            max_seq_length=data_args.max_seq_length,
+            overwrite_cache=data_args.overwrite_cache,
+            mode=Split.train,
+        )
+        if training_args.do_train
+        else None
+    )
+    eval_dataset = (
+        MultipleChoiceDataset(
+            data_dir=data_args.data_dir,
+            tokenizer=tokenizer,
+            task=data_args.task_name,
+            max_seq_length=data_args.max_seq_length,
+            overwrite_cache=data_args.overwrite_cache,
+            mode=Split.dev,
+        )
+        if training_args.do_eval
+        else None
+    )
+
+    def compute_metrics(p: EvalPrediction) -> Dict:  # argmax over axis 1 of p.predictions, scored against p.label_ids
+        preds = np.argmax(p.predictions, axis=1)
+        return {"acc": simple_accuracy(preds, p.label_ids)}
+
+    # Initialize our Trainer
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=train_dataset,
+        eval_dataset=eval_dataset,
+        compute_metrics=compute_metrics,
+    )
+
+    # Training (model_path is passed only when model_name_or_path is an existing local directory)
+    if training_args.do_train:
+        trainer.train(
+            model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
+        )
+        trainer.save_model()
+        # For convenience, we also re-save the tokenizer to the same directory,
+        # so that you can share your model easily on huggingface.co/models =)
+        if trainer.is_world_master():
+            tokenizer.save_pretrained(training_args.output_dir)
+
+    # Evaluation
+    results = {}  # stays empty unless do_eval is set and this process is the world master
+    if training_args.do_eval:
+        logger.info("*** Evaluate ***")
+
+        result = trainer.evaluate()
+
+        output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
+        if trainer.is_world_master():  # only this process writes eval_results.txt and fills `results`
+            with open(output_eval_file, "w") as writer:
+                logger.info("***** Eval results *****")
+                for key, value in result.items():
+                    logger.info("  %s = %s", key, value)
+                    writer.write("%s = %s\n" % (key, value))
+
+                results.update(result)
+
+    return results
+
+
+def _mp_fn(index):  # `index` is accepted but never used; each call simply runs main()
+    # For xla_spawn (TPUs)
+    main()
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/multiple-choice/run_tf_multiple_choice.py
+++ b/examples/multiple-choice/run_tf_multiple_choice.py
@@ -0,0 +1,211 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Finetuning the library models for multiple choice (Bert, Roberta, XLNet)."""
+
+
+import logging
+import os
+from dataclasses import dataclass, field
+from typing import Dict, Optional
+
+import numpy as np
+
+from transformers import (
+    AutoConfig,
+    AutoTokenizer,
+    EvalPrediction,
+    HfArgumentParser,
+    TFAutoModelForMultipleChoice,
+    TFTrainer,
+    TFTrainingArguments,
+    set_seed,
+)
+from utils_multiple_choice import Split, TFMultipleChoiceDataset, processors
+
+
+logger = logging.getLogger(__name__)
+
+
+def simple_accuracy(preds, labels):  # accuracy = mean of the elementwise `preds == labels` comparison
+    return (preds == labels).mean()  # `==` must yield an object with .mean(); compute_metrics passes np.argmax output
+
+
+@dataclass
+class ModelArguments:
+    """
+    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
+    """
+
+    model_name_or_path: str = field(  # the only field declared without a default
+        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
+    )
+    config_name: Optional[str] = field(  # main() falls back to model_name_or_path when this is falsy
+        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
+    )
+    tokenizer_name: Optional[str] = field(  # main() falls back to model_name_or_path when this is falsy
+        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
+    )
+    cache_dir: Optional[str] = field(  # forwarded as cache_dir= to each from_pretrained() call in main()
+        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
+    )
+
+
+@dataclass
+class DataTrainingArguments:
+    """
+    Arguments pertaining to what data we are going to input our model for training and eval.
+    """
+
+    task_name: str = field(metadata={"help": "The name of the task to train on: " + ", ".join(processors.keys())})
+    data_dir: str = field(metadata={"help": "Should contain the data files for the task."})
+    max_seq_length: int = field(  # main() passes this to TFMultipleChoiceDataset as max_seq_length
+        default=128,
+        metadata={
+            "help": "The maximum total input sequence length after tokenization. Sequences longer "
+            "than this will be truncated, sequences shorter will be padded."
+        },
+    )
+    overwrite_cache: bool = field(  # main() passes this to TFMultipleChoiceDataset as overwrite_cache
+        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
+    )
+
+
+def main():  # CLI entry point: parse args, build model/data/TFTrainer, optionally train/evaluate; returns `results`
+    # See all possible arguments in src/transformers/training_args.py
+    # or by passing the --help flag to this script.
+    # We now keep distinct sets of args, for a cleaner separation of concerns.
+
+    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments))
+    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+
+    if (  # refuse to train into an existing, non-empty output dir unless overwriting was requested
+        os.path.exists(training_args.output_dir)
+        and os.listdir(training_args.output_dir)
+        and training_args.do_train
+        and not training_args.overwrite_output_dir
+    ):
+        raise ValueError(
+            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
+        )
+
+    # Setup logging
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.warning(
+        "device: %s, n_gpu: %s, 16-bits training: %s", training_args.device, training_args.n_gpu, training_args.fp16,
+    )
+    logger.info("Training/evaluation parameters %s", training_args)
+
+    # Set seed
+    set_seed(training_args.seed)
+
+    try:
+        processor = processors[data_args.task_name]()
+        label_list = processor.get_labels()
+        num_labels = len(label_list)
+    except KeyError:  # e.g. task_name is not a key of `processors`
+        raise ValueError("Task not found: %s" % (data_args.task_name))
+
+    # Load pretrained model and tokenizer
+    #
+    # Distributed training:
+    # The .from_pretrained methods guarantee that only one local process can concurrently
+    # download model & vocab.
+    config = AutoConfig.from_pretrained(
+        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
+        num_labels=num_labels,
+        finetuning_task=data_args.task_name,
+        cache_dir=model_args.cache_dir,
+    )
+    tokenizer = AutoTokenizer.from_pretrained(
+        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
+        cache_dir=model_args.cache_dir,
+    )
+    with training_args.strategy.scope():  # presumably a tf.distribute strategy scope; confirm in TFTrainingArguments
+        model = TFAutoModelForMultipleChoice.from_pretrained(
+            model_args.model_name_or_path,
+            from_pt=bool(".bin" in model_args.model_name_or_path),  # from_pt is set when the path contains ".bin"
+            config=config,
+            cache_dir=model_args.cache_dir,
+        )
+    # Get datasets (each one is built only when its do_train / do_eval flag is set, otherwise it is None)
+    train_dataset = (
+        TFMultipleChoiceDataset(
+            data_dir=data_args.data_dir,
+            tokenizer=tokenizer,
+            task=data_args.task_name,
+            max_seq_length=data_args.max_seq_length,
+            overwrite_cache=data_args.overwrite_cache,
+            mode=Split.train,
+        )
+        if training_args.do_train
+        else None
+    )
+    eval_dataset = (
+        TFMultipleChoiceDataset(
+            data_dir=data_args.data_dir,
+            tokenizer=tokenizer,
+            task=data_args.task_name,
+            max_seq_length=data_args.max_seq_length,
+            overwrite_cache=data_args.overwrite_cache,
+            mode=Split.dev,
+        )
+        if training_args.do_eval
+        else None
+    )
+
+    def compute_metrics(p: EvalPrediction) -> Dict:  # argmax over axis 1 of p.predictions, scored against p.label_ids
+        preds = np.argmax(p.predictions, axis=1)
+        return {"acc": simple_accuracy(preds, p.label_ids)}
+
+    # Initialize our Trainer (it receives each wrapper's get_dataset() result, or None when the split is disabled)
+    trainer = TFTrainer(
+        model=model,
+        args=training_args,
+        train_dataset=train_dataset.get_dataset() if train_dataset else None,
+        eval_dataset=eval_dataset.get_dataset() if eval_dataset else None,
+        compute_metrics=compute_metrics,
+    )
+
+    # Training
+    if training_args.do_train:
+        trainer.train()
+        trainer.save_model()
+        tokenizer.save_pretrained(training_args.output_dir)  # tokenizer is saved next to the model in output_dir
+    # Evaluation
+    results = {}  # stays empty unless do_eval is set
+    if training_args.do_eval:
+        logger.info("*** Evaluate ***")
+
+        result = trainer.evaluate()
+
+        output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
+        with open(output_eval_file, "w") as writer:  # one "key = value" line per metric
+            logger.info("***** Eval results *****")
+            for key, value in result.items():
+                logger.info("  %s = %s", key, value)
+                writer.write("%s = %s\n" % (key, value))
+
+            results.update(result)
+
+    return results
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/multiple-choice/utils_multiple_choice.py
+++ b/examples/multiple-choice/utils_multiple_choice.py
@@ -13,7 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" Multiple choice fine-tuning: utilities to work with multiple choice tasks of reading comprehension  """
+""" Multiple choice fine-tuning: utilities to work with multiple choice tasks of reading comprehension """


 import csv
@@ -21,48 +21,216 @@ import glob
 import json
 import logging
 import os
-from typing import List
+from dataclasses import dataclass
+from enum import Enum
+from typing import List, Optional

 import tqdm
+from filelock import FileLock

-from transformers import PreTrainedTokenizer
+from transformers import PreTrainedTokenizer, is_tf_available, is_torch_available


 logger = logging.getLogger(__name__)


-class InputExample(object):
-    """A single training/test example for multiple choice"""
+@dataclass(frozen=True)
+class InputExample:
+    """
+    A single training/test example for multiple choice

-    def __init__(self, example_id, question, contexts, endings, label=None):
-        """Constructs a InputExample.
+    Args:
+        example_id: Unique id for the example.
+        question: string. The untokenized text of the second sequence (question).
+        contexts: list of str. The untokenized text of the first sequence (context of corresponding question).
+        endings: list of str. multiple choice's options. Its length must be equal to contexts' length.
+        label: (Optional) string. The label of the example. This should be
+        specified for train and dev examples, but not for test examples.
+    """

-        Args:
-            example_id: Unique id for the example.
-            contexts: list of str. The untokenized text of the first sequence (context of corresponding question).
-            question: string. The untokenized text of the second sequence (question).
-            endings: list of str. multiple choice's options. Its length must be equal to contexts' length.
-            label: (Optional) string. The label of the example. This should be
-            specified for train and dev examples, but not for test examples.
+    example_id: str
+    question: str
+    contexts: List[str]
+    endings: List[str]
+    label: Optional[str] = None
+
+
+@dataclass(frozen=True)
+class InputFeatures:
+    """
+    A single set of features of data.
+    Property names are the same names as the corresponding inputs to a model.
+    """
+
+    example_id: str
+    input_ids: List[List[int]]
+    attention_mask: Optional[List[List[int]]]
+    token_type_ids: Optional[List[List[int]]]
+    label: Optional[int]
+
+
+class Split(Enum):
+    train = "train"
+    dev = "dev"
+    test = "test"
+
+
+if is_torch_available():
+    import torch
+    from torch.utils.data.dataset import Dataset
+
+    class MultipleChoiceDataset(Dataset):
        """
-        self.example_id = example_id
-        self.question = question
-        self.contexts = contexts
-        self.endings = endings
-        self.label = label
+        This will be superseded by a framework-agnostic approach
+        soon.
+        """
+
+        features: List[InputFeatures]
+
+        def __init__(
+            self,
+            data_dir: str,
+            tokenizer: PreTrainedTokenizer,
+            task: str,
+            max_seq_length: Optional[int] = None,
+            overwrite_cache=False,
+            mode: Split = Split.train,
+        ):
+            processor = processors[task]()
+
+            cached_features_file = os.path.join(
+                data_dir,
+                "cached_{}_{}_{}_{}".format(mode.value, tokenizer.__class__.__name__, str(max_seq_length), task,),
+            )
+
+            # Make sure only the first process in distributed training processes the dataset,
+            # and the others will use the cache.
+            lock_path = cached_features_file + ".lock"
+            with FileLock(lock_path):
+                # Reuse the cached features unless the caller asked to rebuild them.
+                if os.path.exists(cached_features_file) and not overwrite_cache:
+                    logger.info(f"Loading features from cached file {cached_features_file}")
+                    self.features = torch.load(cached_features_file)
+                else:
+                    logger.info(f"Creating features from dataset file at {data_dir}")
+                    label_list = processor.get_labels()
+                    if mode == Split.dev:
+                        examples = processor.get_dev_examples(data_dir)
+                    elif mode == Split.test:
+                        examples = processor.get_test_examples(data_dir)
+                    else:
+                        examples = processor.get_train_examples(data_dir)
+                    logger.info("%s examples: %s", mode.value, len(examples))
+                    # TODO clean up all this to leverage built-in features of tokenizers
+                    self.features = convert_examples_to_features(
+                        examples,
+                        label_list,
+                        max_seq_length,
+                        tokenizer,
+                        pad_on_left=bool(tokenizer.padding_side == "left"),
+                        pad_token=tokenizer.pad_token_id,
+                        pad_token_segment_id=tokenizer.pad_token_type_id,
+                    )
+                    logger.info("Saving features into cached file %s", cached_features_file)
+                    torch.save(self.features, cached_features_file)
+
+        def __len__(self):
+            return len(self.features)
+
+        def __getitem__(self, i) -> InputFeatures:
+            return self.features[i]


-class InputFeatures(object):
-    def __init__(self, example_id, choices_features, label):
-        self.example_id = example_id
-        self.choices_features = [
-            {"input_ids": input_ids, "input_mask": input_mask, "segment_ids": segment_ids}
-            for input_ids, input_mask, segment_ids in choices_features
-        ]
-        self.label = label
+if is_tf_available():
+    import tensorflow as tf
+
+    class TFMultipleChoiceDataset:
+        """
+        This will be superseded by a framework-agnostic approach
+        soon.
+        """
+
+        features: List[InputFeatures]
+
+        def __init__(
+            self,
+            data_dir: str,
+            tokenizer: PreTrainedTokenizer,
+            task: str,
+            max_seq_length: Optional[int] = 128,
+            overwrite_cache=False,
+            mode: Split = Split.train,
+        ):
+            processor = processors[task]()
+
+            logger.info(f"Creating features from dataset file at {data_dir}")
+            label_list = processor.get_labels()
+            if mode == Split.dev:
+                examples = processor.get_dev_examples(data_dir)
+            elif mode == Split.test:
+                examples = processor.get_test_examples(data_dir)
+            else:
+                examples = processor.get_train_examples(data_dir)
+            logger.info("%s examples: %s", mode.value, len(examples))
+            # TODO clean up all this to leverage built-in features of tokenizers
+            self.features = convert_examples_to_features(
+                examples,
+                label_list,
+                max_seq_length,
+                tokenizer,
+                pad_on_left=bool(tokenizer.padding_side == "left"),
+                pad_token=tokenizer.pad_token_id,
+                pad_token_segment_id=tokenizer.pad_token_type_id,
+            )
+            # Generator backing the tf.data pipeline: yields (model inputs dict, label) per feature.
+            def gen():
+                for (ex_index, ex) in tqdm.tqdm(enumerate(self.features), desc="convert examples to features"):
+                    if ex_index % 10000 == 0:
+                        logger.info("Writing example %d of %d" % (ex_index, len(examples)))
+
+                    yield (
+                        {
+                            "example_id": 0,  # placeholder: ex.example_id is a str, the declared dtype is tf.int32
+                            "input_ids": ex.input_ids,
+                            "attention_mask": ex.attention_mask,
+                            "token_type_ids": ex.token_type_ids,
+                        },
+                        ex.label,
+                    )
+
+            self.dataset = tf.data.Dataset.from_generator(
+                gen,
+                (
+                    {
+                        "example_id": tf.int32,
+                        "input_ids": tf.int32,
+                        "attention_mask": tf.int32,
+                        "token_type_ids": tf.int32,
+                    },
+                    tf.int64,
+                ),
+                (
+                    {
+                        "example_id": tf.TensorShape([]),
+                        "input_ids": tf.TensorShape([None, None]),
+                        "attention_mask": tf.TensorShape([None, None]),
+                        "token_type_ids": tf.TensorShape([None, None]),
+                    },
+                    tf.TensorShape([]),
+                ),
+            )
+
+        def get_dataset(self):
+            return self.dataset
+
+        def __len__(self):
+            return len(self.features)
+
+        def __getitem__(self, i) -> InputFeatures:
+            return self.features[i]


-class DataProcessor(object):
+class DataProcessor:
    """Base class for data converters for multiple choice data sets."""

    def get_train_examples(self, data_dir):
@@ -149,6 +317,52 @@ class RaceProcessor(DataProcessor):
        return examples


+class SynonymProcessor(DataProcessor):
+    """Processor for the Synonym data set."""
+
+    def get_train_examples(self, data_dir):
+        """See base class."""
+        logger.info("LOOKING AT {} train".format(data_dir))
+        return self._create_examples(self._read_csv(os.path.join(data_dir, "mctrain.csv")), "train")
+
+    def get_dev_examples(self, data_dir):
+        """See base class."""
+        logger.info("LOOKING AT {} dev".format(data_dir))
+        return self._create_examples(self._read_csv(os.path.join(data_dir, "mchp.csv")), "dev")
+
+    def get_test_examples(self, data_dir):
+        """See base class."""
+        logger.info("LOOKING AT {} test".format(data_dir))
+
+        return self._create_examples(self._read_csv(os.path.join(data_dir, "mctest.csv")), "test")
+
+    def get_labels(self):
+        """See base class."""
+        return ["0", "1", "2", "3", "4"]
+
+    def _read_csv(self, input_file):
+        with open(input_file, "r", encoding="utf-8") as f:
+            return list(csv.reader(f))
+
+    def _create_examples(self, lines: List[List[str]], type: str):
+        """Creates one five-choice InputExample per CSV row (used for train, dev and test)."""
+        # Row layout read below: id, context, five candidate endings, label.
+        examples = [
+            InputExample(
+                example_id=line[0],
+                question="",  # no separate question text
+                # is used here; the same context
+                # is repeated for every choice.
+                contexts=[line[1], line[1], line[1], line[1], line[1]],
+                endings=[line[2], line[3], line[4], line[5], line[6]],
+                label=line[7],
+            )
+            for line in lines  # every row is used; no header row is skipped
+        ]
+
+        return examples
+
+
 class SwagProcessor(DataProcessor):
    """Processor for the SWAG data set."""

@@ -311,7 +525,7 @@ def convert_examples_to_features(
    for (ex_index, example) in tqdm.tqdm(enumerate(examples), desc="convert examples to features"):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d of %d" % (ex_index, len(examples)))
-        choices_features = []
+        choices_inputs = []
        for ending_idx, (context, ending) in enumerate(zip(example.contexts, example.endings)):
            text_a = context
            if example.question.find("_") != -1:
@@ -320,7 +534,14 @@ def convert_examples_to_features(
            else:
                text_b = example.question + " " + ending

-            inputs = tokenizer.encode_plus(text_a, text_b, add_special_tokens=True, max_length=max_length,)
+            inputs = tokenizer.encode_plus(
+                text_a,
+                text_b,
+                add_special_tokens=True,
+                max_length=max_length,
+                pad_to_max_length=True,
+                return_overflowing_tokens=True,
+            )
            if "num_truncated_tokens" in inputs and inputs["num_truncated_tokens"] > 0:
                logger.info(
                    "Attention! you are cropping tokens (swag task is ok). "
@@ -328,46 +549,34 @@ def convert_examples_to_features(
                    "you need to try to use a bigger max seq length!"
                )

-            input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
-
-            # The mask has 1 for real tokens and 0 for padding tokens. Only real
-            # tokens are attended to.
-            attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
-
-            # Zero-pad up to the sequence length.
-            padding_length = max_length - len(input_ids)
-            if pad_on_left:
-                input_ids = ([pad_token] * padding_length) + input_ids
-                attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
-                token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
-            else:
-                input_ids = input_ids + ([pad_token] * padding_length)
-                attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
-                token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)
-
-            assert len(input_ids) == max_length
-            assert len(attention_mask) == max_length
-            assert len(token_type_ids) == max_length
-            choices_features.append((input_ids, attention_mask, token_type_ids))
+            choices_inputs.append(inputs)

        label = label_map[example.label]

-        if ex_index < 2:
-            logger.info("*** Example ***")
-            logger.info("race_id: {}".format(example.example_id))
-            for choice_idx, (input_ids, attention_mask, token_type_ids) in enumerate(choices_features):
-                logger.info("choice: {}".format(choice_idx))
-                logger.info("input_ids: {}".format(" ".join(map(str, input_ids))))
-                logger.info("attention_mask: {}".format(" ".join(map(str, attention_mask))))
-                logger.info("token_type_ids: {}".format(" ".join(map(str, token_type_ids))))
-                logger.info("label: {}".format(label))
+        input_ids = [x["input_ids"] for x in choices_inputs]
+        attention_mask = (
+            [x["attention_mask"] for x in choices_inputs] if "attention_mask" in choices_inputs[0] else None
+        )
+        token_type_ids = (
+            [x["token_type_ids"] for x in choices_inputs] if "token_type_ids" in choices_inputs[0] else None
+        )

-        features.append(InputFeatures(example_id=example.example_id, choices_features=choices_features, label=label,))
+        features.append(
+            InputFeatures(
+                example_id=example.example_id,
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                token_type_ids=token_type_ids,
+                label=label,
+            )
+        )
+
+    for f in features[:2]:
+        logger.info("*** Example ***")
+        logger.info("feature: %s" % f)

    return features


-processors = {"race": RaceProcessor, "swag": SwagProcessor, "arc": ArcProcessor}
-
-
-MULTIPLE_CHOICE_TASKS_NUM_LABELS = {"race", 4, "swag", 4, "arc", 4}
+processors = {"race": RaceProcessor, "swag": SwagProcessor, "arc": ArcProcessor, "syn": SynonymProcessor}
+MULTIPLE_CHOICE_TASKS_NUM_LABELS = {"race": 4, "swag": 4, "arc": 4, "syn": 5}
--- a/examples/ner/run_ner.py
+++ b/examples/ner/run_ner.py
@@ -1,677 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Fine-tuning the library models for named entity recognition on CoNLL-2003 (Bert or Roberta). """
-
-
-import argparse
-import glob
-import logging
-import os
-import random
-
-import numpy as np
-import torch
-from seqeval.metrics import f1_score, precision_score, recall_score
-from torch.nn import CrossEntropyLoss
-from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
-from torch.utils.data.distributed import DistributedSampler
-from tqdm import tqdm, trange
-
-from transformers import (
-    MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
-    WEIGHTS_NAME,
-    AdamW,
-    AutoConfig,
-    AutoModelForTokenClassification,
-    AutoTokenizer,
-    get_linear_schedule_with_warmup,
-)
-from utils_ner import convert_examples_to_features, get_labels, read_examples_from_file
-
-
-try:
-    from torch.utils.tensorboard import SummaryWriter
-except ImportError:
-    from tensorboardX import SummaryWriter
-
-
-logger = logging.getLogger(__name__)
-
-MODEL_CONFIG_CLASSES = list(MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING.keys())
-MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
-
-ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in MODEL_CONFIG_CLASSES), ())
-
-TOKENIZER_ARGS = ["do_lower_case", "strip_accents", "keep_accents", "use_fast"]
-
-
-def set_seed(args):
-    random.seed(args.seed)
-    np.random.seed(args.seed)
-    torch.manual_seed(args.seed)
-    if args.n_gpu > 0:
-        torch.cuda.manual_seed_all(args.seed)
-
-
-def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
-    """ Train the model """
-    if args.local_rank in [-1, 0]:
-        tb_writer = SummaryWriter()
-
-    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
-    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
-    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
-
-    if args.max_steps > 0:
-        t_total = args.max_steps
-        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
-    else:
-        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
-
-    # Prepare optimizer and schedule (linear warmup and decay)
-    no_decay = ["bias", "LayerNorm.weight"]
-    optimizer_grouped_parameters = [
-        {
-            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
-            "weight_decay": args.weight_decay,
-        },
-        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
-    ]
-    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = get_linear_schedule_with_warmup(
-        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
-    )
-
-    # Check if saved optimizer or scheduler states exist
-    if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
-        os.path.join(args.model_name_or_path, "scheduler.pt")
-    ):
-        # Load in optimizer and scheduler states
-        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
-        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))
-
-    if args.fp16:
-        try:
-            from apex import amp
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
-
-    # multi-gpu training (should be after apex fp16 initialization)
-    if args.n_gpu > 1:
-        model = torch.nn.DataParallel(model)
-
-    # Distributed training (should be after apex fp16 initialization)
-    if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(
-            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
-        )
-
-    # Train!
-    logger.info("***** Running training *****")
-    logger.info("  Num examples = %d", len(train_dataset))
-    logger.info("  Num Epochs = %d", args.num_train_epochs)
-    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
-    logger.info(
-        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
-        args.train_batch_size
-        * args.gradient_accumulation_steps
-        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
-    )
-    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
-    logger.info("  Total optimization steps = %d", t_total)
-
-    global_step = 0
-    epochs_trained = 0
-    steps_trained_in_current_epoch = 0
-    # Check if continuing training from a checkpoint
-    if os.path.exists(args.model_name_or_path):
-        # set global_step to gobal_step of last saved checkpoint from model path
-        try:
-            global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
-        except ValueError:
-            global_step = 0
-        epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
-        steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
-
-        logger.info("  Continuing training from checkpoint, will skip to saved global_step")
-        logger.info("  Continuing training from epoch %d", epochs_trained)
-        logger.info("  Continuing training from global step %d", global_step)
-        logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
-
-    tr_loss, logging_loss = 0.0, 0.0
-    model.zero_grad()
-    train_iterator = trange(
-        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
-    )
-    set_seed(args)  # Added here for reproductibility
-    for _ in train_iterator:
-        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
-        for step, batch in enumerate(epoch_iterator):
-
-            # Skip past any already trained steps if resuming training
-            if steps_trained_in_current_epoch > 0:
-                steps_trained_in_current_epoch -= 1
-                continue
-
-            model.train()
-            batch = tuple(t.to(args.device) for t in batch)
-            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
-            if args.model_type != "distilbert":
-                inputs["token_type_ids"] = (
-                    batch[2] if args.model_type in ["bert", "xlnet"] else None
-                )  # XLM and RoBERTa don"t use segment_ids
-
-            outputs = model(**inputs)
-            loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)
-
-            if args.n_gpu > 1:
-                loss = loss.mean()  # mean() to average on multi-gpu parallel training
-            if args.gradient_accumulation_steps > 1:
-                loss = loss / args.gradient_accumulation_steps
-
-            if args.fp16:
-                with amp.scale_loss(loss, optimizer) as scaled_loss:
-                    scaled_loss.backward()
-            else:
-                loss.backward()
-
-            tr_loss += loss.item()
-            if (step + 1) % args.gradient_accumulation_steps == 0:
-                if args.fp16:
-                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
-                else:
-                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
-
-                optimizer.step()
-                scheduler.step()  # Update learning rate schedule
-                model.zero_grad()
-                global_step += 1
-
-                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
-                    # Log metrics
-                    if (
-                        args.local_rank == -1 and args.evaluate_during_training
-                    ):  # Only evaluate when single GPU otherwise metrics may not average well
-                        results, _ = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="dev")
-                        for key, value in results.items():
-                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
-                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
-                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
-                    logging_loss = tr_loss
-
-                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
-                    # Save model checkpoint
-                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
-                    if not os.path.exists(output_dir):
-                        os.makedirs(output_dir)
-                    model_to_save = (
-                        model.module if hasattr(model, "module") else model
-                    )  # Take care of distributed/parallel training
-                    model_to_save.save_pretrained(output_dir)
-                    tokenizer.save_pretrained(output_dir)
-
-                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
-                    logger.info("Saving model checkpoint to %s", output_dir)
-
-                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
-                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
-                    logger.info("Saving optimizer and scheduler states to %s", output_dir)
-
-            if args.max_steps > 0 and global_step > args.max_steps:
-                epoch_iterator.close()
-                break
-        if args.max_steps > 0 and global_step > args.max_steps:
-            train_iterator.close()
-            break
-
-    if args.local_rank in [-1, 0]:
-        tb_writer.close()
-
-    return global_step, tr_loss / global_step
-
-
-def evaluate(args, model, tokenizer, labels, pad_token_label_id, mode, prefix=""):
-    eval_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode=mode)
-
-    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
-    # Note that DistributedSampler samples randomly
-    eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
-    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
-
-    # multi-gpu evaluate
-    if args.n_gpu > 1:
-        model = torch.nn.DataParallel(model)
-
-    # Eval!
-    logger.info("***** Running evaluation %s *****", prefix)
-    logger.info("  Num examples = %d", len(eval_dataset))
-    logger.info("  Batch size = %d", args.eval_batch_size)
-    eval_loss = 0.0
-    nb_eval_steps = 0
-    preds = None
-    out_label_ids = None
-    model.eval()
-    for batch in tqdm(eval_dataloader, desc="Evaluating"):
-        batch = tuple(t.to(args.device) for t in batch)
-
-        with torch.no_grad():
-            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
-            if args.model_type != "distilbert":
-                inputs["token_type_ids"] = (
-                    batch[2] if args.model_type in ["bert", "xlnet"] else None
-                )  # XLM and RoBERTa don"t use segment_ids
-            outputs = model(**inputs)
-            tmp_eval_loss, logits = outputs[:2]
-
-            if args.n_gpu > 1:
-                tmp_eval_loss = tmp_eval_loss.mean()  # mean() to average on multi-gpu parallel evaluating
-
-            eval_loss += tmp_eval_loss.item()
-        nb_eval_steps += 1
-        if preds is None:
-            preds = logits.detach().cpu().numpy()
-            out_label_ids = inputs["labels"].detach().cpu().numpy()
-        else:
-            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
-            out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)
-
-    eval_loss = eval_loss / nb_eval_steps
-    preds = np.argmax(preds, axis=2)
-
-    label_map = {i: label for i, label in enumerate(labels)}
-
-    out_label_list = [[] for _ in range(out_label_ids.shape[0])]
-    preds_list = [[] for _ in range(out_label_ids.shape[0])]
-
-    for i in range(out_label_ids.shape[0]):
-        for j in range(out_label_ids.shape[1]):
-            if out_label_ids[i, j] != pad_token_label_id:
-                out_label_list[i].append(label_map[out_label_ids[i][j]])
-                preds_list[i].append(label_map[preds[i][j]])
-
-    results = {
-        "loss": eval_loss,
-        "precision": precision_score(out_label_list, preds_list),
-        "recall": recall_score(out_label_list, preds_list),
-        "f1": f1_score(out_label_list, preds_list),
-    }
-
-    logger.info("***** Eval results %s *****", prefix)
-    for key in sorted(results.keys()):
-        logger.info("  %s = %s", key, str(results[key]))
-
-    return results, preds_list
-
-
-def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode):
-    if args.local_rank not in [-1, 0] and not evaluate:
-        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
-
-    # Load data features from cache or dataset file
-    cached_features_file = os.path.join(
-        args.data_dir,
-        "cached_{}_{}_{}".format(
-            mode, list(filter(None, args.model_name_or_path.split("/"))).pop(), str(args.max_seq_length)
-        ),
-    )
-    if os.path.exists(cached_features_file) and not args.overwrite_cache:
-        logger.info("Loading features from cached file %s", cached_features_file)
-        features = torch.load(cached_features_file)
-    else:
-        logger.info("Creating features from dataset file at %s", args.data_dir)
-        examples = read_examples_from_file(args.data_dir, mode)
-        features = convert_examples_to_features(
-            examples,
-            labels,
-            args.max_seq_length,
-            tokenizer,
-            cls_token_at_end=bool(args.model_type in ["xlnet"]),
-            # xlnet has a cls token at the end
-            cls_token=tokenizer.cls_token,
-            cls_token_segment_id=2 if args.model_type in ["xlnet"] else 0,
-            sep_token=tokenizer.sep_token,
-            sep_token_extra=bool(args.model_type in ["roberta"]),
-            # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
-            pad_on_left=bool(args.model_type in ["xlnet"]),
-            # pad on the left for xlnet
-            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
-            pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
-            pad_token_label_id=pad_token_label_id,
-        )
-        if args.local_rank in [-1, 0]:
-            logger.info("Saving features into cached file %s", cached_features_file)
-            torch.save(features, cached_features_file)
-
-    if args.local_rank == 0 and not evaluate:
-        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
-
-    # Convert to Tensors and build dataset
-    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
-    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
-    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
-    all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.long)
-
-    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
-    return dataset
-
-
-def main():
-    parser = argparse.ArgumentParser()
-
-    # Required parameters
-    parser.add_argument(
-        "--data_dir",
-        default=None,
-        type=str,
-        required=True,
-        help="The input data dir. Should contain the training files for the CoNLL-2003 NER task.",
-    )
-    parser.add_argument(
-        "--model_type",
-        default=None,
-        type=str,
-        required=True,
-        help="Model type selected in the list: " + ", ".join(MODEL_TYPES),
-    )
-    parser.add_argument(
-        "--model_name_or_path",
-        default=None,
-        type=str,
-        required=True,
-        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
-    )
-    parser.add_argument(
-        "--output_dir",
-        default=None,
-        type=str,
-        required=True,
-        help="The output directory where the model predictions and checkpoints will be written.",
-    )
-
-    # Other parameters
-    parser.add_argument(
-        "--labels",
-        default="",
-        type=str,
-        help="Path to a file containing all labels. If not specified, CoNLL-2003 labels are used.",
-    )
-    parser.add_argument(
-        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
-    )
-    parser.add_argument(
-        "--tokenizer_name",
-        default="",
-        type=str,
-        help="Pretrained tokenizer name or path if not the same as model_name",
-    )
-    parser.add_argument(
-        "--cache_dir",
-        default="",
-        type=str,
-        help="Where do you want to store the pre-trained models downloaded from s3",
-    )
-    parser.add_argument(
-        "--max_seq_length",
-        default=128,
-        type=int,
-        help="The maximum total input sequence length after tokenization. Sequences longer "
-        "than this will be truncated, sequences shorter will be padded.",
-    )
-    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
-    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
-    parser.add_argument("--do_predict", action="store_true", help="Whether to run predictions on the test set.")
-    parser.add_argument(
-        "--evaluate_during_training",
-        action="store_true",
-        help="Whether to run evaluation during training at each logging step.",
-    )
-    parser.add_argument(
-        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
-    )
-    parser.add_argument(
-        "--keep_accents", action="store_const", const=True, help="Set this flag if model is trained with accents."
-    )
-    parser.add_argument(
-        "--strip_accents", action="store_const", const=True, help="Set this flag if model is trained without accents."
-    )
-    parser.add_argument("--use_fast", action="store_const", const=True, help="Set this flag to use fast tokenization.")
-    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
-    parser.add_argument(
-        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
-    )
-    parser.add_argument(
-        "--gradient_accumulation_steps",
-        type=int,
-        default=1,
-        help="Number of updates steps to accumulate before performing a backward/update pass.",
-    )
-    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
-    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
-    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
-    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
-    parser.add_argument(
-        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
-    )
-    parser.add_argument(
-        "--max_steps",
-        default=-1,
-        type=int,
-        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
-    )
-    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
-
-    parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.")
-    parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.")
-    parser.add_argument(
-        "--eval_all_checkpoints",
-        action="store_true",
-        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
-    )
-    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
-    parser.add_argument(
-        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
-    )
-    parser.add_argument(
-        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
-    )
-    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
-
-    parser.add_argument(
-        "--fp16",
-        action="store_true",
-        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
-    )
-    parser.add_argument(
-        "--fp16_opt_level",
-        type=str,
-        default="O1",
-        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
-        "See details at https://nvidia.github.io/apex/amp.html",
-    )
-    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
-    parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
-    parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
-    args = parser.parse_args()
-
-    if (
-        os.path.exists(args.output_dir)
-        and os.listdir(args.output_dir)
-        and args.do_train
-        and not args.overwrite_output_dir
-    ):
-        raise ValueError(
-            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
-                args.output_dir
-            )
-        )
-
-    # Setup distant debugging if needed
-    if args.server_ip and args.server_port:
-        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
-        import ptvsd
-
-        print("Waiting for debugger attach")
-        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
-        ptvsd.wait_for_attach()
-
-    # Setup CUDA, GPU & distributed training
-    if args.local_rank == -1 or args.no_cuda:
-        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
-    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
-        torch.cuda.set_device(args.local_rank)
-        device = torch.device("cuda", args.local_rank)
-        torch.distributed.init_process_group(backend="nccl")
-        args.n_gpu = 1
-    args.device = device
-
-    # Setup logging
-    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
-        datefmt="%m/%d/%Y %H:%M:%S",
-        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
-    )
-    logger.warning(
-        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
-        args.local_rank,
-        device,
-        args.n_gpu,
-        bool(args.local_rank != -1),
-        args.fp16,
-    )
-
-    # Set seed
-    set_seed(args)
-
-    # Prepare CONLL-2003 task
-    labels = get_labels(args.labels)
-    num_labels = len(labels)
-    # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later
-    pad_token_label_id = CrossEntropyLoss().ignore_index
-
-    # Load pretrained model and tokenizer
-    if args.local_rank not in [-1, 0]:
-        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
-
-    args.model_type = args.model_type.lower()
-    config = AutoConfig.from_pretrained(
-        args.config_name if args.config_name else args.model_name_or_path,
-        num_labels=num_labels,
-        id2label={str(i): label for i, label in enumerate(labels)},
-        label2id={label: i for i, label in enumerate(labels)},
-        cache_dir=args.cache_dir if args.cache_dir else None,
-    )
-    tokenizer_args = {k: v for k, v in vars(args).items() if v is not None and k in TOKENIZER_ARGS}
-    logger.info("Tokenizer arguments: %s", tokenizer_args)
-    tokenizer = AutoTokenizer.from_pretrained(
-        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
-        cache_dir=args.cache_dir if args.cache_dir else None,
-        **tokenizer_args,
-    )
-    model = AutoModelForTokenClassification.from_pretrained(
-        args.model_name_or_path,
-        from_tf=bool(".ckpt" in args.model_name_or_path),
-        config=config,
-        cache_dir=args.cache_dir if args.cache_dir else None,
-    )
-
-    if args.local_rank == 0:
-        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
-
-    model.to(args.device)
-
-    logger.info("Training/evaluation parameters %s", args)
-
-    # Training
-    if args.do_train:
-        train_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode="train")
-        global_step, tr_loss = train(args, train_dataset, model, tokenizer, labels, pad_token_label_id)
-        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
-
-    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
-    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
-        # Create output directory if needed
-        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
-            os.makedirs(args.output_dir)
-
-        logger.info("Saving model checkpoint to %s", args.output_dir)
-        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
-        # They can then be reloaded using `from_pretrained()`
-        model_to_save = (
-            model.module if hasattr(model, "module") else model
-        )  # Take care of distributed/parallel training
-        model_to_save.save_pretrained(args.output_dir)
-        tokenizer.save_pretrained(args.output_dir)
-
-        # Good practice: save your training arguments together with the trained model
-        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
-
-    # Evaluation
-    results = {}
-    if args.do_eval and args.local_rank in [-1, 0]:
-        tokenizer = AutoTokenizer.from_pretrained(args.output_dir, **tokenizer_args)
-        checkpoints = [args.output_dir]
-        if args.eval_all_checkpoints:
-            checkpoints = list(
-                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
-            )
-            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
-        logger.info("Evaluate the following checkpoints: %s", checkpoints)
-        for checkpoint in checkpoints:
-            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
-            model = AutoModelForTokenClassification.from_pretrained(checkpoint)
-            model.to(args.device)
-            result, _ = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="dev", prefix=global_step)
-            if global_step:
-                result = {"{}_{}".format(global_step, k): v for k, v in result.items()}
-            results.update(result)
-        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
-        with open(output_eval_file, "w") as writer:
-            for key in sorted(results.keys()):
-                writer.write("{} = {}\n".format(key, str(results[key])))
-
-    if args.do_predict and args.local_rank in [-1, 0]:
-        tokenizer = AutoTokenizer.from_pretrained(args.output_dir, **tokenizer_args)
-        model = AutoModelForTokenClassification.from_pretrained(args.output_dir)
-        model.to(args.device)
-        result, predictions = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="test")
-        # Save results
-        output_test_results_file = os.path.join(args.output_dir, "test_results.txt")
-        with open(output_test_results_file, "w") as writer:
-            for key in sorted(result.keys()):
-                writer.write("{} = {}\n".format(key, str(result[key])))
-        # Save predictions
-        output_test_predictions_file = os.path.join(args.output_dir, "test_predictions.txt")
-        with open(output_test_predictions_file, "w") as writer:
-            with open(os.path.join(args.data_dir, "test.txt"), "r") as f:
-                example_id = 0
-                for line in f:
-                    if line.startswith("-DOCSTART-") or line == "" or line == "\n":
-                        writer.write(line)
-                        if not predictions[example_id]:
-                            example_id += 1
-                    elif predictions[example_id]:
-                        output_line = line.split()[0] + " " + predictions[example_id].pop(0) + "\n"
-                        writer.write(output_line)
-                    else:
-                        logger.warning("Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0])
-
-    return results
-
-
-if __name__ == "__main__":
-    main()
--- a/examples/ner/run_tf_ner.py
+++ b/examples/ner/run_tf_ner.py
@@ -1,644 +0,0 @@
-# coding=utf-8
-import collections
-import datetime
-import glob
-import math
-import os
-import re
-
-import numpy as np
-import tensorflow as tf
-from absl import app, flags, logging
-from seqeval import metrics
-
-from transformers import (
-    TF2_WEIGHTS_NAME,
-    TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
-    AutoConfig,
-    AutoTokenizer,
-    GradientAccumulator,
-    TFAutoModelForTokenClassification,
-    create_optimizer,
-)
-from utils_ner import convert_examples_to_features, get_labels, read_examples_from_file
-
-
-try:
-    from fastprogress import master_bar, progress_bar
-except ImportError:
-    from fastprogress.fastprogress import master_bar, progress_bar
-
-
-MODEL_CONFIG_CLASSES = list(TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING.keys())
-MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
-
-ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in MODEL_CONFIG_CLASSES), (),)
-
-
-flags.DEFINE_string(
-    "data_dir", None, "The input data dir. Should contain the .conll files (or other data files) " "for the task."
-)
-
-flags.DEFINE_string("model_type", None, "Model type selected in the list: " + ", ".join(MODEL_TYPES))
-
-flags.DEFINE_string(
-    "model_name_or_path",
-    None,
-    "Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
-)
-
-flags.DEFINE_string("output_dir", None, "The output directory where the model checkpoints will be written.")
-
-flags.DEFINE_string(
-    "labels", "", "Path to a file containing all labels. If not specified, CoNLL-2003 labels are used."
-)
-
-flags.DEFINE_string("config_name", "", "Pretrained config name or path if not the same as model_name")
-
-flags.DEFINE_string("tokenizer_name", "", "Pretrained tokenizer name or path if not the same as model_name")
-
-flags.DEFINE_string("cache_dir", "", "Where do you want to store the pre-trained models downloaded from s3")
-
-flags.DEFINE_integer(
-    "max_seq_length",
-    128,
-    "The maximum total input sentence length after tokenization. "
-    "Sequences longer than this will be truncated, sequences shorter "
-    "will be padded.",
-)
-
-flags.DEFINE_string(
-    "tpu",
-    None,
-    "The Cloud TPU to use for training. This should be either the name "
-    "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 "
-    "url.",
-)
-
-flags.DEFINE_integer("num_tpu_cores", 8, "Total number of TPU cores to use.")
-
-flags.DEFINE_boolean("do_train", False, "Whether to run training.")
-
-flags.DEFINE_boolean("do_eval", False, "Whether to run eval on the dev set.")
-
-flags.DEFINE_boolean("do_predict", False, "Whether to run predictions on the test set.")
-
-flags.DEFINE_boolean(
-    "evaluate_during_training", False, "Whether to run evaluation during training at each logging step."
-)
-
-flags.DEFINE_boolean("do_lower_case", False, "Set this flag if you are using an uncased model.")
-
-flags.DEFINE_integer("per_device_train_batch_size", 8, "Batch size per GPU/CPU/TPU for training.")
-
-flags.DEFINE_integer("per_device_eval_batch_size", 8, "Batch size per GPU/CPU/TPU for evaluation.")
-
-flags.DEFINE_integer(
-    "gradient_accumulation_steps", 1, "Number of updates steps to accumulate before performing a backward/update pass."
-)
-
-flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.")
-
-flags.DEFINE_float("weight_decay", 0.0, "Weight decay if we apply some.")
-
-flags.DEFINE_float("adam_epsilon", 1e-8, "Epsilon for Adam optimizer.")
-
-flags.DEFINE_float("max_grad_norm", 1.0, "Max gradient norm.")
-
-flags.DEFINE_integer("num_train_epochs", 3, "Total number of training epochs to perform.")
-
-flags.DEFINE_integer(
-    "max_steps", -1, "If > 0: set total number of training steps to perform. Override num_train_epochs."
-)
-
-flags.DEFINE_integer("warmup_steps", 0, "Linear warmup over warmup_steps.")
-
-flags.DEFINE_integer("logging_steps", 50, "Log every X updates steps.")
-
-flags.DEFINE_integer("save_steps", 50, "Save checkpoint every X updates steps.")
-
-flags.DEFINE_boolean(
-    "eval_all_checkpoints",
-    False,
-    "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
-)
-
-flags.DEFINE_boolean("no_cuda", False, "Avoid using CUDA when available")
-
-flags.DEFINE_boolean("overwrite_output_dir", False, "Overwrite the content of the output directory")
-
-flags.DEFINE_boolean("overwrite_cache", False, "Overwrite the cached training and evaluation sets")
-
-flags.DEFINE_integer("seed", 42, "random seed for initialization")
-
-flags.DEFINE_boolean("fp16", False, "Whether to use 16-bit (mixed) precision instead of 32-bit")
-
-flags.DEFINE_string(
-    "gpus",
-    "0",
-    "Comma separated list of gpus devices. If only one, switch to single "
-    "gpu strategy, if None takes all the gpus available.",
-)
-
-
-def train(
-    args, strategy, train_dataset, tokenizer, model, num_train_examples, labels, train_batch_size, pad_token_label_id
-):
-    if args["max_steps"] > 0:
-        num_train_steps = args["max_steps"] * args["gradient_accumulation_steps"]
-        args["num_train_epochs"] = 1
-    else:
-        num_train_steps = (
-            math.ceil(num_train_examples / train_batch_size)
-            // args["gradient_accumulation_steps"]
-            * args["num_train_epochs"]
-        )
-
-    writer = tf.summary.create_file_writer("/tmp/mylogs")
-
-    with strategy.scope():
-        loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)
-        optimizer = create_optimizer(args["learning_rate"], num_train_steps, args["warmup_steps"])
-
-        if args["fp16"]:
-            optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, "dynamic")
-
-        loss_metric = tf.keras.metrics.Mean(name="loss", dtype=tf.float32)
-        gradient_accumulator = GradientAccumulator()
-
-    logging.info("***** Running training *****")
-    logging.info("  Num examples = %d", num_train_examples)
-    logging.info("  Num Epochs = %d", args["num_train_epochs"])
-    logging.info("  Instantaneous batch size per device = %d", args["per_device_train_batch_size"])
-    logging.info(
-        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
-        train_batch_size * args["gradient_accumulation_steps"],
-    )
-    logging.info("  Gradient Accumulation steps = %d", args["gradient_accumulation_steps"])
-    logging.info("  Total training steps = %d", num_train_steps)
-
-    model.summary()
-
-    @tf.function
-    def apply_gradients():
-        grads_and_vars = []
-
-        for gradient, variable in zip(gradient_accumulator.gradients, model.trainable_variables):
-            if gradient is not None:
-                scaled_gradient = gradient / (args["n_device"] * args["gradient_accumulation_steps"])
-                grads_and_vars.append((scaled_gradient, variable))
-            else:
-                grads_and_vars.append((gradient, variable))
-
-        optimizer.apply_gradients(grads_and_vars, args["max_grad_norm"])
-        gradient_accumulator.reset()
-
-    @tf.function
-    def train_step(train_features, train_labels):
-        def step_fn(train_features, train_labels):
-            inputs = {"attention_mask": train_features["input_mask"], "training": True}
-
-            if args["model_type"] != "distilbert":
-                inputs["token_type_ids"] = (
-                    train_features["segment_ids"] if args["model_type"] in ["bert", "xlnet"] else None
-                )
-
-            with tf.GradientTape() as tape:
-                logits = model(train_features["input_ids"], **inputs)[0]
-                logits = tf.reshape(logits, (-1, len(labels) + 1))
-                active_loss = tf.reshape(train_features["input_mask"], (-1,))
-                active_logits = tf.boolean_mask(logits, active_loss)
-                train_labels = tf.reshape(train_labels, (-1,))
-                active_labels = tf.boolean_mask(train_labels, active_loss)
-                cross_entropy = loss_fct(active_labels, active_logits)
-                loss = tf.reduce_sum(cross_entropy) * (1.0 / train_batch_size)
-                grads = tape.gradient(loss, model.trainable_variables)
-
-                gradient_accumulator(grads)
-
-            return cross_entropy
-
-        per_example_losses = strategy.experimental_run_v2(step_fn, args=(train_features, train_labels))
-        mean_loss = strategy.reduce(tf.distribute.ReduceOp.MEAN, per_example_losses, axis=0)
-
-        return mean_loss
-
-    current_time = datetime.datetime.now()
-    train_iterator = master_bar(range(args["num_train_epochs"]))
-    global_step = 0
-    logging_loss = 0.0
-
-    for epoch in train_iterator:
-        epoch_iterator = progress_bar(
-            train_dataset, total=num_train_steps, parent=train_iterator, display=args["n_device"] > 1
-        )
-        step = 1
-
-        with strategy.scope():
-            for train_features, train_labels in epoch_iterator:
-                loss = train_step(train_features, train_labels)
-
-                if step % args["gradient_accumulation_steps"] == 0:
-                    strategy.experimental_run_v2(apply_gradients)
-
-                    loss_metric(loss)
-
-                    global_step += 1
-
-                    if args["logging_steps"] > 0 and global_step % args["logging_steps"] == 0:
-                        # Log metrics
-                        if (
-                            args["n_device"] == 1 and args["evaluate_during_training"]
-                        ):  # Only evaluate when single GPU otherwise metrics may not average well
-                            y_true, y_pred, eval_loss = evaluate(
-                                args, strategy, model, tokenizer, labels, pad_token_label_id, mode="dev"
-                            )
-                            report = metrics.classification_report(y_true, y_pred, digits=4)
-
-                            logging.info("Eval at step " + str(global_step) + "\n" + report)
-                            logging.info("eval_loss: " + str(eval_loss))
-
-                            precision = metrics.precision_score(y_true, y_pred)
-                            recall = metrics.recall_score(y_true, y_pred)
-                            f1 = metrics.f1_score(y_true, y_pred)
-
-                            with writer.as_default():
-                                tf.summary.scalar("eval_loss", eval_loss, global_step)
-                                tf.summary.scalar("precision", precision, global_step)
-                                tf.summary.scalar("recall", recall, global_step)
-                                tf.summary.scalar("f1", f1, global_step)
-
-                        lr = optimizer.learning_rate
-                        learning_rate = lr(step)
-
-                        with writer.as_default():
-                            tf.summary.scalar("lr", learning_rate, global_step)
-                            tf.summary.scalar(
-                                "loss", (loss_metric.result() - logging_loss) / args["logging_steps"], global_step
-                            )
-
-                        logging_loss = loss_metric.result()
-
-                    with writer.as_default():
-                        tf.summary.scalar("loss", loss_metric.result(), step=step)
-
-                    if args["save_steps"] > 0 and global_step % args["save_steps"] == 0:
-                        # Save model checkpoint
-                        output_dir = os.path.join(args["output_dir"], "checkpoint-{}".format(global_step))
-
-                        if not os.path.exists(output_dir):
-                            os.makedirs(output_dir)
-
-                        model.save_pretrained(output_dir)
-                        logging.info("Saving model checkpoint to %s", output_dir)
-
-                train_iterator.child.comment = f"loss : {loss_metric.result()}"
-                step += 1
-
-        train_iterator.write(f"loss epoch {epoch + 1}: {loss_metric.result()}")
-
-        loss_metric.reset_states()
-
-    logging.info("  Training took time = {}".format(datetime.datetime.now() - current_time))
-
-
-def evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode):
-    eval_batch_size = args["per_device_eval_batch_size"] * args["n_device"]
-    eval_dataset, size = load_and_cache_examples(
-        args, tokenizer, labels, pad_token_label_id, eval_batch_size, mode=mode
-    )
-    eval_dataset = strategy.experimental_distribute_dataset(eval_dataset)
-    preds = None
-    num_eval_steps = math.ceil(size / eval_batch_size)
-    master = master_bar(range(1))
-    eval_iterator = progress_bar(eval_dataset, total=num_eval_steps, parent=master, display=args["n_device"] > 1)
-    loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)
-    loss = 0.0
-
-    logging.info("***** Running evaluation *****")
-    logging.info("  Num examples = %d", size)
-    logging.info("  Batch size = %d", eval_batch_size)
-
-    for eval_features, eval_labels in eval_iterator:
-        inputs = {"attention_mask": eval_features["input_mask"], "training": False}
-
-        if args["model_type"] != "distilbert":
-            inputs["token_type_ids"] = (
-                eval_features["segment_ids"] if args["model_type"] in ["bert", "xlnet"] else None
-            )
-
-        with strategy.scope():
-            logits = model(eval_features["input_ids"], **inputs)[0]
-            tmp_logits = tf.reshape(logits, (-1, len(labels) + 1))
-            active_loss = tf.reshape(eval_features["input_mask"], (-1,))
-            active_logits = tf.boolean_mask(tmp_logits, active_loss)
-            tmp_eval_labels = tf.reshape(eval_labels, (-1,))
-            active_labels = tf.boolean_mask(tmp_eval_labels, active_loss)
-            cross_entropy = loss_fct(active_labels, active_logits)
-            loss += tf.reduce_sum(cross_entropy) * (1.0 / eval_batch_size)
-
-        if preds is None:
-            preds = logits.numpy()
-            label_ids = eval_labels.numpy()
-        else:
-            preds = np.append(preds, logits.numpy(), axis=0)
-            label_ids = np.append(label_ids, eval_labels.numpy(), axis=0)
-
-    preds = np.argmax(preds, axis=2)
-    y_pred = [[] for _ in range(label_ids.shape[0])]
-    y_true = [[] for _ in range(label_ids.shape[0])]
-    loss = loss / num_eval_steps
-
-    for i in range(label_ids.shape[0]):
-        for j in range(label_ids.shape[1]):
-            if label_ids[i, j] != pad_token_label_id:
-                y_pred[i].append(labels[preds[i, j] - 1])
-                y_true[i].append(labels[label_ids[i, j] - 1])
-
-    return y_true, y_pred, loss.numpy()
-
-
-def load_cache(cached_file, max_seq_length):
-    name_to_features = {
-        "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
-        "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64),
-        "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
-        "label_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
-    }
-
-    def _decode_record(record):
-        example = tf.io.parse_single_example(record, name_to_features)
-        features = {}
-        features["input_ids"] = example["input_ids"]
-        features["input_mask"] = example["input_mask"]
-        features["segment_ids"] = example["segment_ids"]
-
-        return features, example["label_ids"]
-
-    d = tf.data.TFRecordDataset(cached_file)
-    d = d.map(_decode_record, num_parallel_calls=4)
-    count = d.reduce(0, lambda x, _: x + 1)
-
-    return d, count.numpy()
-
-
-def save_cache(features, cached_features_file):
-    writer = tf.io.TFRecordWriter(cached_features_file)
-
-    for (ex_index, feature) in enumerate(features):
-        if ex_index % 5000 == 0:
-            logging.info("Writing example %d of %d" % (ex_index, len(features)))
-
-        def create_int_feature(values):
-            f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
-            return f
-
-        record_feature = collections.OrderedDict()
-        record_feature["input_ids"] = create_int_feature(feature.input_ids)
-        record_feature["input_mask"] = create_int_feature(feature.input_mask)
-        record_feature["segment_ids"] = create_int_feature(feature.segment_ids)
-        record_feature["label_ids"] = create_int_feature(feature.label_ids)
-
-        tf_example = tf.train.Example(features=tf.train.Features(feature=record_feature))
-
-        writer.write(tf_example.SerializeToString())
-
-    writer.close()
-
-
-def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, batch_size, mode):
-    drop_remainder = True if args["tpu"] or mode == "train" else False
-
-    # Load data features from cache or dataset file
-    cached_features_file = os.path.join(
-        args["data_dir"],
-        "cached_{}_{}_{}.tf_record".format(
-            mode, list(filter(None, args["model_name_or_path"].split("/"))).pop(), str(args["max_seq_length"])
-        ),
-    )
-    if os.path.exists(cached_features_file) and not args["overwrite_cache"]:
-        logging.info("Loading features from cached file %s", cached_features_file)
-        dataset, size = load_cache(cached_features_file, args["max_seq_length"])
-    else:
-        logging.info("Creating features from dataset file at %s", args["data_dir"])
-        examples = read_examples_from_file(args["data_dir"], mode)
-        features = convert_examples_to_features(
-            examples,
-            labels,
-            args["max_seq_length"],
-            tokenizer,
-            cls_token_at_end=bool(args["model_type"] in ["xlnet"]),
-            # xlnet has a cls token at the end
-            cls_token=tokenizer.cls_token,
-            cls_token_segment_id=2 if args["model_type"] in ["xlnet"] else 0,
-            sep_token=tokenizer.sep_token,
-            sep_token_extra=bool(args["model_type"] in ["roberta"]),
-            # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
-            pad_on_left=bool(args["model_type"] in ["xlnet"]),
-            # pad on the left for xlnet
-            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
-            pad_token_segment_id=4 if args["model_type"] in ["xlnet"] else 0,
-            pad_token_label_id=pad_token_label_id,
-        )
-        logging.info("Saving features into cached file %s", cached_features_file)
-        save_cache(features, cached_features_file)
-        dataset, size = load_cache(cached_features_file, args["max_seq_length"])
-
-    if mode == "train":
-        dataset = dataset.repeat()
-        dataset = dataset.shuffle(buffer_size=8192, seed=args["seed"])
-
-    dataset = dataset.batch(batch_size, drop_remainder)
-    dataset = dataset.prefetch(buffer_size=batch_size)
-
-    return dataset, size
-
-
-def main(_):
-    logging.set_verbosity(logging.INFO)
-    args = flags.FLAGS.flag_values_dict()
-
-    if (
-        os.path.exists(args["output_dir"])
-        and os.listdir(args["output_dir"])
-        and args["do_train"]
-        and not args["overwrite_output_dir"]
-    ):
-        raise ValueError(
-            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
-                args["output_dir"]
-            )
-        )
-
-    if args["fp16"]:
-        tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})
-
-    if args["tpu"]:
-        resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=args["tpu"])
-        tf.config.experimental_connect_to_cluster(resolver)
-        tf.tpu.experimental.initialize_tpu_system(resolver)
-        strategy = tf.distribute.experimental.TPUStrategy(resolver)
-        args["n_device"] = args["num_tpu_cores"]
-    elif len(args["gpus"].split(",")) > 1:
-        args["n_device"] = len([f"/gpu:{gpu}" for gpu in args["gpus"].split(",")])
-        strategy = tf.distribute.MirroredStrategy(devices=[f"/gpu:{gpu}" for gpu in args["gpus"].split(",")])
-    elif args["no_cuda"]:
-        args["n_device"] = 1
-        strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0")
-    else:
-        args["n_device"] = len(args["gpus"].split(","))
-        strategy = tf.distribute.OneDeviceStrategy(device="/gpu:" + args["gpus"].split(",")[0])
-
-    logging.warning(
-        "n_device: %s, distributed training: %s, 16-bits training: %s",
-        args["n_device"],
-        bool(args["n_device"] > 1),
-        args["fp16"],
-    )
-
-    labels = get_labels(args["labels"])
-    num_labels = len(labels) + 1
-    pad_token_label_id = 0
-    config = AutoConfig.from_pretrained(
-        args["config_name"] if args["config_name"] else args["model_name_or_path"],
-        num_labels=num_labels,
-        cache_dir=args["cache_dir"] if args["cache_dir"] else None,
-    )
-
-    logging.info("Training/evaluation parameters %s", args)
-
-    # Training
-    if args["do_train"]:
-        tokenizer = AutoTokenizer.from_pretrained(
-            args["tokenizer_name"] if args["tokenizer_name"] else args["model_name_or_path"],
-            do_lower_case=args["do_lower_case"],
-            cache_dir=args["cache_dir"] if args["cache_dir"] else None,
-        )
-
-        with strategy.scope():
-            model = TFAutoModelForTokenClassification.from_pretrained(
-                args["model_name_or_path"],
-                from_pt=bool(".bin" in args["model_name_or_path"]),
-                config=config,
-                cache_dir=args["cache_dir"] if args["cache_dir"] else None,
-            )
-            model.layers[-1].activation = tf.keras.activations.softmax
-
-        train_batch_size = args["per_device_train_batch_size"] * args["n_device"]
-        train_dataset, num_train_examples = load_and_cache_examples(
-            args, tokenizer, labels, pad_token_label_id, train_batch_size, mode="train"
-        )
-        train_dataset = strategy.experimental_distribute_dataset(train_dataset)
-        train(
-            args,
-            strategy,
-            train_dataset,
-            tokenizer,
-            model,
-            num_train_examples,
-            labels,
-            train_batch_size,
-            pad_token_label_id,
-        )
-
-        if not os.path.exists(args["output_dir"]):
-            os.makedirs(args["output_dir"])
-
-        logging.info("Saving model to %s", args["output_dir"])
-
-        model.save_pretrained(args["output_dir"])
-        tokenizer.save_pretrained(args["output_dir"])
-
-    # Evaluation
-    if args["do_eval"]:
-        tokenizer = AutoTokenizer.from_pretrained(args["output_dir"], do_lower_case=args["do_lower_case"])
-        checkpoints = []
-        results = []
-
-        if args["eval_all_checkpoints"]:
-            checkpoints = list(
-                os.path.dirname(c)
-                for c in sorted(
-                    glob.glob(args["output_dir"] + "/**/" + TF2_WEIGHTS_NAME, recursive=True),
-                    key=lambda f: int("".join(filter(str.isdigit, f)) or -1),
-                )
-            )
-
-        logging.info("Evaluate the following checkpoints: %s", checkpoints)
-
-        if len(checkpoints) == 0:
-            checkpoints.append(args["output_dir"])
-
-        for checkpoint in checkpoints:
-            global_step = checkpoint.split("-")[-1] if re.match(".*checkpoint-[0-9]", checkpoint) else "final"
-
-            with strategy.scope():
-                model = TFAutoModelForTokenClassification.from_pretrained(checkpoint)
-
-            y_true, y_pred, eval_loss = evaluate(
-                args, strategy, model, tokenizer, labels, pad_token_label_id, mode="dev"
-            )
-            report = metrics.classification_report(y_true, y_pred, digits=4)
-
-            if global_step:
-                results.append({global_step + "_report": report, global_step + "_loss": eval_loss})
-
-        output_eval_file = os.path.join(args["output_dir"], "eval_results.txt")
-
-        with tf.io.gfile.GFile(output_eval_file, "w") as writer:
-            for res in results:
-                for key, val in res.items():
-                    if "loss" in key:
-                        logging.info(key + " = " + str(val))
-                        writer.write(key + " = " + str(val))
-                        writer.write("\n")
-                    else:
-                        logging.info(key)
-                        logging.info("\n" + report)
-                        writer.write(key + "\n")
-                        writer.write(report)
-                        writer.write("\n")
-
-    if args["do_predict"]:
-        tokenizer = AutoTokenizer.from_pretrained(args["output_dir"], do_lower_case=args["do_lower_case"])
-        model = TFAutoModelForTokenClassification.from_pretrained(args["output_dir"])
-        eval_batch_size = args["per_device_eval_batch_size"] * args["n_device"]
-        predict_dataset, _ = load_and_cache_examples(
-            args, tokenizer, labels, pad_token_label_id, eval_batch_size, mode="test"
-        )
-        y_true, y_pred, pred_loss = evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode="test")
-        output_test_results_file = os.path.join(args["output_dir"], "test_results.txt")
-        output_test_predictions_file = os.path.join(args["output_dir"], "test_predictions.txt")
-        report = metrics.classification_report(y_true, y_pred, digits=4)
-
-        with tf.io.gfile.GFile(output_test_results_file, "w") as writer:
-            report = metrics.classification_report(y_true, y_pred, digits=4)
-
-            logging.info("\n" + report)
-
-            writer.write(report)
-            writer.write("\n\nloss = " + str(pred_loss))
-
-        with tf.io.gfile.GFile(output_test_predictions_file, "w") as writer:
-            with tf.io.gfile.GFile(os.path.join(args["data_dir"], "test.txt"), "r") as f:
-                example_id = 0
-
-                for line in f:
-                    if line.startswith("-DOCSTART-") or line == "" or line == "\n":
-                        writer.write(line)
-
-                        if not y_pred[example_id]:
-                            example_id += 1
-                    elif y_pred[example_id]:
-                        output_line = line.split()[0] + " " + y_pred[example_id].pop(0) + "\n"
-                        writer.write(output_line)
-                    else:
-                        logging.warning("Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0])
-
-
-if __name__ == "__main__":
-    flags.mark_flag_as_required("data_dir")
-    flags.mark_flag_as_required("output_dir")
-    flags.mark_flag_as_required("model_name_or_path")
-    flags.mark_flag_as_required("model_type")
-    app.run(main)
--- a/examples/ner/utils_ner.py
+++ b/examples/ner/utils_ner.py
@@ -1,207 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Named entity recognition fine-tuning: utilities to work with CoNLL-2003 task. """
-
-
-import logging
-import os
-
-
-logger = logging.getLogger(__name__)
-
-
-class InputExample(object):
-    """A single training/test example for token classification."""
-
-    def __init__(self, guid, words, labels):
-        """Constructs a InputExample.
-
-        Args:
-            guid: Unique id for the example.
-            words: list. The words of the sequence.
-            labels: (Optional) list. The labels for each word of the sequence. This should be
-            specified for train and dev examples, but not for test examples.
-        """
-        self.guid = guid
-        self.words = words
-        self.labels = labels
-
-
-class InputFeatures(object):
-    """A single set of features of data."""
-
-    def __init__(self, input_ids, input_mask, segment_ids, label_ids):
-        self.input_ids = input_ids
-        self.input_mask = input_mask
-        self.segment_ids = segment_ids
-        self.label_ids = label_ids
-
-
-def read_examples_from_file(data_dir, mode):
-    file_path = os.path.join(data_dir, "{}.txt".format(mode))
-    guid_index = 1
-    examples = []
-    with open(file_path, encoding="utf-8") as f:
-        words = []
-        labels = []
-        for line in f:
-            if line.startswith("-DOCSTART-") or line == "" or line == "\n":
-                if words:
-                    examples.append(InputExample(guid="{}-{}".format(mode, guid_index), words=words, labels=labels))
-                    guid_index += 1
-                    words = []
-                    labels = []
-            else:
-                splits = line.split(" ")
-                words.append(splits[0])
-                if len(splits) > 1:
-                    labels.append(splits[-1].replace("\n", ""))
-                else:
-                    # Examples could have no label for mode = "test"
-                    labels.append("O")
-        if words:
-            examples.append(InputExample(guid="{}-{}".format(mode, guid_index), words=words, labels=labels))
-    return examples
-
-
-def convert_examples_to_features(
-    examples,
-    label_list,
-    max_seq_length,
-    tokenizer,
-    cls_token_at_end=False,
-    cls_token="[CLS]",
-    cls_token_segment_id=1,
-    sep_token="[SEP]",
-    sep_token_extra=False,
-    pad_on_left=False,
-    pad_token=0,
-    pad_token_segment_id=0,
-    pad_token_label_id=-100,
-    sequence_a_segment_id=0,
-    mask_padding_with_zero=True,
-):
-    """ Loads a data file into a list of `InputBatch`s
-        `cls_token_at_end` define the location of the CLS token:
-            - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
-            - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
-        `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)
-    """
-
-    label_map = {label: i for i, label in enumerate(label_list)}
-
-    features = []
-    for (ex_index, example) in enumerate(examples):
-        if ex_index % 10000 == 0:
-            logger.info("Writing example %d of %d", ex_index, len(examples))
-
-        tokens = []
-        label_ids = []
-        for word, label in zip(example.words, example.labels):
-            word_tokens = tokenizer.tokenize(word)
-            tokens.extend(word_tokens)
-            # Use the real label id for the first token of the word, and padding ids for the remaining tokens
-            label_ids.extend([label_map[label]] + [pad_token_label_id] * (len(word_tokens) - 1))
-
-        # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
-        special_tokens_count = 3 if sep_token_extra else 2
-        if len(tokens) > max_seq_length - special_tokens_count:
-            tokens = tokens[: (max_seq_length - special_tokens_count)]
-            label_ids = label_ids[: (max_seq_length - special_tokens_count)]
-
-        # The convention in BERT is:
-        # (a) For sequence pairs:
-        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
-        #  type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
-        # (b) For single sequences:
-        #  tokens:   [CLS] the dog is hairy . [SEP]
-        #  type_ids:   0   0   0   0  0     0   0
-        #
-        # Where "type_ids" are used to indicate whether this is the first
-        # sequence or the second sequence. The embedding vectors for `type=0` and
-        # `type=1` were learned during pre-training and are added to the wordpiece
-        # embedding vector (and position vector). This is not *strictly* necessary
-        # since the [SEP] token unambiguously separates the sequences, but it makes
-        # it easier for the model to learn the concept of sequences.
-        #
-        # For classification tasks, the first vector (corresponding to [CLS]) is
-        # used as as the "sentence vector". Note that this only makes sense because
-        # the entire model is fine-tuned.
-        tokens += [sep_token]
-        label_ids += [pad_token_label_id]
-        if sep_token_extra:
-            # roberta uses an extra separator b/w pairs of sentences
-            tokens += [sep_token]
-            label_ids += [pad_token_label_id]
-        segment_ids = [sequence_a_segment_id] * len(tokens)
-
-        if cls_token_at_end:
-            tokens += [cls_token]
-            label_ids += [pad_token_label_id]
-            segment_ids += [cls_token_segment_id]
-        else:
-            tokens = [cls_token] + tokens
-            label_ids = [pad_token_label_id] + label_ids
-            segment_ids = [cls_token_segment_id] + segment_ids
-
-        input_ids = tokenizer.convert_tokens_to_ids(tokens)
-
-        # The mask has 1 for real tokens and 0 for padding tokens. Only real
-        # tokens are attended to.
-        input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
-
-        # Zero-pad up to the sequence length.
-        padding_length = max_seq_length - len(input_ids)
-        if pad_on_left:
-            input_ids = ([pad_token] * padding_length) + input_ids
-            input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask
-            segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
-            label_ids = ([pad_token_label_id] * padding_length) + label_ids
-        else:
-            input_ids += [pad_token] * padding_length
-            input_mask += [0 if mask_padding_with_zero else 1] * padding_length
-            segment_ids += [pad_token_segment_id] * padding_length
-            label_ids += [pad_token_label_id] * padding_length
-
-        assert len(input_ids) == max_seq_length
-        assert len(input_mask) == max_seq_length
-        assert len(segment_ids) == max_seq_length
-        assert len(label_ids) == max_seq_length
-
-        if ex_index < 5:
-            logger.info("*** Example ***")
-            logger.info("guid: %s", example.guid)
-            logger.info("tokens: %s", " ".join([str(x) for x in tokens]))
-            logger.info("input_ids: %s", " ".join([str(x) for x in input_ids]))
-            logger.info("input_mask: %s", " ".join([str(x) for x in input_mask]))
-            logger.info("segment_ids: %s", " ".join([str(x) for x in segment_ids]))
-            logger.info("label_ids: %s", " ".join([str(x) for x in label_ids]))
-
-        features.append(
-            InputFeatures(input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, label_ids=label_ids)
-        )
-    return features
-
-
-def get_labels(path):
-    if path:
-        with open(path, "r") as f:
-            labels = f.read().splitlines()
-        if "O" not in labels:
-            labels = ["O"] + labels
-        return labels
-    else:
-        return ["O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
--- a/examples/question-answering/README.md
+++ b/examples/question-answering/README.md
@@ -0,0 +1,181 @@
+
+
+## SQuAD
+
+Based on the script [`run_squad.py`](https://github.com/huggingface/transformers/blob/master/examples/question-answering/run_squad.py).
+
+#### Fine-tuning BERT on SQuAD1.0
+
+This example code fine-tunes BERT on the SQuAD1.0 dataset. It runs in 24 min (with BERT-base) or 68 min (with BERT-large)
+on a single tesla V100 16GB. The data for SQuAD can be downloaded with the following links and should be saved in a
+$SQUAD_DIR directory.
+
+* [train-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json)
+* [dev-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json)
+* [evaluate-v1.1.py](https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py)
+
+And for SQuAD2.0, you need to download:
+
+- [train-v2.0.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json)
+- [dev-v2.0.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json)
+- [evaluate-v2.0.py](https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/)
+
+```bash
+export SQUAD_DIR=/path/to/SQUAD
+
+python run_squad.py \
+  --model_type bert \
+  --model_name_or_path bert-base-uncased \
+  --do_train \
+  --do_eval \
+  --do_lower_case \
+  --train_file $SQUAD_DIR/train-v1.1.json \
+  --predict_file $SQUAD_DIR/dev-v1.1.json \
+  --per_gpu_train_batch_size 12 \
+  --learning_rate 3e-5 \
+  --num_train_epochs 2.0 \
+  --max_seq_length 384 \
+  --doc_stride 128 \
+  --output_dir /tmp/debug_squad/
+```
+
+Training with the previously defined hyper-parameters yields the following results:
+
+```bash
+f1 = 88.52
+exact_match = 81.22
+```
+
+#### Distributed training
+
+
+Here is an example using distributed training on 8 V100 GPUs and the BERT Whole Word Masking uncased model to reach an F1 > 93 on SQuAD1.1:
+
+```bash
+python -m torch.distributed.launch --nproc_per_node=8 ./examples/question-answering/run_squad.py \
+    --model_type bert \
+    --model_name_or_path bert-large-uncased-whole-word-masking \
+    --do_train \
+    --do_eval \
+    --do_lower_case \
+    --train_file $SQUAD_DIR/train-v1.1.json \
+    --predict_file $SQUAD_DIR/dev-v1.1.json \
+    --learning_rate 3e-5 \
+    --num_train_epochs 2 \
+    --max_seq_length 384 \
+    --doc_stride 128 \
+    --output_dir ./examples/models/wwm_uncased_finetuned_squad/ \
+    --per_gpu_eval_batch_size=3   \
+    --per_gpu_train_batch_size=3   \
+```
+
+Training with the previously defined hyper-parameters yields the following results:
+
+```bash
+f1 = 93.15
+exact_match = 86.91
+```
+
+This fine-tuned model is available as a checkpoint under the reference
+`bert-large-uncased-whole-word-masking-finetuned-squad`.
+
+#### Fine-tuning XLNet on SQuAD
+
+This example code fine-tunes XLNet on both the SQuAD1.0 and SQuAD2.0 datasets. See above to download the data for SQuAD.
+
+##### Command for SQuAD1.0:
+
+```bash
+export SQUAD_DIR=/path/to/SQUAD
+
+python run_squad.py \
+    --model_type xlnet \
+    --model_name_or_path xlnet-large-cased \
+    --do_train \
+    --do_eval \
+    --train_file $SQUAD_DIR/train-v1.1.json \
+    --predict_file $SQUAD_DIR/dev-v1.1.json \
+    --learning_rate 3e-5 \
+    --num_train_epochs 2 \
+    --max_seq_length 384 \
+    --doc_stride 128 \
+    --output_dir ./wwm_cased_finetuned_squad/ \
+    --per_gpu_eval_batch_size=4  \
+    --per_gpu_train_batch_size=4   \
+    --save_steps 5000
+```
+
+##### Command for SQuAD2.0:
+
+```bash
+export SQUAD_DIR=/path/to/SQUAD
+
+python run_squad.py \
+    --model_type xlnet \
+    --model_name_or_path xlnet-large-cased \
+    --do_train \
+    --do_eval \
+    --version_2_with_negative \
+    --train_file $SQUAD_DIR/train-v2.0.json \
+    --predict_file $SQUAD_DIR/dev-v2.0.json \
+    --learning_rate 3e-5 \
+    --num_train_epochs 4 \
+    --max_seq_length 384 \
+    --doc_stride 128 \
+    --output_dir ./wwm_cased_finetuned_squad/ \
+    --per_gpu_eval_batch_size=2  \
+    --per_gpu_train_batch_size=2   \
+    --save_steps 5000
+```
+
+Larger batch size may improve the performance while costing more memory.
+
+##### Results for SQuAD1.0 with the previously defined hyper-parameters:
+
+```python
+{
+"exact": 85.45884578997162,
+"f1": 92.5974600601065,
+"total": 10570,
+"HasAns_exact": 85.45884578997162,
+"HasAns_f1": 92.59746006010651,
+"HasAns_total": 10570
+}
+```
+
+##### Results for SQuAD2.0 with the previously defined hyper-parameters:
+
+```python
+{
+"exact": 80.4177545691906,
+"f1": 84.07154997729623,
+"total": 11873,
+"HasAns_exact": 76.73751686909581,
+"HasAns_f1": 84.05558584352873,
+"HasAns_total": 5928,
+"NoAns_exact": 84.0874684608915,
+"NoAns_f1": 84.0874684608915,
+"NoAns_total": 5945
+}
+```
+
+## SQuAD with the Tensorflow Trainer
+
+```bash
+python run_tf_squad.py \
+    --model_name_or_path bert-base-uncased \
+    --output_dir model \
+    --max_seq_length 384 \
+    --num_train_epochs 2 \
+    --per_gpu_train_batch_size 8 \
+    --per_gpu_eval_batch_size 16 \
+    --do_train \
+    --logging_dir logs \
+    --mode question-answering \
+    --logging_steps 10 \
+    --learning_rate 3e-5 \
+    --doc_stride 128 \
+    --optimizer_name adamw
+```
+
+For the moment, the Tensorflow Trainer supports training only; evaluation is not yet available.
--- a/examples/question-answering/run_squad.py
+++ b/examples/question-answering/run_squad.py
@@ -58,8 +58,6 @@ logger = logging.getLogger(__name__)
 MODEL_CONFIG_CLASSES = list(MODEL_FOR_QUESTION_ANSWERING_MAPPING.keys())
 MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)

-ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in MODEL_CONFIG_CLASSES), (),)
-

 def set_seed(args):
    random.seed(args.seed)
@@ -307,7 +305,7 @@ def evaluate(args, model, tokenizer, prefix=""):
            if args.model_type in ["xlm", "roberta", "distilbert", "camembert"]:
                del inputs["token_type_ids"]

-            example_indices = batch[3]
+            feature_indices = batch[3]

            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ["xlnet", "xlm"]:
@@ -320,8 +318,9 @@ def evaluate(args, model, tokenizer, prefix=""):

            outputs = model(**inputs)

-        for i, example_index in enumerate(example_indices):
-            eval_feature = features[example_index.item()]
+        for i, feature_index in enumerate(feature_indices):
+            # TODO: i and feature_index are the same number! Simplify by removing enumerate?
+            eval_feature = features[feature_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]
@@ -490,7 +489,7 @@ def main():
        default=None,
        type=str,
        required=True,
-        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
+        help="Path to pretrained model or model identifier from huggingface.co/models",
    )
    parser.add_argument(
        "--output_dir",
--- a/examples/question-answering/run_tf_squad.py
+++ b/examples/question-answering/run_tf_squad.py
@@ -0,0 +1,237 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Fine-tuning the library models for question-answering."""
+
+
+import logging
+import os
+from dataclasses import dataclass, field
+from typing import Optional
+
+from transformers import (
+    AutoConfig,
+    AutoTokenizer,
+    HfArgumentParser,
+    TFAutoModelForQuestionAnswering,
+    TFTrainer,
+    TFTrainingArguments,
+    squad_convert_examples_to_features,
+)
+from transformers.data.processors.squad import SquadV1Processor, SquadV2Processor
+
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class ModelArguments:
+    """
+    Arguments selecting the pretrained model, config and tokenizer to fine-tune, and where downloads are cached.
+    """
+
+    model_name_or_path: str = field(
+        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
+    )
+    config_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
+    )
+    tokenizer_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
+    )
+    use_fast: bool = field(default=False, metadata={"help": "Set this flag to use fast tokenization."})
+    # Only `use_fast` is exposed here. To tweak other tokenizer attributes, use a separate script
+    # or edit the tokenizer's tokenizer_config.json directly.
+    cache_dir: Optional[str] = field(
+        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
+    )
+
+
+@dataclass
+class DataTrainingArguments:
+    """
+    Arguments pertaining to what data we are going to input our model for training and eval.
+    """
+
+    data_dir: Optional[str] = field(
+        default=None, metadata={"help": "The input data dir. Should contain the .json files for the SQuAD task."}
+    )
+    max_seq_length: int = field(
+        default=128,
+        metadata={
+            "help": "The maximum total input sequence length after tokenization. Sequences longer "
+            "than this will be truncated, sequences shorter will be padded."
+        },
+    )
+    doc_stride: int = field(
+        default=128,
+        metadata={"help": "When splitting up a long document into chunks, how much stride to take between chunks."},
+    )
+    max_query_length: int = field(
+        default=64,
+        metadata={
+            "help": "The maximum number of tokens for the question. Questions longer than this will "
+            "be truncated to this length."
+        },
+    )
+    max_answer_length: int = field(
+        default=30,
+        metadata={
+            "help": "The maximum length of an answer that can be generated. This is needed because the start "
+            "and end predictions are not conditioned on one another."
+        },
+    )
+    overwrite_cache: bool = field(
+        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
+    )
+    version_2_with_negative: bool = field(
+        default=False, metadata={"help": "If true, the SQuAD examples contain some that do not have an answer."}
+    )
+    null_score_diff_threshold: float = field(
+        default=0.0, metadata={"help": "If null_score - best_non_null is greater than the threshold predict null."}
+    )
+    n_best_size: int = field(
+        default=20, metadata={"help": "The total number of n-best predictions to generate."}
+    )
+    lang_id: int = field(
+        default=0,
+        metadata={
+            "help": "language id of input for language-specific xlm models (see tokenization_xlm.PRETRAINED_INIT_CONFIGURATION)"
+        },
+    )
+
+
+def main():
+    # See all possible arguments in src/transformers/training_args.py
+    # or by passing the --help flag to this script.
+    # We now keep distinct sets of args, for a cleaner separation of concerns.
+    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments))
+    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+
+    if (
+        os.path.exists(training_args.output_dir)
+        and os.listdir(training_args.output_dir)
+        and training_args.do_train
+        and not training_args.overwrite_output_dir
+    ):
+        raise ValueError(
+            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
+        )
+
+    # Setup logging
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(
+        "n_gpu: %s, distributed training: %s, 16-bits training: %s",
+        training_args.n_gpu,
+        bool(training_args.n_gpu > 1),
+        training_args.fp16,
+    )
+    logger.info("Training/evaluation parameters %s", training_args)
+
+    # Prepare Question-Answering task
+    # Load pretrained model and tokenizer
+    #
+    # Distributed training:
+    # The .from_pretrained methods guarantee that only one local process can concurrently
+    # download model & vocab.
+
+    config = AutoConfig.from_pretrained(
+        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
+        cache_dir=model_args.cache_dir,
+    )
+    tokenizer = AutoTokenizer.from_pretrained(
+        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
+        cache_dir=model_args.cache_dir,
+        use_fast=model_args.use_fast,
+    )
+
+    with training_args.strategy.scope():
+        model = TFAutoModelForQuestionAnswering.from_pretrained(
+            model_args.model_name_or_path,
+            from_pt=bool(".bin" in model_args.model_name_or_path),
+            config=config,
+            cache_dir=model_args.cache_dir,
+        )
+
+    # Get datasets
+    if not data_args.data_dir:
+        if data_args.version_2_with_negative:
+            logger.warning("tensorflow_datasets does not handle version 2 of SQuAD. Switch to version 1 automatically")
+
+        try:
+            import tensorflow_datasets as tfds
+        except ImportError:
+            raise ImportError("If no data_dir is specified, tensorflow_datasets needs to be installed.")
+
+        tfds_examples = tfds.load("squad")
+        train_examples = (
+            SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=False)
+            if training_args.do_train
+            else None
+        )
+        eval_examples = (
+            SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=True)
+            if training_args.do_eval
+            else None
+        )
+    else:
+        processor = SquadV2Processor() if data_args.version_2_with_negative else SquadV1Processor()
+        train_examples = processor.get_train_examples(data_args.data_dir) if training_args.do_train else None
+        eval_examples = processor.get_dev_examples(data_args.data_dir) if training_args.do_eval else None
+
+    train_dataset = (
+        squad_convert_examples_to_features(
+            examples=train_examples,
+            tokenizer=tokenizer,
+            max_seq_length=data_args.max_seq_length,
+            doc_stride=data_args.doc_stride,
+            max_query_length=data_args.max_query_length,
+            is_training=True,
+            return_dataset="tf",
+        )
+        if training_args.do_train
+        else None
+    )
+
+    eval_dataset = (
+        squad_convert_examples_to_features(
+            examples=eval_examples,
+            tokenizer=tokenizer,
+            max_seq_length=data_args.max_seq_length,
+            doc_stride=data_args.doc_stride,
+            max_query_length=data_args.max_query_length,
+            is_training=False,
+            return_dataset="tf",
+        )
+        if training_args.do_eval
+        else None
+    )
+
+    # Initialize our Trainer
+    trainer = TFTrainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset,)
+
+    # Training
+    if training_args.do_train:
+        trainer.train()
+        trainer.save_model()
+        tokenizer.save_pretrained(training_args.output_dir)
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/requirements.txt
+++ b/examples/requirements.txt
@@ -1,5 +1,9 @@
-tensorboardX
 tensorboard
 scikit-learn
 seqeval
 psutil
+sacrebleu
+rouge-score
+tensorflow_datasets
+pytorch-lightning==0.7.3  # April 10, 2020 release
+matplotlib
--- a/examples/run_language_modeling.py
+++ b/examples/run_language_modeling.py
@@ -1,778 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Fine-tuning the library models for language modeling on a text file (GPT, GPT-2, BERT, RoBERTa).
-GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa are fine-tuned
-using a masked language modeling (MLM) loss.
-"""
-
-
-import argparse
-import glob
-import logging
-import os
-import pickle
-import random
-import re
-import shutil
-from typing import Dict, List, Tuple
-
-import numpy as np
-import torch
-from torch.nn.utils.rnn import pad_sequence
-from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
-from torch.utils.data.distributed import DistributedSampler
-from tqdm import tqdm, trange
-
-from transformers import (
-    CONFIG_MAPPING,
-    MODEL_WITH_LM_HEAD_MAPPING,
-    WEIGHTS_NAME,
-    AdamW,
-    AutoConfig,
-    AutoModelWithLMHead,
-    AutoTokenizer,
-    PreTrainedModel,
-    PreTrainedTokenizer,
-    get_linear_schedule_with_warmup,
-)
-
-
-try:
-    from torch.utils.tensorboard import SummaryWriter
-except ImportError:
-    from tensorboardX import SummaryWriter
-
-
-logger = logging.getLogger(__name__)
-
-
-MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys())
-MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
-
-
-class TextDataset(Dataset):
-    def __init__(self, tokenizer: PreTrainedTokenizer, args, file_path: str, block_size=512):
-        assert os.path.isfile(file_path)
-
-        block_size = block_size - (tokenizer.max_len - tokenizer.max_len_single_sentence)
-
-        directory, filename = os.path.split(file_path)
-        cached_features_file = os.path.join(
-            directory, args.model_type + "_cached_lm_" + str(block_size) + "_" + filename
-        )
-
-        if os.path.exists(cached_features_file) and not args.overwrite_cache:
-            logger.info("Loading features from cached file %s", cached_features_file)
-            with open(cached_features_file, "rb") as handle:
-                self.examples = pickle.load(handle)
-        else:
-            logger.info("Creating features from dataset file at %s", directory)
-
-            self.examples = []
-            with open(file_path, encoding="utf-8") as f:
-                text = f.read()
-
-            tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
-
-            for i in range(0, len(tokenized_text) - block_size + 1, block_size):  # Truncate in block of block_size
-                self.examples.append(tokenizer.build_inputs_with_special_tokens(tokenized_text[i : i + block_size]))
-            # Note that we are loosing the last truncated example here for the sake of simplicity (no padding)
-            # If your dataset is small, first you should loook for a bigger one :-) and second you
-            # can change this behavior by adding (model specific) padding.
-
-            logger.info("Saving features into cached file %s", cached_features_file)
-            with open(cached_features_file, "wb") as handle:
-                pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)
-
-    def __len__(self):
-        return len(self.examples)
-
-    def __getitem__(self, item):
-        return torch.tensor(self.examples[item], dtype=torch.long)
-
-
-class LineByLineTextDataset(Dataset):
-    def __init__(self, tokenizer: PreTrainedTokenizer, args, file_path: str, block_size=512):
-        assert os.path.isfile(file_path)
-        # Here, we do not cache the features, operating under the assumption
-        # that we will soon use fast multithreaded tokenizers from the
-        # `tokenizers` repo everywhere =)
-        logger.info("Creating features from dataset file at %s", file_path)
-
-        with open(file_path, encoding="utf-8") as f:
-            lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]
-
-        self.examples = tokenizer.batch_encode_plus(lines, add_special_tokens=True, max_length=block_size)["input_ids"]
-
-    def __len__(self):
-        return len(self.examples)
-
-    def __getitem__(self, i):
-        return torch.tensor(self.examples[i], dtype=torch.long)
-
-
-def load_and_cache_examples(args, tokenizer, evaluate=False):
-    file_path = args.eval_data_file if evaluate else args.train_data_file
-    if args.line_by_line:
-        return LineByLineTextDataset(tokenizer, args, file_path=file_path, block_size=args.block_size)
-    else:
-        return TextDataset(tokenizer, args, file_path=file_path, block_size=args.block_size)
-
-
-def set_seed(args):
-    random.seed(args.seed)
-    np.random.seed(args.seed)
-    torch.manual_seed(args.seed)
-    if args.n_gpu > 0:
-        torch.cuda.manual_seed_all(args.seed)
-
-
-def _sorted_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -> List[str]:
-    ordering_and_checkpoint_path = []
-
-    glob_checkpoints = glob.glob(os.path.join(args.output_dir, "{}-*".format(checkpoint_prefix)))
-
-    for path in glob_checkpoints:
-        if use_mtime:
-            ordering_and_checkpoint_path.append((os.path.getmtime(path), path))
-        else:
-            regex_match = re.match(".*{}-([0-9]+)".format(checkpoint_prefix), path)
-            if regex_match and regex_match.groups():
-                ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))
-
-    checkpoints_sorted = sorted(ordering_and_checkpoint_path)
-    checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]
-    return checkpoints_sorted
-
-
-def _rotate_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -> None:
-    if not args.save_total_limit:
-        return
-    if args.save_total_limit <= 0:
-        return
-
-    # Check if we should delete older checkpoint(s)
-    checkpoints_sorted = _sorted_checkpoints(args, checkpoint_prefix, use_mtime)
-    if len(checkpoints_sorted) <= args.save_total_limit:
-        return
-
-    number_of_checkpoints_to_delete = max(0, len(checkpoints_sorted) - args.save_total_limit)
-    checkpoints_to_be_deleted = checkpoints_sorted[:number_of_checkpoints_to_delete]
-    for checkpoint in checkpoints_to_be_deleted:
-        logger.info("Deleting older checkpoint [{}] due to args.save_total_limit".format(checkpoint))
-        shutil.rmtree(checkpoint)
-
-
-def mask_tokens(inputs: torch.Tensor, tokenizer: PreTrainedTokenizer, args) -> Tuple[torch.Tensor, torch.Tensor]:
-    """ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. """
-
-    if tokenizer.mask_token is None:
-        raise ValueError(
-            "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer."
-        )
-
-    labels = inputs.clone()
-    # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
-    probability_matrix = torch.full(labels.shape, args.mlm_probability)
-    special_tokens_mask = [
-        tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
-    ]
-    probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
-    if tokenizer._pad_token is not None:
-        padding_mask = labels.eq(tokenizer.pad_token_id)
-        probability_matrix.masked_fill_(padding_mask, value=0.0)
-    masked_indices = torch.bernoulli(probability_matrix).bool()
-    labels[~masked_indices] = -100  # We only compute loss on masked tokens
-
-    # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
-    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
-    inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)
-
-    # 10% of the time, we replace masked input tokens with random word
-    indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
-    random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
-    inputs[indices_random] = random_words[indices_random]
-
-    # The rest of the time (10% of the time) we keep the masked input tokens unchanged
-    return inputs, labels
-
-
-def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer) -> Tuple[int, float]:
-    """ Train the model """
-    if args.local_rank in [-1, 0]:
-        tb_writer = SummaryWriter()
-
-    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
-
-    def collate(examples: List[torch.Tensor]):
-        if tokenizer._pad_token is None:
-            return pad_sequence(examples, batch_first=True)
-        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)
-
-    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
-    train_dataloader = DataLoader(
-        train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate
-    )
-
-    if args.max_steps > 0:
-        t_total = args.max_steps
-        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
-    else:
-        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
-
-    # Prepare optimizer and schedule (linear warmup and decay)
-    no_decay = ["bias", "LayerNorm.weight"]
-    optimizer_grouped_parameters = [
-        {
-            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
-            "weight_decay": args.weight_decay,
-        },
-        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
-    ]
-    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = get_linear_schedule_with_warmup(
-        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
-    )
-
-    # Check if saved optimizer or scheduler states exist
-    if (
-        args.model_name_or_path
-        and os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt"))
-        and os.path.isfile(os.path.join(args.model_name_or_path, "scheduler.pt"))
-    ):
-        # Load in optimizer and scheduler states
-        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
-        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))
-
-    if args.fp16:
-        try:
-            from apex import amp
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
-
-    # multi-gpu training (should be after apex fp16 initialization)
-    if args.n_gpu > 1:
-        model = torch.nn.DataParallel(model)
-
-    # Distributed training (should be after apex fp16 initialization)
-    if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(
-            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
-        )
-
-    # Train!
-    logger.info("***** Running training *****")
-    logger.info("  Num examples = %d", len(train_dataset))
-    logger.info("  Num Epochs = %d", args.num_train_epochs)
-    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
-    logger.info(
-        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
-        args.train_batch_size
-        * args.gradient_accumulation_steps
-        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
-    )
-    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
-    logger.info("  Total optimization steps = %d", t_total)
-
-    global_step = 0
-    epochs_trained = 0
-    steps_trained_in_current_epoch = 0
-    # Check if continuing training from a checkpoint
-    if args.model_name_or_path and os.path.exists(args.model_name_or_path):
-        try:
-            # set global_step to gobal_step of last saved checkpoint from model path
-            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
-            global_step = int(checkpoint_suffix)
-            epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
-            steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
-
-            logger.info("  Continuing training from checkpoint, will skip to saved global_step")
-            logger.info("  Continuing training from epoch %d", epochs_trained)
-            logger.info("  Continuing training from global step %d", global_step)
-            logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
-        except ValueError:
-            logger.info("  Starting fine-tuning.")
-
-    tr_loss, logging_loss = 0.0, 0.0
-
-    model_to_resize = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
-    model_to_resize.resize_token_embeddings(len(tokenizer))
-
-    model.zero_grad()
-    train_iterator = trange(
-        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
-    )
-    set_seed(args)  # Added here for reproducibility
-    for _ in train_iterator:
-        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
-        for step, batch in enumerate(epoch_iterator):
-
-            # Skip past any already trained steps if resuming training
-            if steps_trained_in_current_epoch > 0:
-                steps_trained_in_current_epoch -= 1
-                continue
-
-            inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch)
-            inputs = inputs.to(args.device)
-            labels = labels.to(args.device)
-            model.train()
-            outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels)
-            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
-
-            if args.n_gpu > 1:
-                loss = loss.mean()  # mean() to average on multi-gpu parallel training
-            if args.gradient_accumulation_steps > 1:
-                loss = loss / args.gradient_accumulation_steps
-
-            if args.fp16:
-                with amp.scale_loss(loss, optimizer) as scaled_loss:
-                    scaled_loss.backward()
-            else:
-                loss.backward()
-
-            tr_loss += loss.item()
-            if (step + 1) % args.gradient_accumulation_steps == 0:
-                if args.fp16:
-                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
-                else:
-                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
-                optimizer.step()
-                scheduler.step()  # Update learning rate schedule
-                model.zero_grad()
-                global_step += 1
-
-                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
-                    # Log metrics
-                    if (
-                        args.local_rank == -1 and args.evaluate_during_training
-                    ):  # Only evaluate when single GPU otherwise metrics may not average well
-                        results = evaluate(args, model, tokenizer)
-                        for key, value in results.items():
-                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
-                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
-                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
-                    logging_loss = tr_loss
-
-                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
-                    checkpoint_prefix = "checkpoint"
-                    # Save model checkpoint
-                    output_dir = os.path.join(args.output_dir, "{}-{}".format(checkpoint_prefix, global_step))
-                    os.makedirs(output_dir, exist_ok=True)
-                    model_to_save = (
-                        model.module if hasattr(model, "module") else model
-                    )  # Take care of distributed/parallel training
-                    model_to_save.save_pretrained(output_dir)
-                    tokenizer.save_pretrained(output_dir)
-
-                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
-                    logger.info("Saving model checkpoint to %s", output_dir)
-
-                    _rotate_checkpoints(args, checkpoint_prefix)
-
-                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
-                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
-                    logger.info("Saving optimizer and scheduler states to %s", output_dir)
-
-            if args.max_steps > 0 and global_step > args.max_steps:
-                epoch_iterator.close()
-                break
-        if args.max_steps > 0 and global_step > args.max_steps:
-            train_iterator.close()
-            break
-
-    if args.local_rank in [-1, 0]:
-        tb_writer.close()
-
-    return global_step, tr_loss / global_step
-
-
-def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prefix="") -> Dict:
-    # Loop to handle MNLI double evaluation (matched, mis-matched)
-    eval_output_dir = args.output_dir
-
-    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)
-
-    if args.local_rank in [-1, 0]:
-        os.makedirs(eval_output_dir, exist_ok=True)
-
-    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
-    # Note that DistributedSampler samples randomly
-
-    def collate(examples: List[torch.Tensor]):
-        if tokenizer._pad_token is None:
-            return pad_sequence(examples, batch_first=True)
-        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)
-
-    eval_sampler = SequentialSampler(eval_dataset)
-    eval_dataloader = DataLoader(
-        eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate
-    )
-
-    # multi-gpu evaluate
-    if args.n_gpu > 1:
-        model = torch.nn.DataParallel(model)
-
-    # Eval!
-    logger.info("***** Running evaluation {} *****".format(prefix))
-    logger.info("  Num examples = %d", len(eval_dataset))
-    logger.info("  Batch size = %d", args.eval_batch_size)
-    eval_loss = 0.0
-    nb_eval_steps = 0
-    model.eval()
-
-    for batch in tqdm(eval_dataloader, desc="Evaluating"):
-        inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch)
-        inputs = inputs.to(args.device)
-        labels = labels.to(args.device)
-
-        with torch.no_grad():
-            outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels)
-            lm_loss = outputs[0]
-            eval_loss += lm_loss.mean().item()
-        nb_eval_steps += 1
-
-    eval_loss = eval_loss / nb_eval_steps
-    perplexity = torch.exp(torch.tensor(eval_loss))
-
-    result = {"perplexity": perplexity}
-
-    output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
-    with open(output_eval_file, "w") as writer:
-        logger.info("***** Eval results {} *****".format(prefix))
-        for key in sorted(result.keys()):
-            logger.info("  %s = %s", key, str(result[key]))
-            writer.write("%s = %s\n" % (key, str(result[key])))
-
-    return result
-
-
-def main():
-    parser = argparse.ArgumentParser()
-
-    # Required parameters
-    parser.add_argument(
-        "--train_data_file", default=None, type=str, required=True, help="The input training data file (a text file)."
-    )
-    parser.add_argument(
-        "--output_dir",
-        type=str,
-        required=True,
-        help="The output directory where the model predictions and checkpoints will be written.",
-    )
-    parser.add_argument(
-        "--model_type", type=str, required=True, help="The model architecture to be trained or fine-tuned.",
-    )
-
-    # Other parameters
-    parser.add_argument(
-        "--eval_data_file",
-        default=None,
-        type=str,
-        help="An optional input evaluation data file to evaluate the perplexity on (a text file).",
-    )
-    parser.add_argument(
-        "--line_by_line",
-        action="store_true",
-        help="Whether distinct lines of text in the dataset are to be handled as distinct sequences.",
-    )
-    parser.add_argument(
-        "--should_continue", action="store_true", help="Whether to continue from latest checkpoint in output_dir"
-    )
-    parser.add_argument(
-        "--model_name_or_path",
-        default=None,
-        type=str,
-        help="The model checkpoint for weights initialization. Leave None if you want to train a model from scratch.",
-    )
-
-    parser.add_argument(
-        "--mlm", action="store_true", help="Train with masked-language modeling loss instead of language modeling."
-    )
-    parser.add_argument(
-        "--mlm_probability", type=float, default=0.15, help="Ratio of tokens to mask for masked language modeling loss"
-    )
-
-    parser.add_argument(
-        "--config_name",
-        default=None,
-        type=str,
-        help="Optional pretrained config name or path if not the same as model_name_or_path. If both are None, initialize a new config.",
-    )
-    parser.add_argument(
-        "--tokenizer_name",
-        default=None,
-        type=str,
-        help="Optional pretrained tokenizer name or path if not the same as model_name_or_path. If both are None, initialize a new tokenizer.",
-    )
-    parser.add_argument(
-        "--cache_dir",
-        default=None,
-        type=str,
-        help="Optional directory to store the pre-trained models downloaded from s3 (instead of the default one)",
-    )
-    parser.add_argument(
-        "--block_size",
-        default=-1,
-        type=int,
-        help="Optional input sequence length after tokenization."
-        "The training dataset will be truncated in block of this size for training."
-        "Default to the model max input length for single sentence inputs (take into account special tokens).",
-    )
-    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
-    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
-    parser.add_argument(
-        "--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step."
-    )
-
-    parser.add_argument("--per_gpu_train_batch_size", default=4, type=int, help="Batch size per GPU/CPU for training.")
-    parser.add_argument(
-        "--per_gpu_eval_batch_size", default=4, type=int, help="Batch size per GPU/CPU for evaluation."
-    )
-    parser.add_argument(
-        "--gradient_accumulation_steps",
-        type=int,
-        default=1,
-        help="Number of updates steps to accumulate before performing a backward/update pass.",
-    )
-    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
-    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
-    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
-    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
-    parser.add_argument(
-        "--num_train_epochs", default=1.0, type=float, help="Total number of training epochs to perform."
-    )
-    parser.add_argument(
-        "--max_steps",
-        default=-1,
-        type=int,
-        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
-    )
-    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
-
-    parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.")
-    parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.")
-    parser.add_argument(
-        "--save_total_limit",
-        type=int,
-        default=None,
-        help="Limit the total amount of checkpoints, delete the older checkpoints in the output_dir, does not delete by default",
-    )
-    parser.add_argument(
-        "--eval_all_checkpoints",
-        action="store_true",
-        help="Evaluate all checkpoints starting with the same prefix as model_name_or_path ending and ending with step number",
-    )
-    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
-    parser.add_argument(
-        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
-    )
-    parser.add_argument(
-        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
-    )
-    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
-
-    parser.add_argument(
-        "--fp16",
-        action="store_true",
-        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
-    )
-    parser.add_argument(
-        "--fp16_opt_level",
-        type=str,
-        default="O1",
-        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
-        "See details at https://nvidia.github.io/apex/amp.html",
-    )
-    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
-    parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
-    parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
-    args = parser.parse_args()
-
-    if args.model_type in ["bert", "roberta", "distilbert", "camembert"] and not args.mlm:
-        raise ValueError(
-            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the --mlm "
-            "flag (masked language modeling)."
-        )
-    if args.eval_data_file is None and args.do_eval:
-        raise ValueError(
-            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
-            "or remove the --do_eval argument."
-        )
-    if args.should_continue:
-        sorted_checkpoints = _sorted_checkpoints(args)
-        if len(sorted_checkpoints) == 0:
-            raise ValueError("Used --should_continue but no checkpoint was found in --output_dir.")
-        else:
-            args.model_name_or_path = sorted_checkpoints[-1]
-
-    if (
-        os.path.exists(args.output_dir)
-        and os.listdir(args.output_dir)
-        and args.do_train
-        and not args.overwrite_output_dir
-    ):
-        raise ValueError(
-            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
-                args.output_dir
-            )
-        )
-
-    # Setup distant debugging if needed
-    if args.server_ip and args.server_port:
-        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
-        import ptvsd
-
-        print("Waiting for debugger attach")
-        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
-        ptvsd.wait_for_attach()
-
-    # Setup CUDA, GPU & distributed training
-    if args.local_rank == -1 or args.no_cuda:
-        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
-    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
-        torch.cuda.set_device(args.local_rank)
-        device = torch.device("cuda", args.local_rank)
-        torch.distributed.init_process_group(backend="nccl")
-        args.n_gpu = 1
-    args.device = device
-
-    # Setup logging
-    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
-        datefmt="%m/%d/%Y %H:%M:%S",
-        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
-    )
-    logger.warning(
-        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
-        args.local_rank,
-        device,
-        args.n_gpu,
-        bool(args.local_rank != -1),
-        args.fp16,
-    )
-
-    # Set seed
-    set_seed(args)
-
-    # Load pretrained model and tokenizer
-    if args.local_rank not in [-1, 0]:
-        torch.distributed.barrier()  # Barrier to make sure only the first process in distributed training download model & vocab
-
-    if args.config_name:
-        config = AutoConfig.from_pretrained(args.config_name, cache_dir=args.cache_dir)
-    elif args.model_name_or_path:
-        config = AutoConfig.from_pretrained(args.model_name_or_path, cache_dir=args.cache_dir)
-    else:
-        config = CONFIG_MAPPING[args.model_type]()
-
-    if args.tokenizer_name:
-        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, cache_dir=args.cache_dir)
-    elif args.model_name_or_path:
-        tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, cache_dir=args.cache_dir)
-    else:
-        raise ValueError(
-            "You are instantiating a new {} tokenizer. This is not supported, but you can do it from another script, save it,"
-            "and load it from here, using --tokenizer_name".format(AutoTokenizer.__name__)
-        )
-
-    if args.block_size <= 0:
-        args.block_size = tokenizer.max_len
-        # Our input block size will be the max possible for the model
-    else:
-        args.block_size = min(args.block_size, tokenizer.max_len)
-
-    if args.model_name_or_path:
-        model = AutoModelWithLMHead.from_pretrained(
-            args.model_name_or_path,
-            from_tf=bool(".ckpt" in args.model_name_or_path),
-            config=config,
-            cache_dir=args.cache_dir,
-        )
-    else:
-        logger.info("Training new model from scratch")
-        model = AutoModelWithLMHead(config=config)
-
-    model.to(args.device)
-
-    if args.local_rank == 0:
-        torch.distributed.barrier()  # End of barrier to make sure only the first process in distributed training download model & vocab
-
-    logger.info("Training/evaluation parameters %s", args)
-
-    # Training
-    if args.do_train:
-        if args.local_rank not in [-1, 0]:
-            torch.distributed.barrier()  # Barrier to make sure only the first process in distributed training process the dataset, and the others will use the cache
-
-        train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False)
-
-        if args.local_rank == 0:
-            torch.distributed.barrier()
-
-        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
-        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
-
-    # Saving best-practices: if you use save_pretrained for the model and tokenizer, you can reload them using from_pretrained()
-    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
-        # Create output directory if needed
-        if args.local_rank in [-1, 0]:
-            os.makedirs(args.output_dir, exist_ok=True)
-
-        logger.info("Saving model checkpoint to %s", args.output_dir)
-        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
-        # They can then be reloaded using `from_pretrained()`
-        model_to_save = (
-            model.module if hasattr(model, "module") else model
-        )  # Take care of distributed/parallel training
-        model_to_save.save_pretrained(args.output_dir)
-        tokenizer.save_pretrained(args.output_dir)
-
-        # Good practice: save your training arguments together with the trained model
-        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
-
-        # Load a trained model and vocabulary that you have fine-tuned
-        model = AutoModelWithLMHead.from_pretrained(args.output_dir)
-        tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
-        model.to(args.device)
-
-    # Evaluation
-    results = {}
-    if args.do_eval and args.local_rank in [-1, 0]:
-        checkpoints = [args.output_dir]
-        if args.eval_all_checkpoints:
-            checkpoints = list(
-                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
-            )
-            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
-        logger.info("Evaluate the following checkpoints: %s", checkpoints)
-        for checkpoint in checkpoints:
-            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
-            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""
-
-            model = AutoModelWithLMHead.from_pretrained(checkpoint)
-            model.to(args.device)
-            result = evaluate(args, model, tokenizer, prefix=prefix)
-            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
-            results.update(result)
-
-    return results
-
-
-if __name__ == "__main__":
-    main()
--- a/Show More
+++ b/Show More