Release: v2.5.0

Welcome Rust Tokenizers
Integrate fast tokenizers library inside transformers (#2674 )
2020-02-19 11:46:19 -05:00 · 2020-02-19 11:35:40 -05:00 · 2020-02-19 10:51:16 -05:00 · 2020-02-18 16:14:50 -05:00 · 2020-02-18 16:17:35 +00:00 · 2020-02-17 20:19:57 +00:00
373 changed files with 62624 additions and 22885 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -1,87 +1,121 @@
 version: 2
 jobs:
-    build_py3_torch_and_tf:
+    run_tests_torch_and_tf:
        working_directory: ~/transformers
        docker:
            - image: circleci/python:3.5
+        environment:
+            OMP_NUM_THREADS: 1
        resource_class: xlarge
        parallelism: 1
        steps:
            - checkout
-            - run: sudo pip install torch
-            - run: sudo pip install tensorflow==2.0.0-rc0
-            - run: sudo pip install --progress-bar off .
-            - run: sudo pip install pytest codecov pytest-cov
-            - run: sudo pip install tensorboardX scikit-learn
-            - run: python -m pytest -sv ./transformers/tests/ --cov
+            - run: sudo pip install .[sklearn,tf-cpu,torch,testing]
+            - run: sudo pip install codecov pytest-cov
+            - run: python -m pytest -n 8 --dist=loadfile -s -v ./tests/ --cov
            - run: codecov
-    build_py3_torch:
+    run_all_tests_torch_and_tf:
        working_directory: ~/transformers
        docker:
            - image: circleci/python:3.5
+        environment:
+            OMP_NUM_THREADS: 1
+            RUN_SLOW: yes
+            RUN_CUSTOM_TOKENIZERS: yes
        resource_class: xlarge
        parallelism: 1
        steps:
            - checkout
-            - run: sudo pip install torch
-            - run: sudo pip install --progress-bar off .
-            - run: sudo pip install pytest codecov pytest-cov
-            - run: sudo pip install tensorboardX scikit-learn
-            - run: python -m pytest -sv ./transformers/tests/ --cov
-            - run: python -m pytest -sv ./examples/
-            - run: codecov
-    build_py3_tf:
+            - run: sudo pip install .[mecab,sklearn,tf-cpu,torch,testing]
+            - run: python -m pytest -n 8 --dist=loadfile -s -v ./tests/
+            - no_output_timeout: 4h
+
+    run_tests_torch:
        working_directory: ~/transformers
        docker:
-            - image: circleci/python:3.5
+            - image: circleci/python:3.7
+        environment:
+            OMP_NUM_THREADS: 1
        resource_class: xlarge
        parallelism: 1
        steps:
            - checkout
-            - run: sudo pip install tensorflow==2.0.0-rc0
-            - run: sudo pip install --progress-bar off .
-            - run: sudo pip install pytest codecov pytest-cov
-            - run: sudo pip install tensorboardX scikit-learn
-            - run: python -m pytest -sv ./transformers/tests/ --cov
+            - run: sudo pip install .[sklearn,torch,testing]
+            - run: sudo pip install codecov pytest-cov
+            - run: python -m pytest -n 8 --dist=loadfile -s -v ./tests/ --cov
            - run: codecov
-    build_py2_torch:
+    run_tests_tf:
        working_directory: ~/transformers
-        resource_class: large
-        parallelism: 1
        docker:
-            - image: circleci/python:2.7
+            - image: circleci/python:3.7
+        environment:
+            OMP_NUM_THREADS: 1
+        resource_class: xlarge
+        parallelism: 1
        steps:
            - checkout
-            - run: sudo pip install torch
-            - run: sudo pip install --progress-bar off .
-            - run: sudo pip install pytest codecov pytest-cov
-            - run: python -m pytest -sv ./transformers/tests/ --cov
+            - run: sudo pip install .[sklearn,tf-cpu,testing]
+            - run: sudo pip install codecov pytest-cov
+            - run: python -m pytest -n 8 --dist=loadfile -s -v ./tests/ --cov
            - run: codecov
-    build_py2_tf:
+    run_tests_custom_tokenizers:
        working_directory: ~/transformers
-        resource_class: large
-        parallelism: 1
        docker:
-            - image: circleci/python:2.7
+            - image: circleci/python:3.5
+        environment:
+            RUN_CUSTOM_TOKENIZERS: yes
        steps:
            - checkout
-            - run: sudo pip install tensorflow==2.0.0-rc0
-            - run: sudo pip install --progress-bar off .
-            - run: sudo pip install pytest codecov pytest-cov
-            - run: python -m pytest -sv ./transformers/tests/ --cov
-            - run: codecov
+            - run: sudo pip install .[mecab,testing]
+            - run: python -m pytest -sv ./tests/test_tokenization_bert_japanese.py
+    run_examples_torch:
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.5
+        environment:
+            OMP_NUM_THREADS: 1
+        resource_class: xlarge
+        parallelism: 1
+        steps:
+            - checkout
+            - run: sudo pip install .[sklearn,torch,testing]
+            - run: sudo pip install -r examples/requirements.txt
+            - run: python -m pytest -n 8 --dist=loadfile -s -v ./examples/
    deploy_doc:
        working_directory: ~/transformers
        docker:
            - image: circleci/python:3.5
        steps:
            - add_ssh_keys:
-                  fingerprints:
-                      - "5b:7a:95:18:07:8c:aa:76:4c:60:35:88:ad:60:56:71"
+                fingerprints:
+                    - "5b:7a:95:18:07:8c:aa:76:4c:60:35:88:ad:60:56:71"
            - checkout
-            - run: sudo pip install --progress-bar off -r docs/requirements.txt
-            - run: sudo pip install --progress-bar off -r requirements.txt
-            - run: cd docs && make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html/* $doc:$dir
+            - run: sudo pip install .[tf,torch,docs]
+            - run: ./.circleci/deploy.sh
+    check_code_quality:
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.6
+        resource_class: medium
+        parallelism: 1
+        steps:
+            - checkout
+            # we need a version of isort with https://github.com/timothycrosley/isort/pull/1000
+            - run: sudo pip install git+git://github.com/timothycrosley/isort.git@e63ae06ec7d70b06df9e528357650281a3d3ec22#egg=isort
+            - run: sudo pip install .[tf,torch,quality]
+            - run: black --check --line-length 119 --target-version py35 examples templates tests src utils
+            - run: isort --check-only --recursive examples templates tests src utils
+            - run: flake8 examples templates tests src utils
+    check_repository_consistency:
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.5
+        resource_class: small
+        parallelism: 1
+        steps:
+            - checkout
+            - run: sudo pip install requests
+            - run: python ./utils/link_tester.py
 workflow_filters: &workflow_filters
    filters:
        branches:
@@ -91,9 +125,21 @@ workflows:
    version: 2
    build_and_test:
        jobs:
-            - build_py3_torch_and_tf
-            - build_py3_torch
-            - build_py3_tf
-            - build_py2_torch
-            - build_py2_tf
-            - deploy_doc: *workflow_filters
+            - check_code_quality
+            - check_repository_consistency
+            - run_examples_torch
+            - run_tests_custom_tokenizers
+            - run_tests_torch_and_tf
+            - run_tests_torch
+            - run_tests_tf
+            - deploy_doc: *workflow_filters
+    run_slow_tests:
+        triggers:
+            - schedule:
+                cron: "0 4 * * 1"
+                filters:
+                    branches:
+                        only:
+                            - master
+        jobs:
+            - run_all_tests_torch_and_tf
--- a/.circleci/deploy.sh
+++ b/.circleci/deploy.sh
@@ -0,0 +1,28 @@
+cd docs
+
+function deploy_doc(){
+	echo "Creating doc at commit $1 and pushing to folder $2"
+	git checkout $1
+	if [ ! -z "$2" ]
+	then
+		if [ -d "$dir/$2" ]; then
+			echo "Directory" $2 "already exists"
+		else
+			echo "Pushing version" $2
+			make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html $doc:$dir/$2
+		fi
+	else
+		echo "Pushing master"
+		make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html/* $doc:$dir
+	fi
+}
+
+deploy_doc "master"
+deploy_doc "b33a385" v1.0.0
+deploy_doc "fe02e45" v1.1.0
+deploy_doc "89fd345" v1.2.0
+deploy_doc "fc9faa8" v2.0.0
+deploy_doc "3ddce1d" v2.1.1
+deploy_doc "3616209" v2.2.0
+deploy_doc "d0f8b9a" v2.3.0
+deploy_doc "6664ea9" v2.4.0
--- a/.github/ISSUE_TEMPLATE/---new-benchmark.md
+++ b/.github/ISSUE_TEMPLATE/---new-benchmark.md
@@ -0,0 +1,22 @@
+---
+name: "\U0001F5A5 New benchmark"
+about: Benchmark a part of this library and share your results
+title: "[Benchmark]"
+labels: ''
+assignees: ''
+
+---
+
+# 🖥 Benchmarking `transformers`
+
+## Benchmark
+
+Which part of `transformers` did you benchmark?
+
+## Set-up
+
+What did you run your benchmarks on? Please include details, such as: CPU, GPU? If using multiple GPUs, which parallelization did you use?
+
+## Results
+
+Put your results here!
--- a/.github/ISSUE_TEMPLATE/--new-model-addition.md
+++ b/.github/ISSUE_TEMPLATE/--new-model-addition.md
@@ -1,5 +1,5 @@
 ---
-name: "\U0001F31FNew model addition"
+name: "\U0001F31F New model addition"
 about: Submit a proposal/request to implement a new Transformer-based model
 title: ''
 labels: ''
@@ -7,17 +7,14 @@ assignees: ''

 ---

-# 🌟New model addition
+# 🌟 New model addition

 ## Model description

 <!-- Important information -->

-## Open Source status
+## Open source status

 * [ ] the model implementation is available: (give details)
 * [ ] the model weights are available: (give details)
-
-## Additional context
-
-<!-- Add any other context about the problem here. -->
+* [ ] who are the authors: (mention them, if possible by @gh-username)
--- a/.github/ISSUE_TEMPLATE/bug-report.md
+++ b/.github/ISSUE_TEMPLATE/bug-report.md
@@ -1,29 +1,29 @@
 ---
 name: "\U0001F41B Bug Report"
-about: Submit a bug report to help us improve PyTorch Transformers
+about: Submit a bug report to help us improve transformers
 title: ''
 labels: ''
 assignees: ''

 ---

-## 🐛 Bug
+# 🐛 Bug

-<!-- Important information -->
+## Information

-Model I am using (Bert, XLNet....):
+Model I am using (Bert, XLNet ...):

-Language I am using the model on (English, Chinese....):
+Language I am using the model on (English, Chinese ...):

-The problem arise when using:
-* [ ] the official example scripts: (give details)
-* [ ] my own modified scripts: (give details)
+The problem arises when using:
+* [ ] the official example scripts: (give details below)
+* [ ] my own modified scripts: (give details below)

 The tasks I am working on is:
 * [ ] an official GLUE/SQUaD task: (give the name)
-* [ ] my own task or dataset: (give details)
+* [ ] my own task or dataset: (give details below)

-## To Reproduce
+## To reproduce

 Steps to reproduce the behavior:

@@ -31,22 +31,22 @@ Steps to reproduce the behavior:
 2.
 3.

-<!-- If you have a code sample, error messages, stack traces, please provide it here as well. -->
+<!-- If you have code snippets, error messages, stack traces please provide them here as well.
+     Important! Use code tags to correctly format your code. See https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting
+     Do not use screenshots, as they are hard to read and (more importantly) don't allow others to copy-and-paste your code.-->

 ## Expected behavior

-<!-- A clear and concise description of what you expected to happen. -->
+<!-- A clear and concise description of what you would expect to happen. -->

-## Environment
-
-* OS:
-* Python version:
-* PyTorch version:
-* PyTorch Transformers version (or branch):
-* Using GPU ?
-* Distributed of parallel setup ?
-* Any other relevant information:
-
-## Additional context
-
-<!-- Add any other context about the problem here. -->
+## Environment info
+<!-- You can run the command `python transformers-cli env` and copy-and-paste its output below.
+     Don't forget to fill out the missing fields in that output! -->
+     
+- `transformers` version:
+- Platform:
+- Python version:
+- PyTorch version (GPU?):
+- Tensorflow version (GPU?):
+- Using GPU in script?:
+- Using distributed or parallel set-up in script?:
--- a/.github/ISSUE_TEMPLATE/feature-request.md
+++ b/.github/ISSUE_TEMPLATE/feature-request.md
@@ -1,20 +1,25 @@
 ---
-name: "\U0001F680 Feature Request"
-about: Submit a proposal/request for a new PyTorch Transformers feature
+name: "\U0001F680 Feature request"
+about: Submit a proposal/request for a new transformers feature
 title: ''
 labels: ''
 assignees: ''

 ---

-## 🚀 Feature
+# 🚀 Feature request

-<!-- A clear and concise description of the feature proposal. Please provide a link to the paper and code in case they exist. -->
+<!-- A clear and concise description of the feature proposal.
+     Please provide a link to the paper and code in case they exist. -->

 ## Motivation

-<!-- Please outline the motivation for the proposal. Is your feature request related to a problem? e.g., I'm always frustrated when [...]. If this is related to another GitHub issue, please link here too. -->
+<!-- Please outline the motivation for the proposal. Is your feature request
+     related to a problem? e.g., I'm always frustrated when [...]. If this is related
+     to another GitHub issue, please link here too. -->

-## Additional context
+## Your contribution

-<!-- Add any other context or screenshots about the feature request here. -->
+<!-- Is there any way that you could help, e.g. by submitting a PR?
+     Make sure to read the CONTRIBUTING.MD readme:
+     https://github.com/huggingface/transformers/blob/master/CONTRIBUTING.md -->
--- a/.github/ISSUE_TEMPLATE/migration.md
+++ b/.github/ISSUE_TEMPLATE/migration.md
@@ -1,47 +1,57 @@
 ---
-name: "\U0001F4DA Migration from PyTorch-pretrained-Bert"
-about: Report a problem when migrating from PyTorch-pretrained-Bert to Transformers
+name: "\U0001F4DA Migration from pytorch-pretrained-bert or pytorch-transformers"
+about: Report a problem when migrating from pytorch-pretrained-bert or pytorch-transformers to transformers
 title: ''
 labels: ''
 assignees: ''

 ---

-## 📚 Migration
+# 📚 Migration
+
+## Information

 <!-- Important information -->

-Model I am using (Bert, XLNet....):
+Model I am using (Bert, XLNet ...):

-Language I am using the model on (English, Chinese....):
+Language I am using the model on (English, Chinese ...):

-The problem arise when using:
-* [ ] the official example scripts: (give details)
-* [ ] my own modified scripts: (give details)
+The problem arises when using:
+* [ ] the official example scripts: (give details below)
+* [ ] my own modified scripts: (give details below)

 The tasks I am working on is:
 * [ ] an official GLUE/SQUaD task: (give the name)
-* [ ] my own task or dataset: (give details)
+* [ ] my own task or dataset: (give details below)

-Details of the issue:
+## Details

-<!-- A clear and concise description of the migration issue. If you have code snippets, please provide it here as well. -->
+<!-- A clear and concise description of the migration issue.
+    If you have code snippets, please provide it here as well.
+    Important! Use code tags to correctly format your code. See https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting
+    Do not use screenshots, as they are hard to read and (more importantly) don't allow others to copy-and-paste your code.
+    -->

-## Environment
+## Environment info
+<!-- You can run the command `python transformers-cli env` and copy-and-paste its output below.
+     Don't forget to fill out the missing fields in that output! -->
+ 
+- `transformers` version:
+- Platform:
+- Python version:
+- PyTorch version (GPU?):
+- Tensorflow version (GPU?):
+- Using GPU in script?:
+- Using distributed or parallel set-up in script?:
+
+<!-- IMPORTANT: which version of the former library do you use? -->
+* `pytorch-transformers` or `pytorch-pretrained-bert` version (or branch):

-* OS:
-* Python version:
-* PyTorch version:
-* PyTorch Transformers version (or branch):
-* Using GPU ?
-* Distributed of parallel setup ?
-* Any other relevant information:

 ## Checklist

 - [ ] I have read the migration guide in the readme.
+ ([pytorch-transformers](https://github.com/huggingface/transformers#migrating-from-pytorch-transformers-to-transformers);
+  [pytorch-pretrained-bert](https://github.com/huggingface/transformers#migrating-from-pytorch-pretrained-bert-to-transformers))
 - [ ] I checked if a related official extension example runs on my machine.
-
-## Additional context
-
-<!-- Add any other context about the problem here. -->
--- a/.github/ISSUE_TEMPLATE/question-help.md
+++ b/.github/ISSUE_TEMPLATE/question-help.md
@@ -1,12 +1,29 @@
 ---
-name: "❓Questions & Help"
-about: Start a general discussion related to PyTorch Transformers
+name: "❓ Questions & Help"
+about: Post your general questions on Stack Overflow tagged huggingface-transformers
 title: ''
 labels: ''
 assignees: ''

 ---

-## ❓ Questions & Help
+# ❓ Questions & Help

-<!-- A clear and concise description of the question. -->
+<!-- The GitHub issue tracker is primarly intended for bugs, feature requests,
+     new models and benchmarks, and migration questions. For all other questions,
+     we direct you to Stack Overflow (SO) where a whole community of PyTorch and
+     Tensorflow enthusiast can help you out. Make sure to tag your question with the
+     right deep learning framework as well as the huggingface-transformers tag: 
+     https://stackoverflow.com/questions/tagged/huggingface-transformers 
+     
+     If your question wasn't answered after a period of time on Stack Overflow, you
+     can always open a question on GitHub. You should then link to the SO question 
+     that you posted.
+     -->
+
+## Details
+<!-- Description of your issue -->
+
+<!-- You should first ask your question on SO, and only if
+     you didn't get an answer ask it here on GitHub. -->
+**A link to original question on Stack Overflow**: 
--- a/.gitignore
+++ b/.gitignore
@@ -137,4 +137,8 @@ examples/runs
 serialization_dir

 # emacs
-*.*~
+*.*~
+debug.env
+
+# vim
+.*.swp
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -41,14 +41,10 @@ Did not find it? :( So we can act quickly on it, please follow these steps:
  less than 30s;
 * Provide the *full* traceback if an exception is raised.

-To get the OS and software versions, execute the following code and copy-paste
-the output:
+To get the OS and software versions automatically, you can run the following command:

-```
-import platform; print("Platform", platform.platform())
-import sys; print("Python", sys.version)
-import torch; print("PyTorch", torch.__version__)
-import tensorflow; print("Tensorflow", tensorflow.__version__)
+```bash
+python transformers-cli env
 ```

 ### Do you want to implement a new model?
@@ -62,6 +58,8 @@ Awesome! Please provide the following information:
 If you are willing to contribute the model yourself, let us know so we can best
 guide you.

+We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder.
+
 ### Do you want a new feature (that is not a model)?

 A world-class feature request addresses the following points:
@@ -81,6 +79,8 @@ A world-class feature request addresses the following points:
 If your issue is well written we're already 80% of the way there by the time you
 post it.

+We have added **templates** to guide you in the process of adding a new example script for training or testing the models in the library. You can find them in the [`templates`](./templates) folder.
+
 ## Start contributing! (Pull Requests)

 Before writing code, we strongly advise you to search through the exising PRs or
@@ -96,13 +96,14 @@ Follow these steps to start contributing:

 1. Fork the [repository](https://github.com/huggingface/transformers) by
   clicking on the 'Fork' button on the repository's page. This creates a copy of the code
-   under your github user account.
+   under your GitHub user account.
+
 2. Clone your fork to your local disk, and add the base repository as a remote:
-   
+
   ```bash
   $ git clone git@github.com:<your Github handle>/transformers.git
   $ cd transformers
-   $ git remote add upstream git@github.com:huggingface/transformers.git
+   $ git remote add upstream https://github.com/huggingface/transformers.git
   ```

 3. Create a new branch to hold your development changes:
@@ -110,43 +111,78 @@ Follow these steps to start contributing:
   ```bash
   $ git checkout -b a-descriptive-name-for-my-changes
   ```
-   
+
   **do not** work on the `master` branch.
-   
+
 4. Set up a development environment by running the following command in a virtual environment:

   ```bash
-   $ pip install -r requirements-dev.txt
+   $ pip install -e ".[dev]"
   ```

-5. Develop the features on your branch. Add changed files using `git add` and
-   then `git commit` to record your changes locally:
-   
+   (If transformers was already installed in the virtual environment, remove
+   it with `pip uninstall transformers` before reinstalling it in editable
+   mode with the `-e` flag.)
+
+   Right now, we need an unreleased version of `isort` to avoid a
+   [bug](https://github.com/timothycrosley/isort/pull/1000):
+
+   ```bash
+   $ pip install -U git+git://github.com/timothycrosley/isort.git@e63ae06ec7d70b06df9e528357650281a3d3ec22#egg=isort
+   ```
+
+5. Develop the features on your branch.
+
+   As you work on the features, you should make sure that the test suite
+   passes:
+
+   ```bash
+   $ make test
+   ```
+
+   `transformers` relies on `black` and `isort` to format its source code
+   consistently. After you make changes, format them with:
+
+   ```bash
+   $ make style
+   ```
+
+   `transformers` also uses `flake8` to check for coding mistakes. Quality
+   control runs in CI, however you can also run the same checks with:
+
+   ```bash
+   $ make quality
+   ```
+
+   Once you're happy with your changes, add changed files using `git add` and
+   make a commit with `git commit` to record your changes locally:
+
   ```bash
   $ git add modified_file.py
   $ git commit
   ```
-   
+
   Please write [good commit
-   messages](https://chris.beams.io/posts/git-commit/). It
-   is a good idea to sync your copy of the code with the original repository
-   regularly. This way you can quickly account for changes:
-   
+   messages](https://chris.beams.io/posts/git-commit/).
+
+   It is a good idea to sync your copy of the code with the original
+   repository regularly. This way you can quickly account for changes:
+
   ```bash
   $ git fetch upstream
   $ git rebase upstream/master
   ```
-   
+
   Push the changes to your account using:
-   
+
   ```bash
   $ git push -u origin a-descriptive-name-for-my-changes
   ```
-   
+
 6. Once you are satisfied (**and the checklist below is happy too**), go to the
-   webpage of your fork on Github. Click on 'Pull request' to send your changes
+   webpage of your fork on GitHub. Click on 'Pull request' to send your changes
   to the project maintainers for review.
-   
+
 7. It's ok if maintainers ask you for changes. It happens to core contributors
   too! So everyone can see the changes in the Pull request, work in your local
   branch and push the changes to your fork. They will automatically appear in
@@ -162,9 +198,58 @@ Follow these steps to start contributing:
 3. To indicate a work in progress please prefix the title with `[WIP]`. These
   are useful to avoid duplicated work, and to differentiate it from PRs ready
   to be merged;
-4. Make sure pre-existing tests still pass;
-5. Add high-coverage tests. No quality test, no merge;
-6. All public methods must have informative doctrings;
+4. Make sure existing tests pass;
+5. Add high-coverage tests. No quality test, no merge. 
+ - If you are adding a new model, make sure that you use `ModelTester.all_model_classes = (MyModel, MyModelWithLMHead,...)`, which triggers the common tests.
+ - If you are adding new `@slow` tests, make sure they pass using `RUN_SLOW=1 python -m pytest tests/test_my_new_model.py`. 
+CircleCI does not run them. 
+6. All public methods must have informative docstrings;
+
+### Tests
+
+You can run 🤗 Transformers tests with `unittest` or `pytest`.
+
+We like `pytest` and `pytest-xdist` because it's faster. From the root of the
+repository, here's how to run tests with `pytest` for the library:
+
+```bash
+$ python -m pytest -n auto --dist=loadfile -s -v ./tests/
+```
+
+and for the examples:
+
+```bash
+$ pip install -r examples/requirements.txt  # only needed the first time
+$ python -m pytest -n auto --dist=loadfile -s -v ./examples/
+```
+
+In fact, that's how `make test` and `make test-examples` are implemented!
+
+You can specify a smaller set of tests in order to test only the feature
+you're working on.
+
+By default, slow tests are skipped. Set the `RUN_SLOW` environment variable to
+`yes` to run them. This will download many gigabytes of models — make sure you
+have enough disk space and a good Internet connection, or a lot of patience!
+
+```bash
+$ RUN_SLOW=yes python -m pytest -n auto --dist=loadfile -s -v ./tests/
+$ RUN_SLOW=yes python -m pytest -n auto --dist=loadfile -s -v ./examples/
+```
+
+Likewise, set the `RUN_CUSTOM_TOKENIZERS` environment variable to `yes` to run
+tests for custom tokenizers, which don't run by default either.
+
+🤗 Transformers uses `pytest` as a test runner only. It doesn't use any
+`pytest`-specific features in the test suite itself.
+
+This means `unittest` is fully supported. Here's how to run tests with
+`unittest`:
+
+```bash
+$ python -m unittest discover -s tests -t . -v
+$ python -m unittest discover -s examples -t examples -v
+```


 ### Style guide
--- a/24
+++ b/24
@@ -0,0 +1,24 @@
+.PHONY: quality style test test-examples
+
+# Check that source code meets quality standards
+
+quality:
+	black --check --line-length 119 --target-version py35 examples templates tests src utils
+	isort --check-only --recursive examples templates tests src utils
+	flake8 examples templates tests src utils
+
+# Format source code automatically
+
+style:
+	black --line-length 119 --target-version py35 examples templates tests src utils
+	isort --recursive examples templates tests src utils
+
+# Run tests for the library
+
+test:
+	python -m pytest -n auto --dist=loadfile -s -v ./tests/
+
+# Run tests for examples
+
+test-examples:
+	python -m pytest -n auto --dist=loadfile -s -v ./examples/
--- a/README.md
+++ b/README.md
@@ -24,6 +24,8 @@

 🤗 Transformers (formerly known as `pytorch-transformers` and `pytorch-pretrained-bert`) provides state-of-the-art general-purpose architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet, CTRL...) for Natural Language Understanding (NLU) and Natural Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between TensorFlow 2.0 and PyTorch.

+[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/0)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/0)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/1)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/1)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/2)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/2)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/3)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/3)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/4)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/4)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/5)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/5)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/6)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/6)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/7)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/7)
+
 ### Features

 - As easy to use as pytorch-transformers
@@ -39,7 +41,7 @@ State-of-the-art NLP for everyone
 Lower compute costs, smaller carbon footprint
 - Researchers can share trained models instead of always retraining
 - Practitioners can reduce compute time and production costs
- 8 architectures with over 30 pretrained models, some in more than 100 languages
+- 10 architectures with over 30 pretrained models, some in more than 100 languages

 Choose the right framework for every part of a model's lifetime
 - Train state-of-the-art models in 3 lines of code
@@ -55,14 +57,22 @@ Choose the right framework for every part of a model's lifetime
 | [Online demo](#online-demo) | Experimenting with this repo’s text generation capabilities |
 | [Quick tour: Usage](#quick-tour) | Tokenizers & models usage: Bert and GPT-2 |
 | [Quick tour: TF 2.0 and PyTorch ](#Quick-tour-TF-20-training-and-PyTorch-interoperability) | Train a TF 2.0 model in 10 lines of code, load it in PyTorch |
+| [Quick tour: pipelines](#quick-tour-of-pipelines) | Using Pipelines: Wrapper around tokenizer and models to use finetuned models |
 | [Quick tour: Fine-tuning/usage scripts](#quick-tour-of-the-fine-tuningusage-scripts) | Using provided scripts: GLUE, SQuAD and Text generation |
+| [Quick tour: Share your models ](#Quick-tour-of-model-sharing) | Upload and share your fine-tuned models with the community |
 | [Migrating from pytorch-transformers to transformers](#Migrating-from-pytorch-transformers-to-transformers) | Migrating your code from pytorch-transformers to transformers |
 | [Migrating from pytorch-pretrained-bert to pytorch-transformers](#Migrating-from-pytorch-pretrained-bert-to-transformers) | Migrating your code from pytorch-pretrained-bert to transformers |
-| [Documentation](https://huggingface.co/transformers/) | Full API documentation and more |
+| [Documentation][(v2.4.0)](https://huggingface.co/transformers/v2.4.0)[(v2.3.0)](https://huggingface.co/transformers/v2.3.0)[(v2.2.0/v2.2.1/v2.2.2)](https://huggingface.co/transformers/v2.2.0) [(v2.1.1)](https://huggingface.co/transformers/v2.1.1) [(v2.0.0)](https://huggingface.co/transformers/v2.0.0) [(v1.2.0)](https://huggingface.co/transformers/v1.2.0) [(v1.1.0)](https://huggingface.co/transformers/v1.1.0) [(v1.0.0)](https://huggingface.co/transformers/v1.0.0) [(master)](https://huggingface.co/transformers) | Full API documentation and more |

 ## Installation

-This repo is tested on Python 2.7 and 3.5+ (examples are tested only on python 3.5+), PyTorch 1.0.0+ and TensorFlow 2.0.0-rc1
+This repo is tested on Python 3.5+, PyTorch 1.0.0+ and TensorFlow 2.0.0-rc1
+
+You should install 🤗 Transformers in a [virtual environment](https://docs.python.org/3/library/venv.html). If you're unfamiliar with Python virtual environments, check out the [user guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/).
+
+Create a virtual environment with the version of Python you're going to use and activate it.
+
+Now, if you want to use 🤗 Transformers, you can install it with pip. If you'd like to play with the examples, you must install it from source.

 ### With pip

@@ -83,24 +93,49 @@ Please refer to [TensorFlow installation page](https://www.tensorflow.org/instal
 When TensorFlow 2.0 and/or PyTorch has been installed, you can install from source by cloning the repository and running:

 ```bash
-pip install [--editable] .
+git clone https://github.com/huggingface/transformers
+cd transformers
+pip install .
 ```

+When you update the repository, you should upgrade the transformers installation and its dependencies as follows:
+
+```bash
+git pull
+pip install --upgrade .
+```
+
+### Run the examples
+
+Examples are included in the repository but are not shipped with the library.
+
+Therefore, in order to run the latest versions of the examples, you need to install from source, as described above.
+
+Look at the [README](https://github.com/huggingface/transformers/blob/master/examples/README.md) for how to run examples.
+
 ### Tests

-A series of tests are included for the library and the example scripts. Library tests can be found in the [tests folder](https://github.com/huggingface/transformers/tree/master/transformers/tests) and examples tests in the [examples folder](https://github.com/huggingface/transformers/tree/master/examples).
-
-These tests can be run using `pytest` (install pytest if needed with `pip install pytest`).
+A series of tests are included for the library and for some example scripts. Library tests can be found in the [tests folder](https://github.com/huggingface/transformers/tree/master/tests) and examples tests in the [examples folder](https://github.com/huggingface/transformers/tree/master/examples).

 Depending on which framework is installed (TensorFlow 2.0 and/or PyTorch), the irrelevant tests will be skipped. Ensure that both frameworks are installed if you want to execute all tests.

-You can run the tests from the root of the cloned repository with the commands:
+Here's the easiest way to run tests for the library:

 ```bash
-python -m pytest -sv ./transformers/tests/
-python -m pytest -sv ./examples/
+pip install -e ".[testing]"
+make test
 ```

+and for the examples:
+
+```bash
+pip install -e ".[testing]"
+pip install -r examples/requirements.txt
+make test-examples
+```
+
+For details, refer to the [contributing guide](https://github.com/huggingface/transformers/blob/master/CONTRIBUTING.md#tests).
+
 ### Do you want to run a Transformer model on a mobile device?

 You should check out our [`swift-coreml-transformers`](https://github.com/huggingface/swift-coreml-transformers) repo.
@@ -111,7 +146,7 @@ At some point in the future, you'll be able to seamlessly move from pre-training

 ## Model architectures

-🤗 Transformers currently provides 8 NLU/NLG architectures:
+🤗 Transformers currently provides the following NLU/NLG architectures:

 1. **[BERT](https://github.com/google-research/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
 2. **[GPT](https://github.com/openai/finetune-transformer-lm)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
@@ -120,8 +155,16 @@ At some point in the future, you'll be able to seamlessly move from pre-training
 5. **[XLNet](https://github.com/zihangdai/xlnet/)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
 6. **[XLM](https://github.com/facebookresearch/XLM/)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
 7. **[RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/roberta)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
-8. **[DistilBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/distillation).
+8. **[DistilBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/master/examples/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation) and a German version of DistilBERT.
 9. **[CTRL](https://github.com/salesforce/ctrl/)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
+10. **[CamemBERT](https://camembert-model.fr)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
+11. **[ALBERT](https://github.com/google-research/ALBERT)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
+12. **[T5](https://github.com/google-research/text-to-text-transfer-transformer)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
+13. **[XLM-RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/xlmr)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
+14. **[MMBT](https://github.com/facebookresearch/mmbt/)** (from Facebook), released together with the paper a [Supervised Multimodal Bitransformers for Classifying Images and Text](https://arxiv.org/pdf/1909.02950.pdf) by Douwe Kiela, Suvrat Bhooshan, Hamed Firooz, Davide Testuggine.
+15. **[FlauBERT](https://github.com/getalp/Flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
+16. **[Other community models](https://huggingface.co/models)**, contributed by the [community](https://huggingface.co/users).
+17. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedbacks before starting your PR.

 These implementations have been tested on several datasets (see the example scripts) and should match the performances of the original implementations (e.g. ~93 F1 on SQuAD for BERT Whole-Word-Masking, ~88 F1 on RocStories for OpenAI GPT, ~18.3 perplexity on WikiText 103 for Transformer-XL, ~0.916 Peason R coefficient on STS-B for XLNet). You can find more details on the performances in the Examples section of the [documentation](https://huggingface.co/transformers/examples.html).

@@ -143,7 +186,7 @@ import torch
 from transformers import *

 # Transformers has a unified API
-# for 8 transformer architectures and 30 pretrained weights.
+# for 10 transformer architectures and 30 pretrained weights.
 #          Model          | Tokenizer          | Pretrained weights shortcut
 MODELS = [(BertModel,       BertTokenizer,       'bert-base-uncased'),
          (OpenAIGPTModel,  OpenAIGPTTokenizer,  'openai-gpt'),
@@ -152,8 +195,10 @@ MODELS = [(BertModel,       BertTokenizer,       'bert-base-uncased'),
          (TransfoXLModel,  TransfoXLTokenizer,  'transfo-xl-wt103'),
          (XLNetModel,      XLNetTokenizer,      'xlnet-base-cased'),
          (XLMModel,        XLMTokenizer,        'xlm-mlm-enfr-1024'),
-          (DistilBertModel, DistilBertTokenizer, 'distilbert-base-uncased'),
-          (RobertaModel,    RobertaTokenizer,    'roberta-base')]
+          (DistilBertModel, DistilBertTokenizer, 'distilbert-base-cased'),
+          (RobertaModel,    RobertaTokenizer,    'roberta-base'),
+          (XLMRobertaModel, XLMRobertaTokenizer, 'xlm-roberta-base'),
+         ]

 # To use TensorFlow 2.0 versions of the models, simply prefix the class names with 'TF', e.g. `TFRobertaModel` is the TF 2.0 counterpart of the PyTorch model `RobertaModel`

@@ -170,16 +215,16 @@ for model_class, tokenizer_class, pretrained_weights in MODELS:

 # Each architecture is provided with several class for fine-tuning on down-stream tasks, e.g.
 BERT_MODEL_CLASSES = [BertModel, BertForPreTraining, BertForMaskedLM, BertForNextSentencePrediction,
-                      BertForSequenceClassification, BertForMultipleChoice, BertForTokenClassification,
-                      BertForQuestionAnswering]
+                      BertForSequenceClassification, BertForTokenClassification, BertForQuestionAnswering]

 # All the classes for an architecture can be initiated from pretrained weights for this architecture
 # Note that additional weights added for fine-tuning are only initialized
 # and need to be trained on the down-stream task
-tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+pretrained_weights = 'bert-base-uncased'
+tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
 for model_class in BERT_MODEL_CLASSES:
    # Load pretrained model/tokenizer
-    model = model_class.from_pretrained('bert-base-uncased')
+    model = model_class.from_pretrained(pretrained_weights)

    # Models can return full list of hidden-states & attentions weights at each layer
    model = model_class.from_pretrained(pretrained_weights,
@@ -221,7 +266,7 @@ valid_dataset = glue_convert_examples_to_features(data['validation'], tokenizer,
 train_dataset = train_dataset.shuffle(100).batch(32).repeat(2)
 valid_dataset = valid_dataset.batch(64)

-# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule 
+# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule
 optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
 loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
 metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
@@ -242,14 +287,20 @@ sentence_2 = "His findings were not compatible with this research."
 inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors='pt')
 inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors='pt')

-pred_1 = pytorch_model(**inputs_1)[0].argmax().item()
-pred_2 = pytorch_model(**inputs_2)[0].argmax().item()
+pred_1 = pytorch_model(inputs_1['input_ids'], token_type_ids=inputs_1['token_type_ids'])[0].argmax().item()
+pred_2 = pytorch_model(inputs_2['input_ids'], token_type_ids=inputs_2['token_type_ids'])[0].argmax().item()
+
 print("sentence_1 is", "a paraphrase" if pred_1 else "not a paraphrase", "of sentence_0")
 print("sentence_2 is", "a paraphrase" if pred_2 else "not a paraphrase", "of sentence_0")
 ```

 ## Quick tour of the fine-tuning/usage scripts

+**Important**
+Before running the fine-tuning scripts, please read the
+[instructions](#run-the-examples) on how to
+setup your environment to run the examples.
+
 The library comprises several example scripts with SOTA performances for NLU and NLG tasks:

 - `run_glue.py`: an example fine-tuning Bert, XLNet and XLM on nine different GLUE tasks (*sequence-level classification*)
@@ -406,16 +457,96 @@ python ./examples/run_generation.py \
    --model_name_or_path=gpt2 \
 ```

-and from the Salesforce CTRL model: 
+and from the Salesforce CTRL model:
 ```shell
 python ./examples/run_generation.py \
    --model_type=ctrl \
    --length=20 \
-    --model_name_or_path=gpt2 \
+    --model_name_or_path=ctrl \
    --temperature=0 \
    --repetition_penalty=1.2 \
 ```

+## Quick tour of model sharing
+
+Starting with `v2.2.2`, you can now upload and share your fine-tuned models with the community, using the <abbr title="Command-line interface">CLI</abbr> that's built-in to the library.
+
+**First, create an account on [https://huggingface.co/join](https://huggingface.co/join)**. Then:
+
+```shell
+transformers-cli login
+# log in using the same credentials as on huggingface.co
+```
+Upload your model:
+```shell
+transformers-cli upload ./path/to/pretrained_model/
+
+# ^^ Upload folder containing weights/tokenizer/config
+# saved via `.save_pretrained()`
+
+transformers-cli upload ./config.json [--filename folder/foobar.json]
+
+# ^^ Upload a single file
+# (you can optionally override its filename, which can be nested inside a folder)
+```
+
+Your model will then be accessible through its identifier, a concatenation of your username and the folder name above:
+```python
+"username/pretrained_model"
+```
+
+**Please add a README.md model card** to the repo under `model_cards/` with: model description, training params (dataset, preprocessing, hyperparameters), evaluation results, intended uses & limitations, etc.
+
+Your model now has a page on huggingface.co/models 🔥
+
+Anyone can load it from code:
+```python
+tokenizer = AutoTokenizer.from_pretrained("username/pretrained_model")
+model = AutoModel.from_pretrained("username/pretrained_model")
+```
+
+List all your files on S3:
+```shell
+transformers-cli s3 ls
+```
+
+You can also delete unneeded files:
+
+```shell
+transformers-cli s3 rm …
+```
+
+## Quick tour of pipelines
+
+New in version `v2.3`: `Pipeline` are high-level objects which automatically handle tokenization, running your data through a transformers model
+and outputting the result in a structured object.
+
+You can create `Pipeline` objects for the following down-stream tasks:
+
+ - `feature-extraction`: Generates a tensor representation for the input sequence
+ - `ner`: Generates named entity mapping for each word in the input sequence.
+ - `sentiment-analysis`: Gives the polarity (positive / negative) of the whole input sequence.
+ - `text-classification`: Initialize a `TextClassificationPipeline` directly, or see `sentiment-analysis` for an example.
+ - `question-answering`: Provided some context and a question refering to the context, it will extract the answer to the question in the context.
+ - `fill-mask`: Takes an input sequence containing a masked token (e.g. `<mask>`) and return list of most probable filled sequences, with their probabilities.
+
+```python
+from transformers import pipeline
+
+# Allocate a pipeline for sentiment-analysis
+nlp = pipeline('sentiment-analysis')
+nlp('We are very happy to include pipeline into the transformers repository.')
+>>> {'label': 'POSITIVE', 'score': 0.99893874}
+
+# Allocate a pipeline for question-answering
+nlp = pipeline('question-answering')
+nlp({
+    'question': 'What is the name of the repository ?',
+    'context': 'Pipeline have been included in the huggingface/transformers repository'
+})
+>>> {'score': 0.28756016668193496, 'start': 35, 'end': 59, 'answer': 'huggingface/transformers'}
+```
+
 ## Migrating from pytorch-transformers to transformers

 Here is a quick summary of what you should take care of when migrating from `pytorch-transformers` to `transformers`.
@@ -518,12 +649,12 @@ Here is a conversion examples from `BertAdam` with a linear warmup and decay sch
 # Parameters:
 lr = 1e-3
 max_grad_norm = 1.0
-num_total_steps = 1000
+num_training_steps = 1000
 num_warmup_steps = 100
-warmup_proportion = float(num_warmup_steps) / float(num_total_steps)  # 0.1
+warmup_proportion = float(num_warmup_steps) / float(num_training_steps)  # 0.1

 ### Previously BertAdam optimizer was instantiated like this:
-optimizer = BertAdam(model.parameters(), lr=lr, schedule='warmup_linear', warmup=warmup_proportion, t_total=num_total_steps)
+optimizer = BertAdam(model.parameters(), lr=lr, schedule='warmup_linear', warmup=warmup_proportion, t_total=num_training_steps)
 ### and used like this:
 for batch in train_data:
    loss = model(batch)
@@ -532,9 +663,10 @@ for batch in train_data:

 ### In Transformers, optimizer and schedules are splitted and instantiated like this:
 optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False)  # To reproduce BertAdam specific behavior set correct_bias=False
-scheduler = WarmupLinearSchedule(optimizer, warmup_steps=num_warmup_steps, t_total=num_total_steps)  # PyTorch scheduler
+scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)  # PyTorch scheduler
 ### and used like this:
 for batch in train_data:
+    model.train()
    loss = model(batch)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)  # Gradient clipping is not in AdamW anymore (so you can use amp without issue)
@@ -545,4 +677,13 @@ for batch in train_data:

 ## Citation

-At the moment, there is no paper associated with Transformers but we are working on preparing one. In the meantime, please include a mention of the library and a link to the present repository if you use this work in a published or open-source project.
+We now have a paper you can cite for the 🤗 Transformers library:
+```
+@article{Wolf2019HuggingFacesTS,
+  title={HuggingFace's Transformers: State-of-the-art Natural Language Processing},
+  author={Thomas Wolf and Lysandre Debut and Victor Sanh and Julien Chaumond and Clement Delangue and Anthony Moi and Pierric Cistac and Tim Rault and R'emi Louf and Morgan Funtowicz and Jamie Brew},
+  journal={ArXiv},
+  year={2019},
+  volume={abs/1910.03771}
+}
+```
--- a/deploy_multi_version_doc.sh
+++ b/deploy_multi_version_doc.sh
@@ -0,0 +1,23 @@
+cd docs
+
+function deploy_doc(){
+	echo "Creating doc at commit $1 and pushing to folder $2"
+	git checkout $1
+	if [ ! -z "$2" ] 
+	then
+		echo "Pushing version" $2
+		make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html $doc:$dir/$2
+	else
+		echo "Pushing master"
+		make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html/* $doc:$dir
+	fi
+}
+
+deploy_doc "master" 
+deploy_doc "b33a385" v1.0.0
+deploy_doc "fe02e45" v1.1.0
+deploy_doc "89fd345" v1.2.0
+deploy_doc "fc9faa8" v2.0.0
+deploy_doc "3ddce1d" v2.1.1
+deploy_doc "f2f3294" v2.2.0
+deploy_doc "d0f8b9a" v2.3.0
--- a/docs/README.md
+++ b/docs/README.md
@@ -1,25 +1,25 @@
 # Generating the documentation

 To generate the documentation, you first have to build it. Several packages are necessary to build the doc,
-you can install them using:
+you can install them with the following command, at the root of the code repository:

 ```bash
-pip install -r requirements.txt
+pip install -e ".[docs]"
 ```
- 
+
 ## Packages installed

-Here's an overview of all the packages installed. If you ran the previous command installing all packages from 
+Here's an overview of all the packages installed. If you ran the previous command installing all packages from
 `requirements.txt`, you do not need to run the following commands.

-Building it requires the package `sphinx` that you can 
+Building it requires the package `sphinx` that you can
 install using:

 ```bash
 pip install -U sphinx
 ```

-You would also need the custom installed [theme](https://github.com/readthedocs/sphinx_rtd_theme) by 
+You would also need the custom installed [theme](https://github.com/readthedocs/sphinx_rtd_theme) by
 [Read The Docs](https://readthedocs.org/). You can install it using the following command:

 ```bash
@@ -34,7 +34,7 @@ pip install recommonmark

 ## Building the documentation

-Make sure that there is a symlink from the `example` file (in /examples) inside the source folder. Run the following 
+Make sure that there is a symlink from the `example` file (in /examples) inside the source folder. Run the following
 command to generate it:

 ```bash
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -1,32 +0,0 @@
-alabaster==0.7.12
-Babel==2.7.0
-certifi==2019.6.16
-chardet==3.0.4
-commonmark==0.9.0
-docutils==0.14
-future==0.17.1
-idna==2.8
-imagesize==1.1.0
-Jinja2==2.10.1
-MarkupSafe==1.1.1
-packaging==19.0
-Pygments==2.4.2
-pyparsing==2.4.0
-pytz==2019.1
-recommonmark==0.5.0
-requests==2.22.0
-six==1.12.0
-snowballstemmer==1.9.0
-Sphinx==2.1.2
-sphinx-rtd-theme==0.4.3
-sphinxcontrib-applehelp==1.0.1
-sphinxcontrib-devhelp==1.0.1
-sphinxcontrib-htmlhelp==1.0.2
-sphinxcontrib-jsmath==1.0.1
-sphinxcontrib-qthelp==1.0.2
-sphinxcontrib-serializinghtml==1.1.3
-urllib3==1.25.3
-sphinx-markdown-tables==0.0.9
-numpy==1.17.2
-tensorflow==2.0.0rc2
-torch==1.2.0
--- a/docs/source/_static/css/huggingface.css
+++ b/docs/source/_static/css/huggingface.css
@@ -194,3 +194,41 @@ h2, .rst-content .toctree-wrapper p.caption, h3, h4, h5, h6, legend{
    src: url(./Calibre-Thin.otf);
    font-weight:400;
 }
+
+
+/**
+ * Nav Links to other parts of huggingface.co
+ */
+ div.menu {
+    position: absolute;
+    top: 0;
+    right: 0;
+    padding-top: 20px;
+    padding-right: 20px;
+    z-index: 1000;
+}
+div.menu a {
+    font-size: 14px;
+    letter-spacing: 0.3px;
+    text-transform: uppercase;
+    color: white;
+    -webkit-font-smoothing: antialiased;
+    background: linear-gradient(0deg, #6671ffb8, #9a66ffb8 50%);
+    padding: 10px 16px 6px 16px;
+    border-radius: 3px;
+    margin-left: 12px;
+    position: relative;
+}
+div.menu a:active {
+    top: 1px;
+}
+@media (min-width: 768px) and (max-width: 1750px) {
+    .wy-breadcrumbs {
+        margin-top: 32px;
+    }
+}
+@media (max-width: 768px) {
+    div.menu {
+        display: none;
+    }
+}
--- a/docs/source/_static/js/custom.js
+++ b/docs/source/_static/js/custom.js
@@ -1,5 +1,5 @@
 function addIcon() {
-    const huggingFaceLogo = "https://huggingface.co/assets/transformers-docs/huggingface_logo.svg";
+    const huggingFaceLogo = "https://huggingface.co/landing/assets/transformers-docs/huggingface_logo.svg";
    const image = document.createElement("img");
    image.setAttribute("src", huggingFaceLogo);

@@ -24,10 +24,10 @@ function addCustomFooter() {
    social.classList.add("footer__Social");

    const imageDetails = [
-        { link: "https://huggingface.co", imageLink: "https://huggingface.co/assets/transformers-docs/website.svg" },
-        { link: "https://twitter.com/huggingface", imageLink: "https://huggingface.co/assets/transformers-docs/twitter.svg" },
-        { link: "https://github.com/huggingface", imageLink: "https://huggingface.co/assets/transformers-docs/github.svg" },
-        { link: "https://www.linkedin.com/company/huggingface/", imageLink: "https://huggingface.co/assets/transformers-docs/linkedin.svg" }
+        { link: "https://huggingface.co", imageLink: "https://huggingface.co/landing/assets/transformers-docs/website.svg" },
+        { link: "https://twitter.com/huggingface", imageLink: "https://huggingface.co/landing/assets/transformers-docs/twitter.svg" },
+        { link: "https://github.com/huggingface", imageLink: "https://huggingface.co/landing/assets/transformers-docs/github.svg" },
+        { link: "https://www.linkedin.com/company/huggingface/", imageLink: "https://huggingface.co/landing/assets/transformers-docs/linkedin.svg" }
    ];

    imageDetails.forEach(imageLinks => {
@@ -58,6 +58,16 @@ function addGithubButton() {
    document.querySelector(".wy-side-nav-search .icon-home").insertAdjacentHTML('afterend', div);
 }

+function addHfMenu() {
+    const div = `
+    <div class="menu">
+        <a href="/welcome">🔥 Sign in</a>
+        <a href="/models">🚀 Models</a>
+    </div>
+    `;
+    document.body.insertAdjacentHTML('afterbegin', div);
+}
+
 /*!
 * github-buttons v2.2.10
 * (c) 2019 なつき
@@ -74,6 +84,7 @@ function onLoad() {
    addCustomFooter();
    addGithubButton();
    parseGithubButtons();
+    addHfMenu();
 }

 window.addEventListener("load", onLoad);
--- a/docs/source/benchmarks.md
+++ b/docs/source/benchmarks.md
@@ -0,0 +1,54 @@
+# Benchmarks
+
+This section is dedicated to the Benchmarks done by the library, both by maintainers, contributors and users. These 
+benchmark will help keep track of the preformance improvements that are brought to our models across versions.
+
+## Benchmarking all models for inference
+
+As of version 2.1 we have benchmarked all models for inference, across many different settings: using PyTorch, with
+and without TorchScript, using TensorFlow, with and without XLA. All of those tests were done across CPUs (except for
+TensorFlow XLA) and GPUs.
+
+The approach is detailed in the [following blogpost](https://medium.com/huggingface/benchmarking-transformers-pytorch-and-tensorflow-e2917fb891c2)
+
+The results are available [here](https://docs.google.com/spreadsheets/d/1sryqufw2D0XlUH4sq3e9Wnxu5EAQkaohzrJbd5HdQ_w/edit?usp=sharing).
+
+## TF2 with mixed precision, XLA, Distribution (@tlkh)
+
+This work was done by [Timothy Liu](https://github.com/tlkh).
+
+There are very positive results to be gained from the various TensorFlow 2.0 features:
+
+- Automatic Mixed Precision (AMP)
+- XLA compiler
+- Distribution strategies (multi-GPU)
+
+The benefits are listed here (tested on CoLA, MRPC, SST-2):
+
+- AMP: Between 1.4x to 1.6x decrease in overall time without change in batch size
+- AMP+XLA: Up to 2.5x decrease in overall time on SST-2 (larger dataset)
+- Distribution: Between 1.4x to 3.4x decrease in overall time on 4xV100
+- Combined: Up to 5.7x decrease in overall training time, or 9.1x training throughput
+
+The model quality (measured by the validation accuracy) fluctuates slightly. Taking an average of 4 training runs 
+on a single GPU gives the following results:
+
+- CoLA: AMP results in slighter lower acc (0.820 vs 0.824)
+- MRPC: AMP results in lower acc (0.823 vs 0.835)
+- SST-2: AMP results in slighter lower acc (0.918 vs 0.922)
+
+However, in a distributed setting with 4xV100 (4x batch size), AMP can yield in better results:
+
+CoLA: AMP results in higher acc (0.828 vs 0.812)
+MRPC: AMP results in lower acc (0.817 vs 0.827)
+SST-2: AMP results in slightly lower acc (0.926 vs 0.929)
+
+The benchmark script is available [here](https://github.com/NVAITC/benchmarking/blob/master/tf2/bert_dist.py).
+
+Note: on some tasks (e.g. MRPC), the dataset is too small. The overhead due to the model compilation with XLA as well
+as the distribution strategy setup does not speed things up. The XLA compile time is also the reason why although throughput 
+can increase a lot (e.g. 2.7x for single GPU), overall (end-to-end) training speed-up is not as fast (as low as 1.4x)
+
+The benefits as seen on SST-2 (larger dataset) is much clear.
+
+All results can be seen on this [Google Sheet](https://docs.google.com/spreadsheets/d/1538MN224EzjbRL239sqSiUy6YY-rAjHyXhTzz_Zptls/edit#gid=960868445).
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -14,7 +14,7 @@
 #
 import os
 import sys
-sys.path.insert(0, os.path.abspath('../..'))
+sys.path.insert(0, os.path.abspath('../../src'))


 # -- Project information -----------------------------------------------------
@@ -26,7 +26,7 @@ author = u'huggingface'
 # The short X.Y version
 version = u''
 # The full version, including alpha/beta/rc tags
-release = u'2.1.0'
+release = u'2.5.0'


 # -- General configuration ---------------------------------------------------
--- a/docs/source/converting_tensorflow_models.rst
+++ b/docs/source/converting_tensorflow_models.rst
@@ -3,6 +3,12 @@ Converting Tensorflow Checkpoints

 A command-line interface is provided to convert original Bert/GPT/GPT-2/Transformer-XL/XLNet/XLM checkpoints in models than be loaded using the ``from_pretrained`` methods of the library.

+.. note::
+    Since 2.3.0 the conversion script is now part of the transformers CLI (**transformers-cli**)
+    available in any transformers >= 2.3.0 installation.
+
+    The documentation below reflects the **transformers-cli convert** command format.
+
 BERT
 ^^^^

@@ -20,10 +26,10 @@ Here is an example of the conversion process for a pre-trained ``BERT-Base Uncas

   export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12

-   transformers bert \
-     $BERT_BASE_DIR/bert_model.ckpt \
-     $BERT_BASE_DIR/bert_config.json \
-     $BERT_BASE_DIR/pytorch_model.bin
+   transformers-cli convert --model_type bert \
+     --tf_checkpoint $BERT_BASE_DIR/bert_model.ckpt \
+     --config $BERT_BASE_DIR/bert_config.json \
+     --pytorch_dump_output $BERT_BASE_DIR/pytorch_model.bin

 You can download Google's pre-trained models for the conversion `here <https://github.com/google-research/bert#pre-trained-models>`__.

@@ -36,10 +42,12 @@ Here is an example of the conversion process for a pre-trained OpenAI GPT model,

   export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights

-   transformers gpt \
-     $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \
-     $PYTORCH_DUMP_OUTPUT \
-     [OPENAI_GPT_CONFIG]
+   transformers-cli convert --model_type gpt \
+     --tf_checkpoint $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \
+     --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
+     [--config OPENAI_GPT_CONFIG] \
+     [--finetuning_task_name OPENAI_GPT_FINETUNED_TASK] \
+

 OpenAI GPT-2
 ^^^^^^^^^^^^
@@ -50,10 +58,11 @@ Here is an example of the conversion process for a pre-trained OpenAI GPT-2 mode

   export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/gpt2/pretrained/weights

-   transformers gpt2 \
-     $OPENAI_GPT2_CHECKPOINT_PATH \
-     $PYTORCH_DUMP_OUTPUT \
-     [OPENAI_GPT2_CONFIG]
+   transformers-cli convert --model_type gpt2 \
+     --tf_checkpoint $OPENAI_GPT2_CHECKPOINT_PATH \
+     --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
+     [--config OPENAI_GPT2_CONFIG] \
+     [--finetuning_task_name OPENAI_GPT2_FINETUNED_TASK]

 Transformer-XL
 ^^^^^^^^^^^^^^
@@ -64,27 +73,28 @@ Here is an example of the conversion process for a pre-trained Transformer-XL mo

   export TRANSFO_XL_CHECKPOINT_FOLDER_PATH=/path/to/transfo/xl/checkpoint

-   transformers transfo_xl \
-     $TRANSFO_XL_CHECKPOINT_FOLDER_PATH \
-     $PYTORCH_DUMP_OUTPUT \
-     [TRANSFO_XL_CONFIG]
+   transformers-cli convert --model_type transfo_xl \
+     --tf_checkpoint $TRANSFO_XL_CHECKPOINT_FOLDER_PATH \
+     --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
+     [--config TRANSFO_XL_CONFIG] \
+     [--finetuning_task_name TRANSFO_XL_FINETUNED_TASK]


 XLNet
 ^^^^^

-Here is an example of the conversion process for a pre-trained XLNet model, fine-tuned on STS-B using the TensorFlow script:
+Here is an example of the conversion process for a pre-trained XLNet model:

 .. code-block:: shell

   export TRANSFO_XL_CHECKPOINT_PATH=/path/to/xlnet/checkpoint
   export TRANSFO_XL_CONFIG_PATH=/path/to/xlnet/config

-   transformers xlnet \
-     $TRANSFO_XL_CHECKPOINT_PATH \
-     $TRANSFO_XL_CONFIG_PATH \
-     $PYTORCH_DUMP_OUTPUT \
-     STS-B \
+   transformers-cli convert --model_type xlnet \
+     --tf_checkpoint $TRANSFO_XL_CHECKPOINT_PATH \
+     --config $TRANSFO_XL_CONFIG_PATH \
+     --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
+     [--finetuning_task_name XLNET_FINETUNED_TASK] \


 XLM
@@ -96,6 +106,8 @@ Here is an example of the conversion process for a pre-trained XLM model:

   export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint

-   transformers xlm \
-     $XLM_CHECKPOINT_PATH \
-     $PYTORCH_DUMP_OUTPUT \
+   transformers-cli convert --model_type xlm \
+     --tf_checkpoint $XLM_CHECKPOINT_PATH \
+     --pytorch_dump_output $PYTORCH_DUMP_OUTPUT
+    [--config XML_CONFIG] \
+    [--finetuning_task_name XML_FINETUNED_TASK]
--- a/docs/source/glossary.rst
+++ b/docs/source/glossary.rst
@@ -0,0 +1,145 @@
+Glossary
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Every model is different yet bears similarities with the others. Therefore most models use the same inputs, which are
+detailed here alongside usage examples.
+
+Input IDs
+--------------------------
+
+The input ids are often the only required parameters to be passed to the model as input. *They are token indices,
+numerical representations of tokens building the sequences that will be used as input by the model*.
+
+Each tokenizer works differently but the underlying mechanism remains the same. Here's an example using the BERT
+tokenizer, which is a `WordPiece <https://arxiv.org/pdf/1609.08144.pdf>`__ tokenizer:
+
+::
+
+    from transformers import BertTokenizer
+    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
+
+    sequence = "A Titan RTX has 24GB of VRAM"
+
+The tokenizer takes care of splitting the sequence into tokens available in the tokenizer vocabulary.
+
+::
+
+    # Continuation of the previous script
+    tokenized_sequence = tokenizer.tokenize(sequence)
+    assert tokenized_sequence == ['A', 'Titan', 'R', '##T', '##X', 'has', '24', '##GB', 'of', 'V', '##RA', '##M']
+
+These tokens can then be converted into IDs which are understandable by the model. Several methods are available for
+this, the recommended being `encode` or `encode_plus`, which leverage the Rust implementation of
+`huggingface/tokenizers <https://github.com/huggingface/tokenizers>`__ for peak performance.
+
+::
+
+    # Continuation of the previous script
+    encoded_sequence = tokenizer.encode(sequence)
+    assert encoded_sequence == [101, 138, 18696, 155, 1942, 3190, 1144, 1572, 13745, 1104, 159, 9664, 2107, 102]
+
+The `encode` and `encode_plus` methods automatically add "special tokens" which are special IDs the model uses.
+
+Attention mask
+--------------------------
+
+The attention mask is an optional argument used when batching sequences together. This argument indicates to the
+model which tokens should be attended to, and which should not.
+
+For example, consider these two sequences:
+
+::
+
+    from transformers import BertTokenizer
+    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
+
+    sequence_a = "This is a short sequence."
+    sequence_b = "This is a rather long sequence. It is at least longer than the sequence A."
+
+    encoded_sequence_a = tokenizer.encode(sequence_a)
+    assert len(encoded_sequence_a) == 8
+
+    encoded_sequence_b = tokenizer.encode(sequence_b)
+    assert len(encoded_sequence_b) == 19
+
+These two sequences have different lengths and therefore can't be put together in a same tensor as-is. The first
+sequence needs to be padded up to the length of the second one, or the second one needs to be truncated down to
+the length of the first one.
+
+In the first case, the list of IDs will be extended by the padding indices:
+
+::
+
+    # Continuation of the previous script
+    padded_sequence_a = tokenizer.encode(sequence_a, max_length=19, pad_to_max_length=True)
+
+    assert padded_sequence_a == [101, 1188, 1110, 170, 1603, 4954,  119, 102,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,   0]
+    assert encoded_sequence_b == [101, 1188, 1110, 170, 1897, 1263, 4954, 119, 1135, 1110, 1120, 1655, 2039, 1190, 1103, 4954, 138, 119, 102]
+
+These can then be converted into a tensor in PyTorch or TensorFlow. The attention mask is a binary tensor indicating
+the position of the padded indices so that the model does not attend to them. For the
+:class:`~transformers.BertTokenizer`, :obj:`1` indicate a value that should be attended to while :obj:`0` indicate
+a padded value.
+
+The method :func:`~transformers.PreTrainedTokenizer.encode_plus` may be used to obtain the attention mask directly:
+
+::
+
+    # Continuation of the previous script
+    sequence_a_dict = tokenizer.encode_plus(sequence_a, max_length=19, pad_to_max_length=True)
+
+    assert sequence_a_dict['input_ids'] == [101, 1188, 1110, 170, 1603, 4954, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+    assert sequence_a_dict['attention_mask'] == [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+
+
+Token Type IDs
+--------------------------
+
+Some models' purpose is to do sequence classification or question answering. These require two different sequences to
+be encoded in the same input IDs. They are usually separated by special tokens, such as the classifier and separator
+tokens. For example, the BERT model builds its two sequence input as such:
+
+::
+
+    from transformers import BertTokenizer
+    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
+
+    # [CLS] SEQ_A [SEP] SEQ_B [SEP]
+
+    sequence_a = "HuggingFace is based in NYC"
+    sequence_b = "Where is HuggingFace based?"
+
+    encoded_sequence = tokenizer.encode(sequence_a, sequence_b)
+    assert tokenizer.decode(encoded_sequence) == "[CLS] HuggingFace is based in NYC [SEP] Where is HuggingFace based? [SEP]"
+
+This is enough for some models to understand where one sequence ends and where another begins. However, other models
+such as BERT have an additional mechanism, which are the segment IDs. The Token Type IDs are a binary mask identifying
+the different sequences in the model.
+
+We can leverage :func:`~transformers.PreTrainedTokenizer.encode_plus` to output the Token Type IDs for us:
+
+::
+
+    # Continuation of the previous script
+    encoded_dict = tokenizer.encode_plus(sequence_a, sequence_b)
+
+    assert encoded_dict['input_ids'] == [101, 20164, 10932, 2271, 7954, 1110, 1359, 1107, 17520, 102, 2777, 1110, 20164, 10932, 2271, 7954, 1359, 136, 102]
+    assert encoded_dict['token_type_ids'] == [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]
+
+The first sequence, the "context" used for the question, has all its tokens represented by :obj:`0`, whereas the
+question has all its tokens represented by :obj:`1`. Some models, like :class:`~transformers.XLNetModel` use an
+additional token represented by a :obj:`2`.
+
+
+Position IDs
+--------------------------
+
+The position IDs are used by the model to identify which token is at which position. Contrary to RNNs that have the
+position of each token embedded within them, transformers are unaware of the position of each token. The position
+IDs are created for this purpose.
+
+They are an optional parameter. If no position IDs are passed to the model, they are automatically created as absolute
+positional embeddings.
+
+Absolute positional embeddings are selected in the range ``[0, config.max_position_embeddings - 1]``. Some models
+use other types of positional embeddings, such as sinusoidal position embeddings or relative position embeddings.
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -47,6 +47,11 @@ The library currently contains PyTorch and Tensorflow implementations, pre-train
 6. `XLM <https://github.com/facebookresearch/XLM>`_ (from Facebook) released together with the paper `Cross-lingual Language Model Pretraining <https://arxiv.org/abs/1901.07291>`_ by Guillaume Lample and Alexis Conneau.
 7. `RoBERTa <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`_ (from Facebook), released together with the paper a `Robustly Optimized BERT Pretraining Approach <https://arxiv.org/abs/1907.11692>`_ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
 8. `DistilBERT <https://huggingface.co/transformers/model_doc/distilbert.html>`_ (from HuggingFace) released together with the paper `DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter <https://arxiv.org/abs/1910.01108>`_ by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into `DistilGPT2 <https://github.com/huggingface/transformers/tree/master/examples/distillation>`_.
+9. `CTRL <https://github.com/pytorch/fairseq/tree/master/examples/ctrl>`_ (from Salesforce), released together with the paper `CTRL: A Conditional Transformer Language Model for Controllable Generation <https://www.github.com/salesforce/ctrl>`_ by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
+10. `CamemBERT <https://huggingface.co/transformers/model_doc/camembert.html>`_ (from FAIR, Inria, Sorbonne Université) released together with the paper `CamemBERT: a Tasty French Language Model <https://arxiv.org/abs/1911.03894>`_ by Louis Martin, Benjamin Muller, Pedro Javier Ortiz Suarez, Yoann Dupont, Laurent Romary, Eric Villemonte de la Clergerie, Djame Seddah, and Benoît Sagot.
+11. `ALBERT <https://github.com/google-research/ALBERT>`_ (from Google Research), released together with the paper a `ALBERT: A Lite BERT for Self-supervised Learning of Language Representations <https://arxiv.org/abs/1909.11942>`_ by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
+12. `XLM-RoBERTa <https://github.com/pytorch/fairseq/tree/master/examples/xlmr>`_ (from Facebook AI), released together with the paper `Unsupervised Cross-lingual Representation Learning at Scale <https://arxiv.org/abs/1911.02116>`_ by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
+13. `FlauBERT <https://github.com/getalp/Flaubert>`_ (from CNRS) released with the paper `FlauBERT: Unsupervised Language Model Pre-training for French <https://arxiv.org/abs/1912.05372>`_ by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.

 .. toctree::
    :maxdepth: 2
@@ -54,7 +59,9 @@ The library currently contains PyTorch and Tensorflow implementations, pre-train

    installation
    quickstart
+    glossary
    pretrained_models
+    model_sharing
    examples
    notebooks
    serialization
@@ -63,6 +70,7 @@ The library currently contains PyTorch and Tensorflow implementations, pre-train
    bertology
    torchscript
    multilingual
+    benchmarks

 .. toctree::
    :maxdepth: 2
@@ -88,3 +96,7 @@ The library currently contains PyTorch and Tensorflow implementations, pre-train
    model_doc/roberta
    model_doc/distilbert
    model_doc/ctrl
+    model_doc/camembert
+    model_doc/albert
+    model_doc/xlmroberta
+    model_doc/flaubert
--- a/docs/source/installation.md
+++ b/docs/source/installation.md
@@ -1,6 +1,6 @@
 # Installation

-Transformers is tested on Python 2.7 and 3.5+ (examples are tested only on python 3.5+) and PyTorch 1.1.0
+Transformers is tested on Python 3.5+ and PyTorch 1.1.0

 ## With pip

@@ -17,25 +17,18 @@ To install from source, clone the repository and install with:
 ``` bash
 git clone https://github.com/huggingface/transformers.git
 cd transformers
-pip install [--editable] .
+pip install .
 ```

 ## Tests

-An extensive test suite is included to test the library behavior and several examples. Library tests can be found in the [tests folder](https://github.com/huggingface/transformers/tree/master/transformers/tests) and examples tests in the [examples folder](https://github.com/huggingface/transformers/tree/master/examples).
+An extensive test suite is included to test the library behavior and several examples. Library tests can be found in the [tests folder](https://github.com/huggingface/transformers/tree/master/tests) and examples tests in the [examples folder](https://github.com/huggingface/transformers/tree/master/examples).

-Tests can be run using `pytest` (install pytest if needed with `pip install pytest`).
-
-Run all the tests from the root of the cloned repository with the commands:
-
-``` bash
-python -m pytest -sv ./transformers/tests/
-python -m pytest -sv ./examples/
-```
+Refer to the [contributing guide](https://github.com/huggingface/transformers/blob/master/CONTRIBUTING.md#tests) for details about running tests.

 ## OpenAI GPT original tokenization workflow

-If you want to reproduce the original tokenization process of the `OpenAI GPT` paper, you will need to install `ftfy` (use version 4.4.3 if you are using Python 2) and `SpaCy`:
+If you want to reproduce the original tokenization process of the `OpenAI GPT` paper, you will need to install `ftfy` and `SpaCy`:

 ``` bash
 pip install spacy ftfy==4.4.3
--- a/docs/source/main_classes/optimizer_schedules.rst
+++ b/docs/source/main_classes/optimizer_schedules.rst
@@ -5,6 +5,7 @@ The ``.optimization`` module provides:

 - an optimizer with weight decay fixed that can be used to fine-tuned models, and
 - several schedules in the form of schedule objects that inherit from ``_LRSchedule``:
+- a gradient accumulation class to accumulate the gradients of multiple batches

 ``AdamW``
 ~~~~~~~~~~~~~~~~
@@ -12,34 +13,37 @@ The ``.optimization`` module provides:
 .. autoclass:: transformers.AdamW
    :members:

+``AdamWeightDecay``
+~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.AdamWeightDecay
+    :members:
+
+.. autofunction:: transformers.create_optimizer
+
 Schedules
 ----------------------------------------------------

 Learning Rate Schedules
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-.. autoclass:: transformers.ConstantLRSchedule
-    :members:
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+.. autofunction:: transformers.get_constant_schedule


-.. autoclass:: transformers.WarmupConstantSchedule
-    :members:
+.. autofunction:: transformers.get_constant_schedule_with_warmup

 .. image:: /imgs/warmup_constant_schedule.png
    :target: /imgs/warmup_constant_schedule.png
    :alt:


-.. autoclass:: transformers.WarmupCosineSchedule
-    :members:
+.. autofunction:: transformers.get_cosine_schedule_with_warmup

 .. image:: /imgs/warmup_cosine_schedule.png
    :target: /imgs/warmup_cosine_schedule.png
    :alt:


-.. autoclass:: transformers.WarmupCosineWithHardRestartsSchedule
-    :members:
+.. autofunction:: transformers.get_cosine_with_hard_restarts_schedule_with_warmup

 .. image:: /imgs/warmup_cosine_hard_restarts_schedule.png
    :target: /imgs/warmup_cosine_hard_restarts_schedule.png
@@ -47,9 +51,22 @@ Learning Rate Schedules



-.. autoclass:: transformers.WarmupLinearSchedule
-    :members:
+.. autofunction:: transformers.get_linear_schedule_with_warmup

 .. image:: /imgs/warmup_linear_schedule.png
    :target: /imgs/warmup_linear_schedule.png
    :alt:
+
+``Warmup``
+~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.WarmUp
+    :members:
+
+Gradient Strategies
+----------------------------------------------------
+
+``GradientAccumulator``
+~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.GradientAccumulator
--- a/docs/source/main_classes/processors.rst
+++ b/docs/source/main_classes/processors.rst
@@ -54,5 +54,100 @@ Additionally, the following method  can be used to load values from a data file
 Example usage
 ^^^^^^^^^^^^^^^^^^^^^^^^^

+An example using these processors is given in the `run_glue.py <https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_glue.py>`__ script.
+
+
+XNLI
+~~~~~~~~~~~~~~~~~~~~~
+
+`The Cross-Lingual NLI Corpus (XNLI) <https://www.nyu.edu/projects/bowman/xnli/>`__ is a benchmark that evaluates
+the quality of cross-lingual text representations. 
+XNLI is crowd-sourced dataset based on `MultiNLI <http://www.nyu.edu/projects/bowman/multinli/>`: pairs of text are labeled with textual entailment 
+annotations for 15 different languages (including both high-resource language such as English and low-resource languages such as Swahili).
+
+It was released together with the paper
+`XNLI: Evaluating Cross-lingual Sentence Representations <https://arxiv.org/abs/1809.05053>`__
+
+This library hosts the processor to load the XNLI data:
+    - :class:`~transformers.data.processors.utils.XnliProcessor`
+
+Please note that since the gold labels are available on the test set, evaluation is performed on the test set.
+
 An example using these processors is given in the
-`run_glue.py <https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_glue.py>`__ script.
+`run_xnli.py <https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_xnli.py>`__ script.
+
+
+SQuAD
+~~~~~~~~~~~~~~~~~~~~~
+
+`The Stanford Question Answering Dataset (SQuAD) <https://rajpurkar.github.io/SQuAD-explorer//>`__ is a benchmark that evaluates
+the performance of models on question answering. Two versions are available, v1.1 and v2.0. The first version (v1.1) was released together with the paper
+`SQuAD: 100,000+ Questions for Machine Comprehension of Text <https://arxiv.org/abs/1606.05250>`__. The second version (v2.0) was released alongside 
+the paper `Know What You Don't Know: Unanswerable Questions for SQuAD <https://arxiv.org/abs/1806.03822>`__.
+
+This library hosts a processor for each of the two versions:
+
+Processors
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Those processors are:
+    - :class:`~transformers.data.processors.utils.SquadV1Processor`
+    - :class:`~transformers.data.processors.utils.SquadV2Processor`
+
+They both inherit from the abstract class :class:`~transformers.data.processors.utils.SquadProcessor`
+
+.. autoclass:: transformers.data.processors.squad.SquadProcessor
+    :members:
+
+Additionally, the following method can be used to convert SQuAD examples into :class:`~transformers.data.processors.utils.SquadFeatures`
+that can be used as model inputs.
+
+.. automethod:: transformers.data.processors.squad.squad_convert_examples_to_features
+
+These processors as well as the aforementionned method can be used with files containing the data as well as with the `tensorflow_datasets` package.
+Examples are given below.
+
+
+Example usage
+^^^^^^^^^^^^^^^^^^^^^^^^^
+Here is an example using the processors as well as the conversion method using data files:
+
+Example::
+
+    # Loading a V2 processor
+    processor = SquadV2Processor()
+    examples = processor.get_dev_examples(squad_v2_data_dir)
+
+    # Loading a V1 processor
+    processor = SquadV1Processor()
+    examples = processor.get_dev_examples(squad_v1_data_dir)
+
+    features = squad_convert_examples_to_features( 
+        examples=examples,
+        tokenizer=tokenizer,
+        max_seq_length=max_seq_length,
+        doc_stride=args.doc_stride,
+        max_query_length=max_query_length,
+        is_training=not evaluate,
+    )
+
+Using `tensorflow_datasets` is as easy as using a data file:
+
+Example::
+
+    # tensorflow_datasets only handle Squad V1.
+    tfds_examples = tfds.load("squad")
+    examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate)
+
+    features = squad_convert_examples_to_features( 
+        examples=examples,
+        tokenizer=tokenizer,
+        max_seq_length=max_seq_length,
+        doc_stride=args.doc_stride,
+        max_query_length=max_query_length,
+        is_training=not evaluate,
+    )
+
+
+Another example using these processors is given in the
+`run_squad.py <https://github.com/huggingface/transformers/blob/master/examples/run_squad.py>`__ script.
--- a/docs/source/migration.md
+++ b/docs/source/migration.md
@@ -84,12 +84,12 @@ Here is a conversion examples from `BertAdam` with a linear warmup and decay sch
 # Parameters:
 lr = 1e-3
 max_grad_norm = 1.0
-num_total_steps = 1000
+num_training_steps = 1000
 num_warmup_steps = 100
-warmup_proportion = float(num_warmup_steps) / float(num_total_steps)  # 0.1
+warmup_proportion = float(num_warmup_steps) / float(num_training_steps)  # 0.1

 ### Previously BertAdam optimizer was instantiated like this:
-optimizer = BertAdam(model.parameters(), lr=lr, schedule='warmup_linear', warmup=warmup_proportion, t_total=num_total_steps)
+optimizer = BertAdam(model.parameters(), lr=lr, schedule='warmup_linear', warmup=warmup_proportion, num_training_steps=num_training_steps)
 ### and used like this:
 for batch in train_data:
    loss = model(batch)
@@ -98,12 +98,12 @@ for batch in train_data:

 ### In Transformers, optimizer and schedules are splitted and instantiated like this:
 optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False)  # To reproduce BertAdam specific behavior set correct_bias=False
-scheduler = WarmupLinearSchedule(optimizer, warmup_steps=num_warmup_steps, t_total=num_total_steps)  # PyTorch scheduler
+scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)  # PyTorch scheduler
 ### and used like this:
 for batch in train_data:
    loss = model(batch)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)  # Gradient clipping is not in AdamW anymore (so you can use amp without issue)
-    scheduler.step()
    optimizer.step()
+    scheduler.step()
 ```
--- a/docs/source/model_doc/albert.rst
+++ b/docs/source/model_doc/albert.rst
@@ -0,0 +1,93 @@
+ALBERT
+----------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~
+
+The ALBERT model was proposed in `ALBERT: A Lite BERT for Self-supervised Learning of Language Representations <https://arxiv.org/abs/1909.11942>`_
+by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. It presents
+two parameter-reduction techniques to lower memory consumption and increase the trainig speed of BERT:
+
+- Splitting the embedding matrix into two smaller matrices
+- Using repeating layers split among groups
+
+The abstract from the paper is the following:
+
+*Increasing model size when pretraining natural language representations often results in improved performance on
+downstream tasks. However, at some point further model increases become harder due to GPU/TPU memory limitations,
+longer training times, and unexpected model degradation. To address these problems, we present two parameter-reduction
+techniques to lower memory consumption and increase the training speed of BERT. Comprehensive empirical evidence shows
+that our proposed methods lead to models that scale much better compared to the original BERT. We also use a
+self-supervised loss that focuses on modeling inter-sentence coherence, and show it consistently helps downstream
+tasks with multi-sentence inputs. As a result, our best model establishes new state-of-the-art results on the GLUE,
+RACE, and SQuAD benchmarks while having fewer parameters compared to BERT-large.*
+
+Tips:
+
+- ALBERT is a model with absolute position embeddings so it's usually advised to pad the inputs on
+  the right rather than the left.
+- ALBERT uses repeating layers which results in a small memory footprint, however the computational cost remains
+  similar to a BERT-like architecture with the same number of hidden layers as it has to iterate through the same
+  number of (repeating) layers.
+
+AlbertConfig
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.AlbertConfig
+    :members:
+
+
+AlbertTokenizer
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.AlbertTokenizer
+    :members:
+
+
+AlbertModel
+~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.AlbertModel
+    :members:
+
+
+AlbertForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.AlbertForMaskedLM
+    :members:
+
+
+AlbertForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.AlbertForSequenceClassification
+    :members:
+
+
+AlbertForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.AlbertForQuestionAnswering
+    :members:
+
+
+TFAlbertModel
+~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFAlbertModel
+    :members:
+
+
+TFAlbertForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFAlbertForMaskedLM
+    :members:
+
+
+TFAlbertForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFAlbertForSequenceClassification
+    :members:
--- a/docs/source/model_doc/auto.rst
+++ b/docs/source/model_doc/auto.rst
@@ -3,7 +3,7 @@ AutoModels

 In many cases, the architecture you want to use can be guessed from the name or the path of the pretrained model you are supplying to the ``from_pretrained`` method.

-AutoClasses are here to do this job for you so that you automatically retreive the relevant model given the name/path to the pretrained weights/config/vocabulary:
+AutoClasses are here to do this job for you so that you automatically retrieve the relevant model given the name/path to the pretrained weights/config/vocabulary:

 Instantiating one of ``AutoModel``, ``AutoConfig`` and ``AutoTokenizer`` will directly create a class of the relevant architecture (ex: ``model = AutoModel.from_pretrained('bert-base-cased')`` will create a instance of ``BertModel``).

@@ -15,6 +15,13 @@ Instantiating one of ``AutoModel``, ``AutoConfig`` and ``AutoTokenizer`` will di
    :members:


+``AutoTokenizer``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.AutoTokenizer
+    :members:
+
+
 ``AutoModel``
 ~~~~~~~~~~~~~~~~~~~~~

@@ -22,8 +29,37 @@ Instantiating one of ``AutoModel``, ``AutoConfig`` and ``AutoTokenizer`` will di
    :members:


-``AutoTokenizer``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+``AutoModelForPreTraining``
+~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: transformers.AutoTokenizer
+.. autoclass:: transformers.AutoModelForPreTraining
    :members:
+
+
+``AutoModelWithLMHead``
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.AutoModelWithLMHead
+    :members:
+
+
+``AutoModelForSequenceClassification``
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.AutoModelForSequenceClassification
+    :members:
+
+
+``AutoModelForQuestionAnswering``
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.AutoModelForQuestionAnswering
+    :members:
+
+
+``AutoModelForTokenClassification``
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.AutoModelForTokenClassification
+    :members:
+
--- a/docs/source/model_doc/bert.rst
+++ b/docs/source/model_doc/bert.rst
@@ -1,126 +1,160 @@
 BERT
 ----------------------------------------------------

-``BertConfig``
+Overview
+~~~~~~~~~~~~~~~~~~~~~
+
+The BERT model was proposed in `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding <https://arxiv.org/abs/1810.04805>`__
+by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova. It's a bidirectional transformer
+pre-trained using a combination of masked language modeling objective and next sentence prediction
+on a large corpus comprising the Toronto Book Corpus and Wikipedia.
+
+The abstract from the paper is the following:
+
+*We introduce a new language representation model called BERT, which stands for Bidirectional Encoder Representations
+from Transformers. Unlike recent language representation models, BERT is designed to pre-train deep bidirectional
+representations from unlabeled text by jointly conditioning on both left and right context in all layers. As a result,
+the pre-trained BERT model can be fine-tuned with just one additional output layer to create state-of-the-art models
+for a wide range of tasks, such as question answering and language inference, without substantial task-specific
+architecture modifications.*
+
+*BERT is conceptually simple and empirically powerful. It obtains new state-of-the-art results on eleven natural
+language processing tasks, including pushing the GLUE score to 80.5% (7.7% point absolute improvement), MultiNLI
+accuracy to 86.7% (4.6% absolute improvement), SQuAD v1.1 question answering Test F1 to 93.2 (1.5 point absolute
+improvement) and SQuAD v2.0 Test F1 to 83.1 (5.1 point absolute improvement).*
+
+Tips:
+
+- BERT is a model with absolute position embeddings so it's usually advised to pad the inputs on
+  the right rather than the left.
+- BERT was trained with a masked language modeling (MLM) objective. It is therefore efficient at predicting masked
+  tokens and at NLU in general, but is not optimal for text generation. Models trained with a causal language
+  modeling (CLM) objective are better in that regard.
+- Alongside MLM, BERT was trained using a next sentence prediction (NSP) objective using the [CLS] token as a sequence
+  approximate. The user may use this token (the first token in a sequence built with special tokens) to get a sequence
+  prediction rather than a token prediction. However, averaging over the sequence may yield better results than using
+  the [CLS] token.
+
+BertConfig
 ~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.BertConfig
    :members:


-``BertTokenizer``
+BertTokenizer
 ~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.BertTokenizer
    :members:


-``BertModel``
+BertModel
 ~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.BertModel
    :members:


-``BertForPreTraining``
+BertForPreTraining
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.BertForPreTraining
    :members:


-``BertForMaskedLM``
+BertForMaskedLM
 ~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.BertForMaskedLM
    :members:


-``BertForNextSentencePrediction``
+BertForNextSentencePrediction
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.BertForNextSentencePrediction
    :members:


-``BertForSequenceClassification``
+BertForSequenceClassification
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.BertForSequenceClassification
    :members:


-``BertForMultipleChoice``
+BertForMultipleChoice
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.BertForMultipleChoice
    :members:


-``BertForTokenClassification``
+BertForTokenClassification
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.BertForTokenClassification
    :members:


-``BertForQuestionAnswering``
+BertForQuestionAnswering
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.BertForQuestionAnswering
    :members:


-``TFBertModel``
+TFBertModel
 ~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.TFBertModel
    :members:


-``TFBertForPreTraining``
+TFBertForPreTraining
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.TFBertForPreTraining
    :members:


-``TFBertForMaskedLM``
+TFBertForMaskedLM
 ~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.TFBertForMaskedLM
    :members:


-``TFBertForNextSentencePrediction``
+TFBertForNextSentencePrediction
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.TFBertForNextSentencePrediction
    :members:


-``TFBertForSequenceClassification``
+TFBertForSequenceClassification
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.TFBertForSequenceClassification
    :members:


-``TFBertForMultipleChoice``
+TFBertForMultipleChoice
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.TFBertForMultipleChoice
    :members:


-``TFBertForTokenClassification``
+TFBertForTokenClassification
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.TFBertForTokenClassification
    :members:


-``TFBertForQuestionAnswering``
+TFBertForQuestionAnswering
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.TFBertForQuestionAnswering
--- a/docs/source/model_doc/camembert.rst
+++ b/docs/source/model_doc/camembert.rst
@@ -0,0 +1,99 @@
+CamemBERT
+----------------------------------------------------
+
+The CamemBERT model was proposed in `CamemBERT: a Tasty French Language Model <https://arxiv.org/abs/1911.03894>`__
+by Louis Martin, Benjamin Muller, Pedro Javier Ortiz Suárez, Yoann Dupont, Laurent Romary, Éric Villemonte de la
+Clergerie, Djamé Seddah, and Benoît Sagot. It is based on Facebook's RoBERTa model released in 2019. It is a model
+trained on 138GB of French text.
+
+The abstract from the paper is the following:
+
+*Pretrained language models are now ubiquitous in Natural Language Processing. Despite their success,
+most available models have either been trained on English data or on the concatenation of data in multiple
+languages. This makes practical use of such models --in all languages except English-- very limited. Aiming
+to address this issue for French, we release CamemBERT, a French version of the Bi-directional Encoders for
+Transformers (BERT). We measure the performance of CamemBERT compared to multilingual models in multiple
+downstream tasks, namely part-of-speech tagging, dependency parsing, named-entity recognition, and natural
+language inference. CamemBERT improves the state of the art for most of the tasks considered. We release the
+pretrained model for CamemBERT hoping to foster research and downstream applications for French NLP.*
+
+Tips:
+
+- This implementation is the same as RoBERTa. Refer to the `documentation of RoBERTa <./roberta.html>`__ for usage
+  examples as well as the information relative to the inputs and outputs.
+
+CamembertConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.CamembertConfig
+    :members:
+
+
+CamembertTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.CamembertTokenizer
+    :members:
+
+
+CamembertModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.CamembertModel
+    :members:
+
+
+CamembertForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.CamembertForMaskedLM
+    :members:
+
+
+CamembertForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.CamembertForSequenceClassification
+    :members:
+
+
+CamembertForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.CamembertForMultipleChoice
+    :members:
+
+
+CamembertForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.CamembertForTokenClassification
+    :members:
+
+
+TFCamembertModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFCamembertModel
+    :members:
+
+
+TFCamembertForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFCamembertForMaskedLM
+    :members:
+
+
+TFCamembertForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFCamembertForSequenceClassification
+    :members:
+
+
+TFCamembertForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFCamembertForTokenClassification
+    :members:
--- a/docs/source/model_doc/ctrl.rst
+++ b/docs/source/model_doc/ctrl.rst
@@ -1,42 +1,73 @@
 CTRL
 ----------------------------------------------------

-``CTRLConfig``
+CTRL model was proposed in `CTRL: A Conditional Transformer Language Model for Controllable Generation <https://arxiv.org/abs/1909.05858>`_
+by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
+It's a causal (unidirectional) transformer pre-trained using language modeling on a very large
+corpus of ~140 GB of text data with the first token reserved as a control code (such as Links, Books, Wikipedia etc.).
+
+The abstract from the paper is the following:
+
+*Large-scale language models show promising text generation capabilities, but users cannot easily control particular
+aspects of the generated text. We release CTRL, a 1.63 billion-parameter conditional transformer language model,
+trained to condition on control codes that govern style, content, and task-specific behavior. Control codes were
+derived from structure that naturally co-occurs with raw text, preserving the advantages of unsupervised learning
+while providing more explicit control over text generation. These codes also allow CTRL to predict which parts of
+the training data are most likely given a sequence. This provides a potential method for analyzing large amounts
+of data via model-based source attribution.*
+
+Tips:
+
+- CTRL makes use of control codes to generate text: it requires generations to be started by certain words, sentences
+  or links to generate coherent text. Refer to the `original implementation <https://github.com/salesforce/ctrl>`__
+  for more information.
+- CTRL is a model with absolute position embeddings so it's usually advised to pad the inputs on
+  the right rather than the left.
+- CTRL was trained with a causal language modeling (CLM) objective and is therefore powerful at predicting the next
+  token in a sequence. Leveraging this feature allows CTRL to generate syntactically coherent text as
+  it can be observed in the `run_generation.py` example script.
+- The PyTorch models can take the `past` as input, which is the previously computed key/value attention pairs. Using
+  this `past` value prevents the model from re-computing pre-computed values in the context of text generation.
+  See `reusing the past in generative models <../quickstart.html#using-the-past>`_ for more information on the usage
+  of this argument.
+
+
+CTRLConfig
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.CTRLConfig
    :members:


-``CTRLTokenizer``
+CTRLTokenizer
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.CTRLTokenizer
    :members:


-``CTRLModel``
+CTRLModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.CTRLModel
    :members:


-``CTRLLMHeadModel``
+CTRLLMHeadModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.CTRLLMHeadModel
    :members:


-``TFCTRLModel``
+TFCTRLModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.TFCTRLModel
    :members:


-``TFCTRLLMHeadModel``
+TFCTRLLMHeadModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.TFCTRLLMHeadModel
--- a/docs/source/model_doc/distilbert.rst
+++ b/docs/source/model_doc/distilbert.rst
@@ -1,69 +1,96 @@
 DistilBERT
 ----------------------------------------------------

-``DistilBertConfig``
+The DistilBERT model was proposed in the blog post
+`Smaller, faster, cheaper, lighter: Introducing DistilBERT, a distilled version of BERT <https://medium.com/huggingface/distilbert-8cf3380435b5>`__,
+and the paper `DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter <https://arxiv.org/abs/1910.01108>`__.
+DistilBERT is a small, fast, cheap and light Transformer model trained by distilling Bert base. It has 40% less
+parameters than `bert-base-uncased`, runs 60% faster while preserving over 95% of Bert's performances as measured on
+the GLUE language understanding benchmark.
+
+The abstract from the paper is the following:
+
+*As Transfer Learning from large-scale pre-trained models becomes more prevalent in Natural Language Processing (NLP),
+operating these large models in on-the-edge and/or under constrained computational training or inference budgets
+remains challenging. In this work, we propose a method to pre-train a smaller general-purpose language representation
+model, called DistilBERT, which can then be fine-tuned with good performances on a wide range of tasks like its larger
+counterparts. While most prior work investigated the use of distillation for building task-specific models, we
+leverage knowledge distillation during the pre-training phase and show that it is possible to reduce the size of a
+BERT model by 40%, while retaining 97% of its language understanding capabilities and being 60% faster. To leverage
+the inductive biases learned by larger models during pre-training, we introduce a triple loss combining language
+modeling, distillation and cosine-distance losses. Our smaller, faster and lighter model is cheaper to pre-train
+and we demonstrate its capabilities for on-device computations in a proof-of-concept experiment and a comparative
+on-device study.*
+
+Tips:
+
+- DistilBert doesn't have `token_type_ids`, you don't need to indicate which token belongs to which segment. Just separate your segments with the separation token `tokenizer.sep_token` (or `[SEP]`)
+- DistilBert doesn't have options to select the input positions (`position_ids` input). This could be added if necessary though, just let's us know if you need this option.
+
+
+DistilBertConfig
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.DistilBertConfig
    :members:


-``DistilBertTokenizer``
+DistilBertTokenizer
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.DistilBertTokenizer
    :members:


-``DistilBertModel``
+DistilBertModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.DistilBertModel
    :members:


-``DistilBertForMaskedLM``
+DistilBertForMaskedLM
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.DistilBertForMaskedLM
    :members:


-``DistilBertForSequenceClassification``
+DistilBertForSequenceClassification
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.DistilBertForSequenceClassification
    :members:


-``DistilBertForQuestionAnswering``
+DistilBertForQuestionAnswering
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.DistilBertForQuestionAnswering
    :members:

-``TFDistilBertModel``
+TFDistilBertModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.TFDistilBertModel
    :members:


-``TFDistilBertForMaskedLM``
+TFDistilBertForMaskedLM
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.TFDistilBertForMaskedLM
    :members:


-``TFDistilBertForSequenceClassification``
+TFDistilBertForSequenceClassification
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.TFDistilBertForSequenceClassification
    :members:


-``TFDistilBertForQuestionAnswering``
+TFDistilBertForQuestionAnswering
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.TFDistilBertForQuestionAnswering
--- a/docs/source/model_doc/flaubert.rst
+++ b/docs/source/model_doc/flaubert.rst
@@ -0,0 +1,72 @@
+FlauBERT
+----------------------------------------------------
+
+The FlauBERT model was proposed in the paper
+`FlauBERT: Unsupervised Language Model Pre-training for French <https://arxiv.org/abs/1912.05372>`__ by Hang Le et al.
+It's a transformer pre-trained using a masked language modeling (MLM) objective (BERT-like).
+
+The abstract from the paper is the following:
+
+*Language models have become a key step to achieve state-of-the art results in many different Natural Language
+Processing (NLP) tasks. Leveraging the huge amount of unlabeled texts nowadays available, they provide an efficient
+way to pre-train continuous word representations that can be fine-tuned for a downstream task, along with their
+contextualization at the sentence level. This has been widely demonstrated for English using contextualized
+representations (Dai and Le, 2015; Peters et al., 2018; Howard and Ruder, 2018; Radford et al., 2018; Devlin et
+al., 2019; Yang et al., 2019b). In this paper, we introduce and share FlauBERT, a model learned on a very large
+and heterogeneous French corpus. Models of different sizes are trained using the new CNRS (French National Centre
+for Scientific Research) Jean Zay supercomputer. We apply our French language models to diverse NLP tasks (text
+classification, paraphrasing, natural language inference, parsing, word sense disambiguation) and show that most
+of the time they outperform other pre-training approaches. Different versions of FlauBERT as well as a unified
+evaluation protocol for the downstream tasks, called FLUE (French Language Understanding Evaluation), are shared
+to the research community for further reproducible experiments in French NLP.*
+
+
+FlaubertConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaubertConfig
+    :members:
+
+
+FlaubertTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaubertTokenizer
+    :members:
+
+
+FlaubertModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaubertModel
+    :members:
+
+
+FlaubertWithLMHeadModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaubertWithLMHeadModel
+    :members:
+
+
+FlaubertForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaubertForSequenceClassification
+    :members:
+
+
+FlaubertForQuestionAnsweringSimple
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaubertForQuestionAnsweringSimple
+    :members:
+
+
+FlaubertForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaubertForQuestionAnswering
+    :members:
+
+
--- a/docs/source/model_doc/gpt.rst
+++ b/docs/source/model_doc/gpt.rst
@@ -1,56 +1,91 @@
 OpenAI GPT
 ----------------------------------------------------

-``OpenAIGPTConfig``
+Overview
+~~~~~~~~~~~~~~~~~~~~~
+
+OpenAI GPT model was proposed in `Improving Language Understanding by Generative Pre-Training <https://s3-us-west-2.amazonaws.com/openai-assets/research-covers/language-unsupervised/language_understanding_paper.pdf>`__
+by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever. It's a causal (unidirectional)
+transformer pre-trained using language modeling on a large corpus will long range dependencies, the Toronto Book Corpus.
+
+The abstract from the paper is the following:
+
+*Natural language understanding comprises a wide range of diverse tasks such
+as textual entailment, question answering, semantic similarity assessment, and
+document classification. Although large unlabeled text corpora are abundant,
+labeled data for learning these specific tasks is scarce, making it challenging for
+discriminatively trained models to perform adequately. We demonstrate that large
+gains on these tasks can be realized by generative pre-training of a language model
+on a diverse corpus of unlabeled text, followed by discriminative fine-tuning on each
+specific task. In contrast to previous approaches, we make use of task-aware input
+transformations during fine-tuning to achieve effective transfer while requiring
+minimal changes to the model architecture. We demonstrate the effectiveness of
+our approach on a wide range of benchmarks for natural language understanding.
+Our general task-agnostic model outperforms discriminatively trained models that
+use architectures specifically crafted for each task, significantly improving upon the
+state of the art in 9 out of the 12 tasks studied.*
+
+Tips:
+
+- GPT is a model with absolute position embeddings so it's usually advised to pad the inputs on
+  the right rather than the left.
+- GPT was trained with a causal language modeling (CLM) objective and is therefore powerful at predicting the next
+  token in a sequence. Leveraging this feature allows GPT-2 to generate syntactically coherent text as
+  it can be observed in the `run_generation.py` example script.
+
+`Write With Transformer <https://transformer.huggingface.co/doc/gpt>`__ is a webapp created and hosted by
+Hugging Face showcasing the generative capabilities of several models. GPT is one of them.
+
+OpenAIGPTConfig
 ~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.OpenAIGPTConfig
    :members:


-``OpenAIGPTTokenizer``
+OpenAIGPTTokenizer
 ~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.OpenAIGPTTokenizer
    :members:


-``OpenAIGPTModel``
+OpenAIGPTModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.OpenAIGPTModel
    :members:


-``OpenAIGPTLMHeadModel``
+OpenAIGPTLMHeadModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.OpenAIGPTLMHeadModel
    :members:


-``OpenAIGPTDoubleHeadsModel``
+OpenAIGPTDoubleHeadsModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.OpenAIGPTDoubleHeadsModel
    :members:


-``TFOpenAIGPTModel``
+TFOpenAIGPTModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.TFOpenAIGPTModel
    :members:


-``TFOpenAIGPTLMHeadModel``
+TFOpenAIGPTLMHeadModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.TFOpenAIGPTLMHeadModel
    :members:


-``TFOpenAIGPTDoubleHeadsModel``
+TFOpenAIGPTDoubleHeadsModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.TFOpenAIGPTDoubleHeadsModel
--- a/docs/source/model_doc/gpt2.rst
+++ b/docs/source/model_doc/gpt2.rst
@@ -1,56 +1,90 @@
 OpenAI GPT2
 ----------------------------------------------------

-``GPT2Config``
+Overview
+~~~~~~~~~~~~~~~~~~~~~
+
+OpenAI GPT-2 model was proposed in
+`Language Models are Unsupervised Multitask Learners`_
+by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
+It's a causal (unidirectional) transformer pre-trained using  language modeling on a very large
+corpus of ~40 GB of text data.
+
+The abstract from the paper is the following:
+
+*GPT-2 is a large transformer-based language model with 1.5 billion parameters, trained on a dataset[1]
+of 8 million web pages. GPT-2 is trained with a simple objective: predict the next word, given all of the previous
+words within some text. The diversity of the dataset causes this simple goal to contain naturally occurring
+demonstrations of many tasks across diverse domains. GPT-2 is a direct scale-up of GPT, with more than 10X
+the parameters and trained on more than 10X the amount of data.*
+
+Tips:
+
+- GPT-2 is a model with absolute position embeddings so it's usually advised to pad the inputs on
+  the right rather than the left.
+- GPT-2 was trained with a causal language modeling (CLM) objective and is therefore powerful at predicting the next
+  token in a sequence. Leveraging this feature allows GPT-2 to generate syntactically coherent text as
+  it can be observed in the `run_generation.py` example script.
+- The PyTorch models can take the `past` as input, which is the previously computed key/value attention pairs. Using
+  this `past` value prevents the model from re-computing pre-computed values in the context of text generation.
+  See `reusing the past in generative models <../quickstart.html#using-the-past>`_ for more information on the usage
+  of this argument.
+
+`Write With Transformer <https://transformer.huggingface.co/doc/gpt2-large>`__ is a webapp created and hosted by
+Hugging Face showcasing the generative capabilities of several models. GPT-2 is one of them and is available in five
+different sizes: small, medium, large, xl and a distilled version of the small checkpoint: distilgpt-2.
+
+
+GPT2Config
 ~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.GPT2Config
    :members:


-``GPT2Tokenizer``
+GPT2Tokenizer
 ~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.GPT2Tokenizer
    :members:


-``GPT2Model``
+GPT2Model
 ~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.GPT2Model
    :members:


-``GPT2LMHeadModel``
+GPT2LMHeadModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.GPT2LMHeadModel
    :members:


-``GPT2DoubleHeadsModel``
+GPT2DoubleHeadsModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.GPT2DoubleHeadsModel
    :members:


-``TFGPT2Model``
+TFGPT2Model
 ~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.TFGPT2Model
    :members:


-``TFGPT2LMHeadModel``
+TFGPT2LMHeadModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.TFGPT2LMHeadModel
    :members:


-``TFGPT2DoubleHeadsModel``
+TFGPT2DoubleHeadsModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.TFGPT2DoubleHeadsModel
--- a/docs/source/model_doc/roberta.rst
+++ b/docs/source/model_doc/roberta.rst
@@ -1,57 +1,97 @@
 RoBERTa
 ----------------------------------------------------

-``RobertaConfig``
+The RoBERTa model was proposed in `RoBERTa: A Robustly Optimized BERT Pretraining Approach <https://arxiv.org/abs/1907.11692>`_
+by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer,
+Veselin Stoyanov. It is based on Google's BERT model released in 2018.
+
+It builds on BERT and modifies key hyperparameters, removing the next-sentence pretraining
+objective and training with much larger mini-batches and learning rates.
+
+The abstract from the paper is the following:
+
+*Language model pretraining has led to significant performance gains but careful comparison between different
+approaches is challenging. Training is computationally expensive, often done on private datasets of different sizes,
+and, as we will show, hyperparameter choices have significant impact on the final results. We present a replication
+study of BERT pretraining (Devlin et al., 2019) that carefully measures the impact of many key hyperparameters and
+training data size. We find that BERT was significantly undertrained, and can match or exceed the performance of
+every model published after it. Our best model achieves state-of-the-art results on GLUE, RACE and SQuAD. These
+results highlight the importance of previously overlooked design choices, and raise questions about the source
+of recently reported improvements. We release our models and code.*
+
+Tips:
+
+- This implementation is the same as :class:`~transformers.BertModel` with a tiny embeddings tweak as well as a
+  setup for Roberta pretrained models.
+- RoBERTa has the same architecture as BERT, but uses a byte-level BPE as a tokenizer (same as GPT-2) and uses a
+  different pre-training scheme.
+- RoBERTa doesn't have `token_type_ids`, you don't need to indicate which token belongs to which segment. Just separate your segments with the separation token `tokenizer.sep_token` (or `</s>`)
+- `Camembert <./camembert.html>`__ is a wrapper around RoBERTa. Refer to this page for usage examples.
+
+RobertaConfig
 ~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.RobertaConfig
    :members:


-``RobertaTokenizer``
+RobertaTokenizer
 ~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.RobertaTokenizer
    :members:


-``RobertaModel``
+RobertaModel
 ~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.RobertaModel
    :members:


-``RobertaForMaskedLM``
+RobertaForMaskedLM
 ~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.RobertaForMaskedLM
    :members:


-``RobertaForSequenceClassification``
+RobertaForSequenceClassification
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.RobertaForSequenceClassification
    :members:


-``TFRobertaModel``
+RobertaForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.RobertaForTokenClassification
+    :members:
+
+TFRobertaModel
 ~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.TFRobertaModel
    :members:


-``TFRobertaForMaskedLM``
+TFRobertaForMaskedLM
 ~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.TFRobertaForMaskedLM
    :members:


-``TFRobertaForSequenceClassification``
+TFRobertaForSequenceClassification
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.TFRobertaForSequenceClassification
    :members:
+
+
+TFRobertaForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFRobertaForTokenClassification
+    :members:
--- a/docs/source/model_doc/transformerxl.rst
+++ b/docs/source/model_doc/transformerxl.rst
@@ -1,43 +1,72 @@
 Transformer XL
 ----------------------------------------------------

+Overview
+~~~~~~~~~~~~~~~~~~~~~

-``TransfoXLConfig``
+The Transformer-XL model was proposed in
+`Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context <https://arxiv.org/abs/1901.02860>`__
+by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
+It's a causal (uni-directional) transformer with relative positioning (sinusoïdal) embeddings which can reuse
+previously computed hidden-states to attend to longer context (memory).
+This model also uses adaptive softmax inputs and outputs (tied).
+
+The abstract from the paper is the following:
+
+*Transformers have a potential of learning longer-term dependency, but are limited by a fixed-length context in the
+setting of language modeling. We propose a novel neural architecture Transformer-XL that enables learning dependency
+beyond a fixed length without disrupting temporal coherence. It consists of a segment-level recurrence mechanism and
+a novel positional encoding scheme. Our method not only enables capturing longer-term dependency, but also resolves
+the context fragmentation problem. As a result, Transformer-XL learns dependency that is 80% longer than RNNs and
+450% longer than vanilla Transformers, achieves better performance on both short and long sequences, and is up
+to 1,800+ times faster than vanilla Transformers during evaluation. Notably, we improve the state-of-the-art results
+of bpc/perplexity to 0.99 on enwiki8, 1.08 on text8, 18.3 on WikiText-103, 21.8 on One Billion Word, and 54.5 on
+Penn Treebank (without finetuning). When trained only on WikiText-103, Transformer-XL manages to generate reasonably
+coherent, novel text articles with thousands of tokens.*
+
+Tips:
+
+- Transformer-XL uses relative sinusoidal positional embeddings. Padding can be done on the left or on the right.
+  The original implementation trains on SQuAD with padding on the left, therefore the padding defaults are set to left.
+- Transformer-XL is one of the few models that has no sequence length limit.
+
+
+TransfoXLConfig
 ~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.TransfoXLConfig
    :members:


-``TransfoXLTokenizer``
+TransfoXLTokenizer
 ~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.TransfoXLTokenizer
    :members:


-``TransfoXLModel``
+TransfoXLModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.TransfoXLModel
    :members:


-``TransfoXLLMHeadModel``
+TransfoXLLMHeadModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.TransfoXLLMHeadModel
    :members:


-``TFTransfoXLModel``
+TFTransfoXLModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.TFTransfoXLModel
    :members:


-``TFTransfoXLLMHeadModel``
+TFTransfoXLLMHeadModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.TFTransfoXLLMHeadModel
--- a/docs/source/model_doc/xlm.rst
+++ b/docs/source/model_doc/xlm.rst
@@ -1,68 +1,105 @@
 XLM
 ----------------------------------------------------

-``XLMConfig``
+Overview
+~~~~~~~~~~~~~~~~~~~~~
+
+The XLM model was proposed in `Cross-lingual Language Model Pretraining <https://arxiv.org/abs/1901.07291>`_
+by Guillaume Lample*, Alexis Conneau*. It's a transformer pre-trained using one of the following objectives:
+
+- a causal language modeling (CLM) objective (next token prediction),
+- a masked language modeling (MLM) objective (Bert-like), or
+- a Translation Language Modeling (TLM) object (extension of Bert's MLM to multiple language inputs)
+
+The abstract from the paper is the following:
+
+*Recent studies have demonstrated the efficiency of generative pretraining for English natural language understanding.
+In this work, we extend this approach to multiple languages and show the effectiveness of cross-lingual pretraining.
+We propose two methods to learn cross-lingual language models (XLMs): one unsupervised that only relies on monolingual
+data, and one supervised that leverages parallel data with a new cross-lingual language model objective. We obtain
+state-of-the-art results on cross-lingual classification, unsupervised and supervised machine translation. On XNLI,
+our approach pushes the state of the art by an absolute gain of 4.9% accuracy. On unsupervised machine translation,
+we obtain 34.3 BLEU on WMT'16 German-English, improving the previous state of the art by more than 9 BLEU. On
+supervised machine translation, we obtain a new state of the art of 38.5 BLEU on WMT'16 Romanian-English, outperforming
+the previous best approach by more than 4 BLEU. Our code and pretrained models will be made publicly available.*
+
+Tips:
+
+- XLM has many different checkpoints, which were trained using different objectives: CLM, MLM or TLM. Make sure to
+  select the correct objective for your task (e.g. MLM checkpoints are not suitable for generation).
+- XLM has multilingual checkpoints which leverage a specific `lang` parameter. Check out the
+  `multi-lingual <../multilingual.html>`__ page for more information.
+
+
+XLMConfig
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.XLMConfig
    :members:

-``XLMTokenizer``
+XLMTokenizer
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.XLMTokenizer
    :members:

-``XLMModel``
+XLMModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.XLMModel
    :members:


-``XLMWithLMHeadModel``
+XLMWithLMHeadModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.XLMWithLMHeadModel
    :members:


-``XLMForSequenceClassification``
+XLMForSequenceClassification
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.XLMForSequenceClassification
    :members:


-``XLMForQuestionAnswering``
+XLMForQuestionAnsweringSimple
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.XLMForQuestionAnsweringSimple
+    :members:
+
+
+XLMForQuestionAnswering
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.XLMForQuestionAnswering
    :members:


-``TFXLMModel``
+TFXLMModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.TFXLMModel
    :members:


-``TFXLMWithLMHeadModel``
+TFXLMWithLMHeadModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.TFXLMWithLMHeadModel
    :members:


-``TFXLMForSequenceClassification``
+TFXLMForSequenceClassification
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.TFXLMForSequenceClassification
    :members:


-``TFXLMForQuestionAnsweringSimple``
+TFXLMForQuestionAnsweringSimple
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.TFXLMForQuestionAnsweringSimple
--- a/docs/source/model_doc/xlmroberta.rst
+++ b/docs/source/model_doc/xlmroberta.rst
@@ -0,0 +1,105 @@
+XLM-RoBERTa
+------------------------------------------
+
+The XLM-RoBERTa model was proposed in `Unsupervised Cross-lingual Representation Learning at Scale <https://arxiv.org/abs/1911.02116>`__
+by Alexis Conneau, Kartikay Khandelwal, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán,
+Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. It is based on Facebook's RoBERTa model released in 2019.
+It is a large multi-lingual language model, trained on 2.5TB of filtered CommonCrawl data.
+
+The abstract from the paper is the following:
+
+*This paper shows that pretraining multilingual language models at scale leads to significant performance gains for
+a wide range of cross-lingual transfer tasks. We train a Transformer-based masked language model on one hundred
+languages, using more than two terabytes of filtered CommonCrawl data. Our model, dubbed XLM-R, significantly
+outperforms multilingual BERT (mBERT) on a variety of cross-lingual benchmarks, including +13.8% average accuracy
+on XNLI, +12.3% average F1 score on MLQA, and +2.1% average F1 score on NER. XLM-R performs particularly well on
+low-resource languages, improving 11.8% in XNLI accuracy for Swahili and 9.2% for Urdu over the previous XLM model.
+We also present a detailed empirical evaluation of the key factors that are required to achieve these gains,
+including the trade-offs between (1) positive transfer and capacity dilution and (2) the performance of high and
+low resource languages at scale. Finally, we show, for the first time, the possibility of multilingual modeling
+without sacrificing per-language performance; XLM-Ris very competitive with strong monolingual models on the GLUE
+and XNLI benchmarks. We will make XLM-R code, data, and models publicly available.*
+
+Tips:
+
+- XLM-R is a multilingual model trained on 100 different languages. Unlike some XLM multilingual models, it does
+  not require `lang` tensors to understand which language is used, and should be able to determine the correct
+  language from the input ids.
+- This implementation is the same as RoBERTa. Refer to the `documentation of RoBERTa <./roberta.html>`__ for usage
+  examples as well as the information relative to the inputs and outputs.
+
+XLMRobertaConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.XLMRobertaConfig
+    :members:
+
+
+XLMRobertaTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.XLMRobertaTokenizer
+    :members:
+
+
+XLMRobertaModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.XLMRobertaModel
+    :members:
+
+
+XLMRobertaForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.XLMRobertaForMaskedLM
+    :members:
+
+
+XLMRobertaForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.XLMRobertaForSequenceClassification
+    :members:
+
+
+XLMRobertaForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.XLMRobertaForMultipleChoice
+    :members:
+
+
+XLMRobertaForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.XLMRobertaForTokenClassification
+    :members:
+
+
+TFXLMRobertaModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFXLMRobertaModel
+    :members:
+
+
+TFXLMRobertaForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFXLMRobertaForMaskedLM
+    :members:
+
+
+TFXLMRobertaForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFXLMRobertaForSequenceClassification
+    :members:
+
+
+TFXLMRobertaForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFXLMRobertaForTokenClassification
+    :members:
--- a/docs/source/model_doc/xlnet.rst
+++ b/docs/source/model_doc/xlnet.rst
@@ -1,70 +1,123 @@
 XLNet
 ----------------------------------------------------

-``XLNetConfig``
+Overview
+~~~~~~~~~~~~~~~~~~~~~
+
+The XLNet model was proposed in `XLNet: Generalized Autoregressive Pretraining for Language Understanding <https://arxiv.org/abs/1906.08237>`_
+by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
+XLnet is an extension of the Transformer-XL model pre-trained using an autoregressive method
+to learn bidirectional contexts by maximizing the expected likelihood over all permutations
+of the input sequence factorization order.
+
+The abstract from the paper is the following:
+
+*With the capability of modeling bidirectional contexts, denoising autoencoding based pretraining like BERT achieves
+better performance than pretraining approaches based on autoregressive language modeling. However, relying on
+corrupting the input with masks, BERT neglects dependency between the masked positions and suffers from a
+pretrain-finetune discrepancy. In light of these pros and cons, we propose XLNet, a generalized autoregressive
+pretraining method that (1) enables learning bidirectional contexts by maximizing the expected likelihood over
+all permutations of the factorization order and (2) overcomes the limitations of BERT thanks to its autoregressive
+formulation. Furthermore, XLNet integrates ideas from Transformer-XL, the state-of-the-art autoregressive model,
+into pretraining. Empirically, under comparable experiment settings, XLNet outperforms BERT on 20 tasks, often by
+a large margin, including question answering, natural language inference, sentiment analysis, and document ranking.*
+
+Tips:
+
+- The specific attention pattern can be controlled at training and test time using the `perm_mask` input.
+- Due to the difficulty of training a fully auto-regressive model over various factorization order,
+  XLNet is pretrained using only a sub-set of the output tokens as target which are selected
+  with the `target_mapping` input.
+- To use XLNet for sequential decoding (i.e. not in fully bi-directional setting), use the `perm_mask` and
+  `target_mapping` inputs to control the attention span and outputs (see examples in `examples/run_generation.py`)
+- XLNet is one of the few models that has no sequence length limit.
+
+
+XLNetConfig
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.XLNetConfig
    :members:


-``XLNetTokenizer``
+XLNetTokenizer
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.XLNetTokenizer
    :members:


-``XLNetModel``
+XLNetModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.XLNetModel
    :members:


-``XLNetLMHeadModel``
+XLNetLMHeadModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.XLNetLMHeadModel
    :members:


-``XLNetForSequenceClassification``
+XLNetForSequenceClassification
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.XLNetForSequenceClassification
    :members:


-``XLNetForQuestionAnswering``
+XLNetForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.XLNetForTokenClassification
+    :members:
+
+
+XLNetForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.XLNetForMultipleChoice
+    :members:
+
+
+XLNetForQuestionAnsweringSimple
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.XLNetForQuestionAnsweringSimple
+    :members:
+
+
+XLNetForQuestionAnswering
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.XLNetForQuestionAnswering
    :members:


-``TFXLNetModel``
+TFXLNetModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.TFXLNetModel
    :members:


-``TFXLNetLMHeadModel``
+TFXLNetLMHeadModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.TFXLNetLMHeadModel
    :members:


-``TFXLNetForSequenceClassification``
+TFXLNetForSequenceClassification
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.TFXLNetForSequenceClassification
    :members:


-``TFXLNetForQuestionAnsweringSimple``
+TFXLNetForQuestionAnsweringSimple
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.TFXLNetForQuestionAnsweringSimple
--- a/docs/source/model_sharing.md
+++ b/docs/source/model_sharing.md
@@ -0,0 +1,48 @@
+# Model upload and sharing
+
+Starting with `v2.2.2`, you can now upload and share your fine-tuned models with the community, using the <abbr title="Command-line interface">CLI</abbr> that's built-in to the library.
+
+**First, create an account on [https://huggingface.co/join](https://huggingface.co/join)**. Then:
+
+```shell
+transformers-cli login
+# log in using the same credentials as on huggingface.co
+```
+Upload your model:
+```shell
+transformers-cli upload ./path/to/pretrained_model/
+
+# ^^ Upload folder containing weights/tokenizer/config
+# saved via `.save_pretrained()`
+
+transformers-cli upload ./config.json [--filename folder/foobar.json]
+
+# ^^ Upload a single file
+# (you can optionally override its filename, which can be nested inside a folder)
+```
+
+Your model will then be accessible through its identifier, a concatenation of your username and the folder name above:
+```python
+"username/pretrained_model"
+```
+
+**Please add a README.md model card** to the repo under `model_cards/` with: model description, training params (dataset, preprocessing, hyperparameters), evaluation results, intended uses & limitations, etc.
+
+Your model now has a page on huggingface.co/models 🔥
+
+Anyone can load it from code:
+```python
+tokenizer = AutoTokenizer.from_pretrained("username/pretrained_model")
+model = AutoModel.from_pretrained("username/pretrained_model")
+```
+
+List all your files on S3:
+```shell
+transformers-cli s3 ls
+```
+
+You can also delete unneeded files:
+
+```shell
+transformers-cli s3 rm …
+```
--- a/docs/source/pretrained_models.rst
+++ b/docs/source/pretrained_models.rst
@@ -3,6 +3,7 @@ Pretrained models

 Here is the full list of the currently provided pretrained models together with a short presentation of each model.

+For a list that includes community-uploaded models, refer to `https://huggingface.co/models <https://huggingface.co/models>`__.

 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 | Architecture      | Shortcut name                                              | Details of the model                                                                                                                  |
@@ -53,6 +54,44 @@ Here is the full list of the currently provided pretrained models together with
 |                   | ``bert-base-cased-finetuned-mrpc``                         | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
 |                   |                                                            | | The ``bert-base-cased`` model fine-tuned on MRPC                                                                                    |
 |                   |                                                            | (see `details of fine-tuning in the example section <https://huggingface.co/transformers/examples.html>`__)                           |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-base-german-dbmdz-cased``                           | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                   |                                                            | | Trained on cased German text by DBMDZ                                                                                               |
+|                   |                                                            | (see `details on dbmdz repository <https://github.com/dbmdz/german-bert>`__).                                                         |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-base-german-dbmdz-uncased``                         | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                   |                                                            | | Trained on uncased German text by DBMDZ                                                                                             |
+|                   |                                                            | (see `details on dbmdz repository <https://github.com/dbmdz/german-bert>`__).                                                         |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-base-japanese``                                     | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                   |                                                            | | Trained on Japanese text. Text is tokenized with MeCab and WordPiece.                                                               |
+|                   |                                                            | | `MeCab <https://taku910.github.io/mecab/>`__ is required for tokenization.                                                          |
+|                   |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                               |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-base-japanese-whole-word-masking``                  | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                   |                                                            | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized with MeCab and WordPiece.                                      |
+|                   |                                                            | | `MeCab <https://taku910.github.io/mecab/>`__ is required for tokenization.                                                          |
+|                   |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                               |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-base-japanese-char``                                | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                   |                                                            | | Trained on Japanese text. Text is tokenized into characters.                                                                        |
+|                   |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                               |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-base-japanese-char-whole-word-masking``             | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                   |                                                            | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized into characters.                                               |
+|                   |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                               |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-base-finnish-cased-v1``                             | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                   |                                                            | | Trained on cased Finnish text.                                                                                                      |
+|                   |                                                            | (see `details on turkunlp.org <http://turkunlp.org/FinBERT/>`__).                                                                     |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-base-finnish-uncased-v1``                           | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                   |                                                            | | Trained on uncased Finnish text.                                                                                                    |
+|                   |                                                            | (see `details on turkunlp.org <http://turkunlp.org/FinBERT/>`__).                                                                     |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-base-dutch-cased``                                  | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                   |                                                            | | Trained on cased Dutch text.                                                                                                        |
+|                   |                                                            | (see `details on wietsedv repository <https://github.com/wietsedv/bertje/>`__).                                                       |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 | GPT               | ``openai-gpt``                                             | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
 |                   |                                                            | | OpenAI GPT English model                                                                                                            |
@@ -65,6 +104,9 @@ Here is the full list of the currently provided pretrained models together with
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``gpt2-large``                                             | | 36-layer, 1280-hidden, 20-heads, 774M parameters.                                                                                   |
 |                   |                                                            | | OpenAI's Large-sized GPT-2 English model                                                                                            |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``gpt2-xl``                                                | | 48-layer, 1600-hidden, 25-heads, 1558M parameters.                                                                                  |
+|                   |                                                            | | OpenAI's XL-sized GPT-2 English model                                                                                               |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 | Transformer-XL    | ``transfo-xl-wt103``                                       | | 18-layer, 1024-hidden, 16-heads, 257M parameters.                                                                                   |
 |                   |                                                            | | English model trained on wikitext-103                                                                                               |
@@ -116,6 +158,18 @@ Here is the full list of the currently provided pretrained models together with
 |                   | ``roberta-large-mnli``                                     | | 24-layer, 1024-hidden, 16-heads, 355M parameters                                                                                    |
 |                   |                                                            | | ``roberta-large`` fine-tuned on `MNLI <http://www.nyu.edu/projects/bowman/multinli/>`__.                                            |
 |                   |                                                            | (see `details <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`__)                                                   |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``distilroberta-base``                                     | | 6-layer, 768-hidden, 12-heads, 82M parameters                                                                                       |
+|                   |                                                            | | The DistilRoBERTa model distilled from the RoBERTa model `roberta-base` checkpoint.                                                 |
+|                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``roberta-base-openai-detector``                           | | 12-layer, 768-hidden, 12-heads, 125M parameters                                                                                     |
+|                   |                                                            | | ``roberta-base`` fine-tuned by OpenAI on the outputs of the 1.5B-parameter GPT-2 model.                                             |
+|                   |                                                            | (see `details <https://github.com/openai/gpt-2-output-dataset/tree/master/detector>`__)                                               |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``roberta-large-openai-detector``                          | | 24-layer, 1024-hidden, 16-heads, 355M parameters                                                                                    |
+|                   |                                                            | | ``roberta-large`` fine-tuned by OpenAI on the outputs of the 1.5B-parameter GPT-2 model.                                            |
+|                   |                                                            | (see `details <https://github.com/openai/gpt-2-output-dataset/tree/master/detector>`__)                                               |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 | DistilBERT        | ``distilbert-base-uncased``                                | | 6-layer, 768-hidden, 12-heads, 66M parameters                                                                                       |
 |                   |                                                            | | The DistilBERT model distilled from the BERT model `bert-base-uncased` checkpoint                                                   |
@@ -125,12 +179,102 @@ Here is the full list of the currently provided pretrained models together with
 |                   |                                                            | | The DistilBERT model distilled from the BERT model `bert-base-uncased` checkpoint, with an additional linear layer.                 |
 |                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``distilbert-base-cased``                                  | | 6-layer, 768-hidden, 12-heads, 65M parameters                                                                                       |
+|                   |                                                            | | The DistilBERT model distilled from the BERT model `bert-base-cased` checkpoint                                                     |
+|                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``distilbert-base-cased-distilled-squad``                  | | 6-layer, 768-hidden, 12-heads, 65M parameters                                                                                       |
+|                   |                                                            | | The DistilBERT model distilled from the BERT model `bert-base-cased` checkpoint, with an additional question answering layer.       |
+|                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``distilgpt2``                                             | | 6-layer, 768-hidden, 12-heads, 82M parameters                                                                                       |
 |                   |                                                            | | The DistilGPT2 model distilled from the GPT2 model `gpt2` checkpoint.                                                               |
 |                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``distilbert-base-german-cased``                           | | 6-layer, 768-hidden, 12-heads, 66M parameters                                                                                       |
+|                   |                                                            | | The German DistilBERT model distilled from the German DBMDZ BERT model `bert-base-german-dbmdz-cased` checkpoint.                   |
+|                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``distilbert-base-multilingual-cased``                     | | 6-layer, 768-hidden, 12-heads, 134M parameters                                                                                      |
+|                   |                                                            | | The multilingual DistilBERT model distilled from the Multilingual BERT model `bert-base-multilingual-cased` checkpoint.             |
+|                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 | CTRL              | ``ctrl``                                                   | | 48-layer, 1280-hidden, 16-heads, 1.6B parameters                                                                                    |
 |                   |                                                            | | Salesforce's Large-sized CTRL English model                                                                                         |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+| CamemBERT         | ``camembert-base``                                         | | 12-layer, 768-hidden, 12-heads, 110M parameters                                                                                     |
+|                   |                                                            | | CamemBERT using the BERT-base architecture                                                                                          |
+|                   |                                                            | (see `details <https://github.com/pytorch/fairseq/tree/master/examples/camembert>`__)                                                 |
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+| ALBERT            | ``albert-base-v1``                                         | | 12 repeating layers, 128 embedding, 768-hidden, 12-heads, 11M parameters                                                            |
+|                   |                                                            | | ALBERT base model                                                                                                                   |
+|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``albert-large-v1``                                        | | 24 repeating layers, 128 embedding, 1024-hidden, 16-heads, 17M parameters                                                           |
+|                   |                                                            | | ALBERT large model                                                                                                                  |
+|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``albert-xlarge-v1``                                       | | 24 repeating layers, 128 embedding, 2048-hidden, 16-heads, 58M parameters                                                           |
+|                   |                                                            | | ALBERT xlarge model                                                                                                                 |
+|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``albert-xxlarge-v1``                                      | | 12 repeating layer, 128 embedding, 4096-hidden, 64-heads, 223M parameters                                                           |
+|                   |                                                            | | ALBERT xxlarge model                                                                                                                |
+|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``albert-base-v2``                                         | | 12 repeating layers, 128 embedding, 768-hidden, 12-heads, 11M parameters                                                            |
+|                   |                                                            | | ALBERT base model with no dropout, additional training data and longer training                                                     |
+|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``albert-large-v2``                                        | | 24 repeating layers, 128 embedding, 1024-hidden, 16-heads, 17M parameters                                                           |
+|                   |                                                            | | ALBERT large model with no dropout, additional training data and longer training                                                    |
+|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``albert-xlarge-v2``                                       | | 24 repeating layers, 128 embedding, 2048-hidden, 16-heads, 58M parameters                                                           |
+|                   |                                                            | | ALBERT xlarge model with no dropout, additional training data and longer training                                                   |
+|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``albert-xxlarge-v2``                                      | | 12 repeating layer, 128 embedding, 4096-hidden, 64-heads, 223M parameters                                                           |
+|                   |                                                            | | ALBERT xxlarge model with no dropout, additional training data and longer training                                                  |
+|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+| T5                | ``t5-small``                                               | | ~60M parameters with 6-layers, 512-hidden-state, 2048 feed-forward hidden-state, 8-heads,                                           |
+|                   |                                                            | | Trained on English text: the Colossal Clean Crawled Corpus (C4)                                                                     |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``t5-base``                                                | | ~220M parameters with 12-layers, 768-hidden-state, 3072 feed-forward hidden-state, 12-heads,                                        |
+|                   |                                                            | | Trained on English text: the Colossal Clean Crawled Corpus (C4)                                                                     |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``t5-large``                                               | | ~770M parameters with 24-layers, 1024-hidden-state, 4096 feed-forward hidden-state, 16-heads,                                       |
+|                   |                                                            | | Trained on English text: the Colossal Clean Crawled Corpus (C4)                                                                     |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``t5-3B``                                                  | | ~2.8B parameters with 24-layers, 1024-hidden-state, 16384 feed-forward hidden-state, 32-heads,                                      |
+|                   |                                                            | | Trained on English text: the Colossal Clean Crawled Corpus (C4)                                                                     |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``t5-11B``                                                 | | ~11B parameters with 24-layers, 1024-hidden-state, 65536 feed-forward hidden-state, 128-heads,                                      |
+|                   |                                                            | | Trained on English text: the Colossal Clean Crawled Corpus (C4)                                                                     |
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+| XLM-RoBERTa       | ``xlm-roberta-base``                                       | | ~125M parameters with 12-layers, 768-hidden-state, 3072 feed-forward hidden-state, 8-heads,                                         |
+|                   |                                                            | | Trained on on 2.5 TB of newly created clean CommonCrawl data in 100 languages                                                       |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``xlm-roberta-large``                                      | | ~355M parameters with 24-layers, 1027-hidden-state, 4096 feed-forward hidden-state, 16-heads,                                       |
+|                   |                                                            | | Trained on 2.5 TB of newly created clean CommonCrawl data in 100 languages                                                          |
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+| FlauBERT          | ``flaubert-small-cased``                                   | | 6-layer, 512-hidden, 8-heads, 54M parameters                                                                                        |
+|                   |                                                            | | FlauBERT small architecture                                                                                                         |
+|                   |                                                            | (see `details <https://github.com/getalp/Flaubert>`__)                                                                                |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``flaubert-base-uncased``                                  | | 12-layer, 768-hidden, 12-heads, 137M parameters                                                                                     |
+|                   |                                                            | | FlauBERT base architecture with uncased vocabulary                                                                                  |
+|                   |                                                            | (see `details <https://github.com/getalp/Flaubert>`__)                                                                                |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``flaubert-base-cased``                                    | | 12-layer, 768-hidden, 12-heads, 138M parameters                                                                                     |
+|                   |                                                            | | FlauBERT base architecture with cased vocabulary                                                                                    |
+|                   |                                                            | (see `details <https://github.com/getalp/Flaubert>`__)                                                                                |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``flaubert-large-cased``                                   | | 24-layer, 1024-hidden, 16-heads, 373M parameters                                                                                    |
+|                   |                                                            | | FlauBERT large architecture                                                                                                         |
+|                   |                                                            | (see `details <https://github.com/getalp/Flaubert>`__)                                                                                |
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+

-.. <https://huggingface.co/transformers/examples.html>`__
+
+.. <https://huggingface.co/transformers/examples.html>`__
--- a/docs/source/quickstart.md
+++ b/docs/source/quickstart.md
@@ -188,3 +188,128 @@ assert predicted_text == 'Who was Jim Henson? Jim Henson was a man'
 ```

 Examples for each model class of each model architecture (Bert, GPT, GPT-2, Transformer-XL, XLNet and XLM) can be found in the [documentation](#documentation).
+
+#### Using the past
+
+GPT-2 as well as some other models (GPT, XLNet, Transfo-XL, CTRL) make use of a `past` or `mems` attribute which can be used to prevent re-computing the key/value pairs when using sequential decoding. It is useful when generating sequences as a big part of the attention mechanism benefits from previous computations.
+
+Here is a fully-working example using the `past` with `GPT2LMHeadModel` and argmax decoding (which should only be used as an example, as argmax decoding introduces a lot of repetition):
+
+```python
+from transformers import GPT2LMHeadModel, GPT2Tokenizer
+import torch
+
+tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+model = GPT2LMHeadModel.from_pretrained('gpt2')
+
+generated = tokenizer.encode("The Manhattan bridge")
+context = torch.tensor([generated])
+past = None
+
+for i in range(100):
+    print(i)
+    output, past = model(context, past=past)
+    token = torch.argmax(output[..., -1, :])
+
+    generated += [token.tolist()]
+    context = token.unsqueeze(0)
+
+sequence = tokenizer.decode(generated)
+
+print(sequence)
+```
+
+The model only requires a single token as input as all the previous tokens' key/value pairs are contained in the `past`.
+
+### Model2Model example
+
+Encoder-decoder architectures require two tokenized inputs: one for the encoder and the other one for the decoder. Let's assume that we want to use `Model2Model` for generative question answering, and start by tokenizing the question and answer that will be fed to the model.
+
+```python
+import torch
+from transformers import BertTokenizer, Model2Model
+
+# OPTIONAL: if you want to have more information on what's happening under the hood, activate the logger as follows
+import logging
+logging.basicConfig(level=logging.INFO)
+
+# Load pre-trained model tokenizer (vocabulary)
+tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+
+# Encode the input to the encoder (the question)
+question = "Who was Jim Henson?"
+encoded_question = tokenizer.encode(question)
+
+# Encode the input to the decoder (the answer)
+answer = "Jim Henson was a puppeteer"
+encoded_answer = tokenizer.encode(answer)
+
+# Convert inputs to PyTorch tensors
+question_tensor = torch.tensor([encoded_question])
+answer_tensor = torch.tensor([encoded_answer])
+```
+
+Let's see how we can use `Model2Model` to get the value of the loss associated with this (question, answer) pair:
+
+```python
+# In order to compute the loss we need to provide language model
+# labels (the token ids that the model should have produced) to
+# the decoder.
+lm_labels =  encoded_answer
+labels_tensor = torch.tensor([lm_labels])
+
+# Load pre-trained model (weights)
+model = Model2Model.from_pretrained('bert-base-uncased')
+
+# Set the model in evaluation mode to deactivate the DropOut modules
+# This is IMPORTANT to have reproducible results during evaluation!
+model.eval()
+
+# If you have a GPU, put everything on cuda
+question_tensor = question_tensor.to('cuda')
+answer_tensor = answer_tensor.to('cuda')
+labels_tensor = labels_tensor.to('cuda')
+model.to('cuda')
+
+# Predict hidden states features for each layer
+with torch.no_grad():
+    # See the models docstrings for the detail of the inputs
+    outputs = model(question_tensor, answer_tensor, decoder_lm_labels=labels_tensor)
+    # Transformers models always output tuples.
+    # See the models docstrings for the detail of all the outputs
+    # In our case, the first element is the value of the LM loss 
+    lm_loss = outputs[0]
+```
+
+This loss can be used to fine-tune `Model2Model` on the question answering task. Assuming that we fine-tuned the model, let us now see how to generate an answer:
+
+```python
+# Let's re-use the previous question
+question = "Who was Jim Henson?"
+encoded_question = tokenizer.encode(question)
+question_tensor = torch.tensor([encoded_question])
+
+# This time we try to generate the answer, so we start with an empty sequence
+answer = "[CLS]"
+encoded_answer = tokenizer.encode(answer, add_special_tokens=False)
+answer_tensor = torch.tensor([encoded_answer])
+
+# Load pre-trained model (weights)
+model = Model2Model.from_pretrained('fine-tuned-weights')
+model.eval()
+
+# If you have a GPU, put everything on cuda
+question_tensor = question_tensor.to('cuda')
+answer_tensor = answer_tensor.to('cuda')
+model.to('cuda')
+
+# Predict all tokens
+with torch.no_grad():
+    outputs = model(question_tensor, answer_tensor)
+    predictions = outputs[0]
+
+# confirm we were able to predict 'jim'
+predicted_index = torch.argmax(predictions[0, -1]).item()
+predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
+assert predicted_token == 'jim'
+```
--- a/docs/source/serialization.rst
+++ b/docs/source/serialization.rst
@@ -33,6 +33,8 @@ where
    * ``bert-large-uncased-whole-word-masking``: 24-layer, 1024-hidden, 16-heads, 340M parameters - Trained with Whole Word Masking (mask all of the the tokens corresponding to a word at once)
    * ``bert-large-cased-whole-word-masking``: 24-layer, 1024-hidden, 16-heads, 340M parameters - Trained with Whole Word Masking (mask all of the the tokens corresponding to a word at once)
    * ``bert-large-uncased-whole-word-masking-finetuned-squad``: The ``bert-large-uncased-whole-word-masking`` model finetuned on SQuAD (using the ``run_bert_squad.py`` examples). Results: *exact_match: 86.91579943235573, f1: 93.1532499015869*
+    * ``bert-base-german-dbmdz-cased``: Trained on German data only, 12-layer, 768-hidden, 12-heads, 110M parameters `Performance Evaluation <https://github.com/dbmdz/german-bert>`__
+    * ``bert-base-german-dbmdz-uncased``: Trained on (uncased) German data only, 12-layer, 768-hidden, 12-heads, 110M parameters `Performance Evaluation <https://github.com/dbmdz/german-bert>`__
    * ``openai-gpt``: OpenAI GPT English model, 12-layer, 768-hidden, 12-heads, 110M parameters
    * ``gpt2``: OpenAI GPT-2 English model, 12-layer, 768-hidden, 12-heads, 117M parameters
    * ``gpt2-medium``: OpenAI GPT-2 English model, 24-layer, 1024-hidden, 16-heads, 345M parameters
@@ -104,7 +106,7 @@ This section explain how you can save and re-load a fine-tuned model (BERT, GPT,
 There are three types of files you need to save to be able to reload a fine-tuned model:


-* the model it-self which should be saved following PyTorch serialization `best practices <https://pytorch.org/docs/stable/notes/serialization.html#best-practices>`__\ ,
+* the model itself which should be saved following PyTorch serialization `best practices <https://pytorch.org/docs/stable/notes/serialization.html#best-practices>`__\ ,
 * the configuration file of the model which is saved as a JSON file, and
 * the vocabulary (and the merges for the BPE-based models GPT and GPT-2).

--- a/examples/README.md
+++ b/examples/README.md
@@ -3,24 +3,61 @@
 In this section a few examples are put together. All of these examples work for several models, making use of the very
 similar API between the different models.

+**Important**  
+To run the latest versions of the examples, you have to install from source and install some specific requirements for the examples.
+Execute the following steps in a new virtual environment:
+
+```bash
+git clone https://github.com/huggingface/transformers
+cd transformers
+pip install .
+pip install -r ./examples/requirements.txt
+```
+
 | Section                    | Description                                                                                                                                                |
 |----------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| [Language Model fine-tuning](#language-model-fine-tuning) | Fine-tuning the library models for language modeling on a text dataset. Causal language modeling for GPT/GPT-2, masked language modeling for BERT/RoBERTa. |
-| [Language Generation](#language-generation) | Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, Transformer-XL and XLNet.                                         |
-| [GLUE](#glue) | Examples running BERT/XLM/XLNet/RoBERTa on the 9 GLUE tasks. Examples feature distributed training as well as half-precision.                              |
-| [SQuAD](#squad) | Using BERT for question answering, examples with distributed training.                                                                                  |
-| [Multiple Choice](#multiple-choice) | Examples running BERT/XLNet/RoBERTa on the SWAG/RACE/ARC tasks. 
+| [TensorFlow 2.0 models on GLUE](#TensorFlow-2.0-Bert-models-on-GLUE) | Examples running BERT TensorFlow 2.0 model on the GLUE tasks. 
+| [Language Model training](#language-model-training) | Fine-tuning (or training from scratch) the library models for language modeling on a text dataset. Causal language modeling for GPT/GPT-2, masked language modeling for BERT/RoBERTa. |
+| [Language Generation](#language-generation) | Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, Transformer-XL and XLNet. |
+| [GLUE](#glue) | Examples running BERT/XLM/XLNet/RoBERTa on the 9 GLUE tasks. Examples feature distributed training as well as half-precision. |
+| [SQuAD](#squad) | Using BERT/RoBERTa/XLNet/XLM for question answering, examples with distributed training. |
+| [Multiple Choice](#multiple-choice) | Examples running BERT/XLNet/RoBERTa on the SWAG/RACE/ARC tasks. |
+| [Named Entity Recognition](#named-entity-recognition) | Using BERT for Named Entity Recognition (NER) on the CoNLL 2003 dataset, examples with distributed training. |
+| [XNLI](#xnli) | Examples running BERT/XLM on the XNLI benchmark. |
+| [Adversarial evaluation of model performances](#adversarial-evaluation-of-model-performances) | Testing a model with adversarial evaluation of natural language inference on the Heuristic Analysis for NLI Systems (HANS) dataset (McCoy et al., 2019.) |

-## Language model fine-tuning
+## TensorFlow 2.0 Bert models on GLUE

-Based on the script [`run_lm_finetuning.py`](https://github.com/huggingface/transformers/blob/master/examples/run_lm_finetuning.py).
+Based on the script [`run_tf_glue.py`](https://github.com/huggingface/transformers/blob/master/examples/run_tf_glue.py).

-Fine-tuning the library models for language modeling on a text dataset for GPT, GPT-2, BERT and RoBERTa (DistilBERT 
+Fine-tuning the library TensorFlow 2.0 Bert model for sequence classification on the  MRPC task of the GLUE benchmark: [General Language Understanding Evaluation](https://gluebenchmark.com/).
+
+This script has an option for mixed precision (Automatic Mixed Precision / AMP) to run models on Tensor Cores (NVIDIA Volta/Turing GPUs) and future hardware and an option for XLA, which uses the XLA compiler to reduce model runtime.
+Options are toggled using `USE_XLA` or `USE_AMP` variables in the script.
+These options and the below benchmark are provided by @tlkh.
+
+Quick benchmarks from the script (no other modifications):
+
+| GPU    | Mode | Time (2nd epoch) | Val Acc (3 runs) |
+| --------- | -------- | ----------------------- | ----------------------|
+| Titan V | FP32 | 41s | 0.8438/0.8281/0.8333 |
+| Titan V | AMP | 26s | 0.8281/0.8568/0.8411 |
+| V100    | FP32 | 35s | 0.8646/0.8359/0.8464 |
+| V100    | AMP | 22s | 0.8646/0.8385/0.8411 |
+| 1080 Ti | FP32 | 55s | - |
+
+Mixed precision (AMP) reduces the training time considerably for the same hardware and hyper-parameters (same batch size was used).
+
+## Language model training
+
+Based on the script [`run_language_modeling.py`](https://github.com/huggingface/transformers/blob/master/examples/run_language_modeling.py).
+
+Fine-tuning (or training from scratch) the library models for language modeling on a text dataset for GPT, GPT-2, BERT and RoBERTa (DistilBERT 
 to be added soon). GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa 
 are fine-tuned using a masked language modeling (MLM) loss.

 Before running the following example, you should get a file that contains text on which the language model will be
-fine-tuned. A good example of such text is the [WikiText-2 dataset](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/).
+trained or fine-tuned. A good example of such text is the [WikiText-2 dataset](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/).

 We will refer to two different files: `$TRAIN_FILE`, which contains text for training, and `$TEST_FILE`, which contains
 text that will be used for evaluation.
@@ -34,7 +71,7 @@ the tokenization). The loss here is that of causal language modeling.
 export TRAIN_FILE=/path/to/dataset/wiki.train.raw
 export TEST_FILE=/path/to/dataset/wiki.test.raw

-python run_lm_finetuning.py \
+python run_language_modeling.py \
    --output_dir=output \
    --model_type=gpt2 \
    --model_name_or_path=gpt2 \
@@ -62,7 +99,7 @@ We use the `--mlm` flag so that the script may change its loss function.
 export TRAIN_FILE=/path/to/dataset/wiki.train.raw
 export TEST_FILE=/path/to/dataset/wiki.test.raw

-python run_lm_finetuning.py \
+python run_language_modeling.py \
    --output_dir=output \
    --model_type=roberta \
    --model_name_or_path=roberta-base \
@@ -77,7 +114,7 @@ python run_lm_finetuning.py \

 Based on the script [`run_generation.py`](https://github.com/huggingface/transformers/blob/master/examples/run_generation.py).

-Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, Transformer-XL and XLNet.
+Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, Transformer-XL, XLNet, CTRL.
 A similar script is used for our official demo [Write With Transfomer](https://transformer.huggingface.co), where you
 can try out the different models available in the library.

@@ -97,26 +134,26 @@ Fine-tuning the library models for sequence classification on the GLUE benchmark
 Evaluation](https://gluebenchmark.com/). This script can fine-tune the following models: BERT, XLM, XLNet and RoBERTa. 

 GLUE is made up of a total of 9 different tasks. We get the following results on the dev set of the benchmark with an
-uncased  BERT base model (the checkpoint `bert-base-uncased`). All experiments ran on 8 V100 GPUs with a total train
-batch size of 24. Some of these tasks have a small dataset and training can lead to high variance in the results
+uncased  BERT base model (the checkpoint `bert-base-uncased`). All experiments ran single V100 GPUs with a total train
+batch sizes between 16 and 64. Some of these tasks have a small dataset and training can lead to high variance in the results
 between different runs. We report the median on 5 runs (with different seeds) for each of the metrics.

 | Task  | Metric                       | Result      |
 |-------|------------------------------|-------------|
-| CoLA  | Matthew's corr               | 48.87       |
-| SST-2 | Accuracy                     | 91.74       |
-| MRPC  | F1/Accuracy                  | 90.70/86.27 |
-| STS-B | Person/Spearman corr.        | 91.39/91.04 |
-| QQP   | Accuracy/F1                  | 90.79/87.66 |
-| MNLI  | Matched acc./Mismatched acc. | 83.70/84.83 |
-| QNLI  | Accuracy                     | 89.31       |
-| RTE   | Accuracy                     | 71.43       |
-| WNLI  | Accuracy                     | 43.66       |
+| CoLA  | Matthew's corr               | 49.23       |
+| SST-2 | Accuracy                     | 91.97       |
+| MRPC  | F1/Accuracy                  | 89.47/85.29 |
+| STS-B | Person/Spearman corr.        | 83.95/83.70 |
+| QQP   | Accuracy/F1                  | 88.40/84.31 |
+| MNLI  | Matched acc./Mismatched acc. | 80.61/81.08 |
+| QNLI  | Accuracy                     | 87.46       |
+| RTE   | Accuracy                     | 61.73       |
+| WNLI  | Accuracy                     | 45.07       |

 Some of these results are significantly different from the ones reported on the test set
 of GLUE benchmark on the website. For QQP and WNLI, please refer to [FAQ #12](https://gluebenchmark.com/faq) on the webite.

-Before running anyone of these GLUE tasks you should download the
+Before running any one of these GLUE tasks you should download the
 [GLUE data](https://gluebenchmark.com/tasks) by running
 [this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e)
 and unpack it to some directory `$GLUE_DIR`.
@@ -158,7 +195,7 @@ since the data processor for each task inherits from the base class DataProcesso
 The following examples fine-tune BERT on the Microsoft Research Paraphrase Corpus (MRPC) corpus and runs in less 
 than 10 minutes on a single K-80 and in 27 seconds (!) on single tesla V100 16GB with apex installed.

-Before running anyone of these GLUE tasks you should download the
+Before running any one of these GLUE tasks you should download the
 [GLUE data](https://gluebenchmark.com/tasks) by running
 [this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e)
 and unpack it to some directory `$GLUE_DIR`.
@@ -321,9 +358,9 @@ eval_loss = 0.44457291918821606

 Based on the script [`run_squad.py`](https://github.com/huggingface/transformers/blob/master/examples/run_squad.py).

-#### Fine-tuning on SQuAD
+#### Fine-tuning BERT on SQuAD1.0

-This example code fine-tunes BERT on the SQuAD dataset. It runs in 24 min (with BERT-base) or 68 min (with BERT-large) 
+This example code fine-tunes BERT on the SQuAD1.0 dataset. It runs in 24 min (with BERT-base) or 68 min (with BERT-large) 
 on a single tesla V100 16GB. The data for SQuAD can be downloaded with the following links and should be saved in a 
 $SQUAD_DIR directory.

@@ -331,6 +368,12 @@ $SQUAD_DIR directory.
 * [dev-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json)
 * [evaluate-v1.1.py](https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py)

+And for SQuAD2.0, you need to download:
+
+- [train-v2.0.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json)
+- [dev-v2.0.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json)
+- [evaluate-v2.0.py](https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/)
+
 ```bash
 export SQUAD_DIR=/path/to/SQUAD

@@ -360,12 +403,12 @@ exact_match = 81.22
 #### Distributed training


-Here is an example using distributed training on 8 V100 GPUs and Bert Whole Word Masking uncased model to reach a F1 > 93 on SQuAD:
+Here is an example using distributed training on 8 V100 GPUs and Bert Whole Word Masking uncased model to reach a F1 > 93 on SQuAD1.1:

 ```bash
-python -m torch.distributed.launch --nproc_per_node=8 run_squad.py \
+python -m torch.distributed.launch --nproc_per_node=8 ./examples/run_squad.py \
    --model_type bert \
-    --model_name_or_path bert-base-cased \
+    --model_name_or_path bert-large-uncased-whole-word-masking \
    --do_train \
    --do_eval \
    --do_lower_case \
@@ -375,9 +418,9 @@ python -m torch.distributed.launch --nproc_per_node=8 run_squad.py \
    --num_train_epochs 2 \
    --max_seq_length 384 \
    --doc_stride 128 \
-    --output_dir ../models/wwm_uncased_finetuned_squad/ \
-    --per_gpu_train_batch_size 24 \
-    --gradient_accumulation_steps 12
+    --output_dir ./examples/models/wwm_uncased_finetuned_squad/ \
+    --per_gpu_eval_batch_size=3   \
+    --per_gpu_train_batch_size=3   \
 ```

 Training with the previously defined hyper-parameters yields the following results:
@@ -387,6 +430,371 @@ f1 = 93.15
 exact_match = 86.91
 ```

-This fine-tuneds model is available as a checkpoint under the reference
+This fine-tuned model is available as a checkpoint under the reference
 `bert-large-uncased-whole-word-masking-finetuned-squad`.

+#### Fine-tuning XLNet on SQuAD
+
+This example code fine-tunes XLNet on both SQuAD1.0 and SQuAD2.0 dataset. See above to download the data for SQuAD .
+
+##### Command for SQuAD1.0:
+
+```bash
+export SQUAD_DIR=/path/to/SQUAD
+
+python /data/home/hlu/transformers/examples/run_squad.py \
+    --model_type xlnet \
+    --model_name_or_path xlnet-large-cased \
+    --do_train \
+    --do_eval \
+    --do_lower_case \
+    --train_file /data/home/hlu/notebooks/NLP/examples/question_answering/train-v1.1.json \
+    --predict_file /data/home/hlu/notebooks/NLP/examples/question_answering/dev-v1.1.json \
+    --learning_rate 3e-5 \
+    --num_train_epochs 2 \
+    --max_seq_length 384 \
+    --doc_stride 128 \
+    --output_dir ./wwm_cased_finetuned_squad/ \
+    --per_gpu_eval_batch_size=4  \
+    --per_gpu_train_batch_size=4   \
+    --save_steps 5000
+```
+
+##### Command for SQuAD2.0:
+
+```bash
+export SQUAD_DIR=/path/to/SQUAD
+
+python run_squad.py \
+    --model_type xlnet \
+    --model_name_or_path xlnet-large-cased \
+    --do_train \
+    --do_eval \
+    --version_2_with_negative \
+    --train_file $SQUAD_DIR/train-v2.0.json \
+    --predict_file $SQUAD_DIR/dev-v2.0.json \
+    --learning_rate 3e-5 \
+    --num_train_epochs 4 \
+    --max_seq_length 384 \
+    --doc_stride 128 \
+    --output_dir ./wwm_cased_finetuned_squad/ \
+    --per_gpu_eval_batch_size=2  \
+    --per_gpu_train_batch_size=2   \
+    --save_steps 5000
+```
+
+Larger batch size may improve the performance while costing more memory.
+
+##### Results for SQuAD1.0 with the previously defined hyper-parameters:
+
+```python
+{
+"exact": 85.45884578997162,
+"f1": 92.5974600601065,
+"total": 10570,
+"HasAns_exact": 85.45884578997162,
+"HasAns_f1": 92.59746006010651,
+"HasAns_total": 10570
+}
+```
+
+##### Results for SQuAD2.0 with the previously defined hyper-parameters:
+
+```python
+{
+"exact": 80.4177545691906,
+"f1": 84.07154997729623,
+"total": 11873,
+"HasAns_exact": 76.73751686909581,
+"HasAns_f1": 84.05558584352873,
+"HasAns_total": 5928,
+"NoAns_exact": 84.0874684608915,
+"NoAns_f1": 84.0874684608915,
+"NoAns_total": 5945
+}
+```
+
+
+
+## Named Entity Recognition
+
+Based on the scripts [`run_ner.py`](https://github.com/huggingface/transformers/blob/master/examples/run_ner.py) for Pytorch and
+[`run_tf_ner.py`](https://github.com/huggingface/transformers/blob/master/examples/run_tf_ner.py) for Tensorflow 2.
+This example fine-tune Bert Multilingual on GermEval 2014 (German NER).
+Details and results for the fine-tuning provided by @stefan-it.
+
+### Data (Download and pre-processing steps)
+
+Data can be obtained from the [GermEval 2014](https://sites.google.com/site/germeval2014ner/data) shared task page.
+
+Here are the commands for downloading and pre-processing train, dev and test datasets. The original data format has four (tab-separated) columns, in a pre-processing step only the two relevant columns (token and outer span NER annotation) are extracted:
+
+```bash
+curl -L 'https://sites.google.com/site/germeval2014ner/data/NER-de-train.tsv?attredirects=0&d=1' \
+| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > train.txt.tmp
+curl -L 'https://sites.google.com/site/germeval2014ner/data/NER-de-dev.tsv?attredirects=0&d=1' \
+| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > dev.txt.tmp
+curl -L 'https://sites.google.com/site/germeval2014ner/data/NER-de-test.tsv?attredirects=0&d=1' \
+| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > test.txt.tmp
+```
+
+The GermEval 2014 dataset contains some strange "control character" tokens like `'\x96', '\u200e', '\x95', '\xad' or '\x80'`. One problem with these tokens is, that `BertTokenizer` returns an empty token for them, resulting in misaligned `InputExample`s. I wrote a script that a) filters these tokens and b) splits longer sentences into smaller ones (once the max. subtoken length is reached).
+
+```bash
+wget "https://raw.githubusercontent.com/stefan-it/fine-tuned-berts-seq/master/scripts/preprocess.py"
+```
+Let's define some variables that we need for further pre-processing steps and training the model:
+
+```bash
+export MAX_LENGTH=128
+export BERT_MODEL=bert-base-multilingual-cased
+```
+
+Run the pre-processing script on training, dev and test datasets:
+
+```bash
+python3 preprocess.py train.txt.tmp $BERT_MODEL $MAX_LENGTH > train.txt
+python3 preprocess.py dev.txt.tmp $BERT_MODEL $MAX_LENGTH > dev.txt
+python3 preprocess.py test.txt.tmp $BERT_MODEL $MAX_LENGTH > test.txt
+```
+
+The GermEval 2014 dataset has much more labels than CoNLL-2002/2003 datasets, so an own set of labels must be used:
+
+```bash
+cat train.txt dev.txt test.txt | cut -d " " -f 2 | grep -v "^$"| sort | uniq > labels.txt
+```
+
+### Prepare the run
+
+Additional environment variables must be set:
+
+```bash
+export OUTPUT_DIR=germeval-model
+export BATCH_SIZE=32
+export NUM_EPOCHS=3
+export SAVE_STEPS=750
+export SEED=1
+```
+
+### Run the Pytorch version
+
+To start training, just run:
+
+```bash
+python3 run_ner.py --data_dir ./ \
+--model_type bert \
+--labels ./labels.txt \
+--model_name_or_path $BERT_MODEL \
+--output_dir $OUTPUT_DIR \
+--max_seq_length  $MAX_LENGTH \
+--num_train_epochs $NUM_EPOCHS \
+--per_gpu_train_batch_size $BATCH_SIZE \
+--save_steps $SAVE_STEPS \
+--seed $SEED \
+--do_train \
+--do_eval \
+--do_predict
+```
+
+If your GPU supports half-precision training, just add the `--fp16` flag. After training, the model will be both evaluated on development and test datasets.
+
+#### Evaluation
+
+Evaluation on development dataset outputs the following for our example:
+
+```bash
+10/04/2019 00:42:06 - INFO - __main__ -   ***** Eval results  *****
+10/04/2019 00:42:06 - INFO - __main__ -     f1 = 0.8623348017621146
+10/04/2019 00:42:06 - INFO - __main__ -     loss = 0.07183869666975543
+10/04/2019 00:42:06 - INFO - __main__ -     precision = 0.8467916366258111
+10/04/2019 00:42:06 - INFO - __main__ -     recall = 0.8784592370979806
+```
+
+On the test dataset the following results could be achieved:
+
+```bash
+10/04/2019 00:42:42 - INFO - __main__ -   ***** Eval results  *****
+10/04/2019 00:42:42 - INFO - __main__ -     f1 = 0.8614389652384803
+10/04/2019 00:42:42 - INFO - __main__ -     loss = 0.07064602487454782
+10/04/2019 00:42:42 - INFO - __main__ -     precision = 0.8604651162790697
+10/04/2019 00:42:42 - INFO - __main__ -     recall = 0.8624150210424085
+```
+
+#### Comparing BERT (large, cased), RoBERTa (large, cased) and DistilBERT (base, uncased)
+
+Here is a small comparison between BERT (large, cased), RoBERTa (large, cased) and DistilBERT (base, uncased) with the same hyperparameters as specified in the [example documentation](https://huggingface.co/transformers/examples.html#named-entity-recognition) (one run):
+
+| Model | F-Score Dev | F-Score Test
+| --------------------------------- | ------- | --------
+| `bert-large-cased`            | 95.59 | 91.70
+| `roberta-large`                  | 95.96 | 91.87
+| `distilbert-base-uncased` | 94.34 | 90.32
+
+### Run the Tensorflow 2 version
+
+To start training, just run:
+
+```bash
+python3 run_tf_ner.py --data_dir ./ \
+--model_type bert \
+--labels ./labels.txt \
+--model_name_or_path $BERT_MODEL \
+--output_dir $OUTPUT_DIR \
+--max_seq_length  $MAX_LENGTH \
+--num_train_epochs $NUM_EPOCHS \
+--per_device_train_batch_size $BATCH_SIZE \
+--save_steps $SAVE_STEPS \
+--seed $SEED \
+--do_train \
+--do_eval \
+--do_predict
+```
+
+Such as the Pytorch version, if your GPU supports half-precision training, just add the `--fp16` flag. After training, the model will be both evaluated on development and test datasets.
+
+#### Evaluation
+
+Evaluation on development dataset outputs the following for our example:
+```bash
+           precision    recall  f1-score   support
+
+ LOCderiv     0.7619    0.6154    0.6809        52
+  PERpart     0.8724    0.8997    0.8858      4057
+  OTHpart     0.9360    0.9466    0.9413       711
+  ORGpart     0.7015    0.6989    0.7002       269
+  LOCpart     0.7668    0.8488    0.8057       496
+      LOC     0.8745    0.9191    0.8963       235
+ ORGderiv     0.7723    0.8571    0.8125        91
+ OTHderiv     0.4800    0.6667    0.5581        18
+      OTH     0.5789    0.6875    0.6286        16
+ PERderiv     0.5385    0.3889    0.4516        18
+      PER     0.5000    0.5000    0.5000         2
+      ORG     0.0000    0.0000    0.0000         3
+
+micro avg     0.8574    0.8862    0.8715      5968
+macro avg     0.8575    0.8862    0.8713      5968
+```
+
+On the test dataset the following results could be achieved:
+```bash
+           precision    recall  f1-score   support
+
+  PERpart     0.8847    0.8944    0.8896      9397
+  OTHpart     0.9376    0.9353    0.9365      1639
+  ORGpart     0.7307    0.7044    0.7173       697
+      LOC     0.9133    0.9394    0.9262       561
+  LOCpart     0.8058    0.8157    0.8107      1150
+      ORG     0.0000    0.0000    0.0000         8
+ OTHderiv     0.5882    0.4762    0.5263        42
+ PERderiv     0.6571    0.5227    0.5823        44
+      OTH     0.4906    0.6667    0.5652        39
+ ORGderiv     0.7016    0.7791    0.7383       172
+ LOCderiv     0.8256    0.6514    0.7282       109
+      PER     0.0000    0.0000    0.0000        11
+
+micro avg     0.8722    0.8774    0.8748     13869
+macro avg     0.8712    0.8774    0.8740     13869
+```
+
+## XNLI
+
+Based on the script [`run_xnli.py`](https://github.com/huggingface/transformers/blob/master/examples/run_xnli.py).
+
+[XNLI](https://www.nyu.edu/projects/bowman/xnli/) is crowd-sourced dataset based on [MultiNLI](http://www.nyu.edu/projects/bowman/multinli/). It is an evaluation benchmark for cross-lingual text representations. Pairs of text are labeled with textual entailment annotations for 15 different languages (including both high-resource language such as English and low-resource languages such as Swahili).
+
+#### Fine-tuning on XNLI
+
+This example code fine-tunes mBERT (multi-lingual BERT) on the XNLI dataset. It runs in 106 mins
+on a single tesla V100 16GB. The data for XNLI can be downloaded with the following links and should be both saved (and un-zipped) in a 
+`$XNLI_DIR` directory.
+
+* [XNLI 1.0](https://www.nyu.edu/projects/bowman/xnli/XNLI-1.0.zip)
+* [XNLI-MT 1.0](https://www.nyu.edu/projects/bowman/xnli/XNLI-MT-1.0.zip)
+
+```bash
+export XNLI_DIR=/path/to/XNLI
+
+python run_xnli.py \
+  --model_type bert \
+  --model_name_or_path bert-base-multilingual-cased \
+  --language de \
+  --train_language en \
+  --do_train \
+  --do_eval \
+  --data_dir $XNLI_DIR \
+  --per_gpu_train_batch_size 32 \
+  --learning_rate 5e-5 \
+  --num_train_epochs 2.0 \
+  --max_seq_length 128 \
+  --output_dir /tmp/debug_xnli/ \
+  --save_steps -1
+```
+
+Training with the previously defined hyper-parameters yields the following results on the **test** set:
+
+```bash
+acc = 0.7093812375249501
+```
+
+## MM-IMDb
+
+Based on the script [`run_mmimdb.py`](https://github.com/huggingface/transformers/blob/master/examples/mm-imdb/run_mmimdb.py).
+
+[MM-IMDb](http://lisi1.unal.edu.co/mmimdb/) is a Multimodal dataset with around 26,000 movies including images, plots and other metadata.
+
+### Training on MM-IMDb
+
+```
+python run_mmimdb.py \
+    --data_dir /path/to/mmimdb/dataset/ \
+    --model_type bert \
+    --model_name_or_path bert-base-uncased \
+    --output_dir /path/to/save/dir/ \
+    --do_train \
+    --do_eval \
+    --max_seq_len 512 \
+    --gradient_accumulation_steps 20 \
+    --num_image_embeds 3 \
+    --num_train_epochs 100 \
+    --patience 5
+```
+
+## Adversarial evaluation of model performances
+
+Here is an example on evaluating a model using adversarial evaluation of natural language inference with the Heuristic Analysis for NLI Systems (HANS) dataset [McCoy et al., 2019](https://arxiv.org/abs/1902.01007). The example was gracefully provided by [Nafise Sadat Moosavi](https://github.com/ns-moosavi).
+
+The HANS dataset can be downloaded from [this location](https://github.com/tommccoy1/hans).
+
+This is an example of using test_hans.py:
+
+```bash
+export HANS_DIR=path-to-hans
+export MODEL_TYPE=type-of-the-model-e.g.-bert-roberta-xlnet-etc
+export MODEL_PATH=path-to-the-model-directory-that-is-trained-on-NLI-e.g.-by-using-run_glue.py
+
+python examples/hans/test_hans.py \
+        --task_name hans \
+        --model_type $MODEL_TYPE \
+        --do_eval \
+        --do_lower_case \
+        --data_dir $HANS_DIR \
+        --model_name_or_path $MODEL_PATH \
+        --max_seq_length 128 \
+        --output_dir $MODEL_PATH \
+```
+
+This will create the hans_predictions.txt file in MODEL_PATH, which can then be evaluated using hans/evaluate_heur_output.py from the HANS dataset.
+
+The results of the BERT-base model that is trained on MNLI using batch size 8 and the random seed 42 on the HANS dataset is as follows:
+
+```bash
+Heuristic entailed results:
+lexical_overlap: 0.9702
+subsequence: 0.9942
+constituent: 0.9962
+
+Heuristic non-entailed results:
+lexical_overlap: 0.199
+subsequence: 0.0396
+constituent: 0.118
+```
--- a/examples/benchmarks.py
+++ b/examples/benchmarks.py
@@ -0,0 +1,531 @@
+# coding=utf-8
+# Copyright 2018 The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Benchmarking the library on inference and training """
+
+# If checking the tensors placement
+# tf.debugging.set_log_device_placement(True)
+
+import argparse
+import csv
+import timeit
+from time import time
+from typing import List
+
+from transformers import AutoConfig, AutoTokenizer, is_tf_available, is_torch_available
+
+
+if is_tf_available():
+    import tensorflow as tf
+    from transformers import TFAutoModel
+
+if is_torch_available():
+    import torch
+    from transformers import AutoModel
+
+
+input_text = """Bent over their instruments, three hundred Fertilizers were plunged, as
+the Director of Hatcheries and Conditioning entered the room, in the
+
+
+
+scarcely breathing silence, the absent-minded, soliloquizing hum or
+whistle, of absorbed concentration. A troop of newly arrived students,
+very young, pink and callow, followed nervously, rather abjectly, at the
+Director's heels. Each of them carried a notebook, in which, whenever
+the great man spoke, he desperately scribbled. Straight from the
+horse's mouth. It was a rare privilege. The D. H. C. for Central London
+always made a point of personally conducting his new students round
+the various departments.
+
+"Just to give you a general idea," he would explain to them. For of
+course some sort of general idea they must have, if they were to do
+their work intelligently-though as little of one, if they were to be good
+and happy members of society, as possible. For particulars, as every
+one knows, make for virtue and happiness; generalities are intellectu-
+ally necessary evils. Not philosophers but fret-sawyers and stamp col-
+lectors compose the backbone of society.
+
+"To-morrow," he would add, smiling at them with a slightly menacing
+geniality, "you'll be settling down to serious work. You won't have time
+for generalities. Meanwhile ..."
+
+Meanwhile, it was a privilege. Straight from the horse's mouth into the
+notebook. The boys scribbled like mad.
+
+Tall and rather thin but upright, the Director advanced into the room.
+He had a long chin and big rather prominent teeth, just covered, when
+he was not talking, by his full, floridly curved lips. Old, young? Thirty?
+Fifty? Fifty-five? It was hard to say. And anyhow the question didn't
+arise; in this year of stability, A. F. 632, it didn't occur to you to ask it.
+
+"I shall begin at the beginning," said the D.H.C. and the more zealous
+students recorded his intention in their notebooks: Begin at the begin-
+ning. "These," he waved his hand, "are the incubators." And opening
+an insulated door he showed them racks upon racks of numbered test-
+tubes. "The week's supply of ova. Kept," he explained, "at blood heat;
+whereas the male gametes," and here he opened another door, "they
+have to be kept at thirty-five instead of thirty-seven. Full blood heat
+sterilizes." Rams wrapped in theremogene beget no lambs.
+
+Still leaning against the incubators he gave them, while the pencils
+scurried illegibly across the pages, a brief description of the modern
+
+
+
+fertilizing process; spoke first, of course, of its surgical introduc-
+tion-"the operation undergone voluntarily for the good of Society, not
+to mention the fact that it carries a bonus amounting to six months'
+salary"; continued with some account of the technique for preserving
+the excised ovary alive and actively developing; passed on to a consid-
+eration of optimum temperature, salinity, viscosity; referred to the liq-
+uor in which the detached and ripened eggs were kept; and, leading
+his charges to the work tables, actually showed them how this liquor
+was drawn off from the test-tubes; how it was let out drop by drop
+onto the specially warmed slides of the microscopes; how the eggs
+which it contained were inspected for abnormalities, counted and
+transferred to a porous receptacle; how (and he now took them to
+watch the operation) this receptacle was immersed in a warm bouillon
+containing free-swimming spermatozoa-at a minimum concentration
+of one hundred thousand per cubic centimetre, he insisted; and how,
+after ten minutes, the container was lifted out of the liquor and its
+contents re-examined; how, if any of the eggs remained unfertilized, it
+was again immersed, and, if necessary, yet again; how the fertilized
+ova went back to the incubators; where the Alphas and Betas re-
+mained until definitely bottled; while the Gammas, Deltas and Epsilons
+were brought out again, after only thirty-six hours, to undergo Bo-
+kanovsky's Process.
+
+"Bokanovsky's Process," repeated the Director, and the students un-
+derlined the words in their little notebooks.
+
+One egg, one embryo, one adult-normality. But a bokanovskified egg
+will bud, will proliferate, will divide. From eight to ninety-six buds, and
+every bud will grow into a perfectly formed embryo, and every embryo
+into a full-sized adult. Making ninety-six human beings grow where
+only one grew before. Progress.
+
+"Essentially," the D.H.C. concluded, "bokanovskification consists of a
+series of arrests of development. We check the normal growth and,
+paradoxically enough, the egg responds by budding."
+
+Responds by budding. The pencils were busy.
+
+He pointed. On a very slowly moving band a rack-full of test-tubes was
+entering a large metal box, another, rack-full was emerging. Machinery
+faintly purred. It took eight minutes for the tubes to go through, he
+
+
+
+told them. Eight minutes of hard X-rays being about as much as an
+egg can stand. A few died; of the rest, the least susceptible divided
+into two; most put out four buds; some eight; all were returned to the
+incubators, where the buds began to develop; then, after two days,
+were suddenly chilled, chilled and checked. Two, four, eight, the buds
+in their turn budded; and having budded were dosed almost to death
+with alcohol; consequently burgeoned again and having budded-bud
+out of bud out of bud-were thereafter-further arrest being generally
+fatal-left to develop in peace. By which time the original egg was in a
+fair way to becoming anything from eight to ninety-six embryos- a
+prodigious improvement, you will agree, on nature. Identical twins-but
+not in piddling twos and threes as in the old viviparous days, when an
+egg would sometimes accidentally divide; actually by dozens, by
+scores at a time.
+
+"Scores," the Director repeated and flung out his arms, as though he
+were distributing largesse. "Scores."
+
+But one of the students was fool enough to ask where the advantage
+lay.
+
+"My good boy!" The Director wheeled sharply round on him. "Can't you
+see? Can't you see?" He raised a hand; his expression was solemn.
+"Bokanovsky's Process is one of the major instruments of social stabil-
+ity!"
+
+Major instruments of social stability.
+
+Standard men and women; in uniform batches. The whole of a small
+factory staffed with the products of a single bokanovskified egg.
+
+"Ninety-six identical twins working ninety-six identical machines!" The
+voice was almost tremulous with enthusiasm. "You really know where
+you are. For the first time in history." He quoted the planetary motto.
+"Community, Identity, Stability." Grand words. "If we could bo-
+kanovskify indefinitely the whole problem would be solved."
+
+Solved by standard Gammas, unvarying Deltas, uniform Epsilons. Mil-
+lions of identical twins. The principle of mass production at last applied
+to biology.
+
+
+
+"But, alas," the Director shook his head, "we can't bokanovskify indefi-
+nitely."
+
+Ninety-six seemed to be the limit; seventy-two a good average. From
+the same ovary and with gametes of the same male to manufacture as
+many batches of identical twins as possible-that was the best (sadly a
+second best) that they could do. And even that was difficult.
+
+"For in nature it takes thirty years for two hundred eggs to reach ma-
+turity. But our business is to stabilize the population at this moment,
+here and now. Dribbling out twins over a quarter of a century-what
+would be the use of that?"
+
+Obviously, no use at all. But Podsnap's Technique had immensely ac-
+celerated the process of ripening. They could make sure of at least a
+hundred and fifty mature eggs within two years. Fertilize and bo-
+kanovskify-in other words, multiply by seventy-two-and you get an
+average of nearly eleven thousand brothers and sisters in a hundred
+and fifty batches of identical twins, all within two years of the same
+age.
+
+"And in exceptional cases we can make one ovary yield us over fifteen
+thousand adult individuals."
+
+Beckoning to a fair-haired, ruddy young man who happened to be
+passing at the moment. "Mr. Foster," he called. The ruddy young man
+approached. "Can you tell us the record for a single ovary, Mr. Foster?"
+
+"Sixteen thousand and twelve in this Centre," Mr. Foster replied with-
+out hesitation. He spoke very quickly, had a vivacious blue eye, and
+took an evident pleasure in quoting figures. "Sixteen thousand and
+twelve; in one hundred and eighty-nine batches of identicals. But of
+course they've done much better," he rattled on, "in some of the tropi-
+cal Centres. Singapore has often produced over sixteen thousand five
+hundred; and Mombasa has actually touched the seventeen thousand
+mark. But then they have unfair advantages. You should see the way a
+negro ovary responds to pituitary! It's quite astonishing, when you're
+used to working with European material. Still," he added, with a laugh
+(but the light of combat was in his eyes and the lift of his chin was
+challenging), "still, we mean to beat them if we can. I'm working on a
+wonderful Delta-Minus ovary at this moment. Only just eighteen
+
+
+
+months old. Over twelve thousand seven hundred children already, ei-
+ther decanted or in embryo. And still going strong. We'll beat them
+yet."
+
+"That's the spirit I like!" cried the Director, and clapped Mr. Foster on
+the shoulder. "Come along with us, and give these boys the benefit of
+your expert knowledge."
+
+Mr. Foster smiled modestly. "With pleasure." They went.
+In the Bottling Room all was harmonious bustle and ordered activity.
+Flaps of fresh sow's peritoneum ready cut to the proper size came
+shooting up in little lifts from the Organ Store in the sub-basement.
+Whizz and then, click! the lift-hatches hew open; the bottle-liner had
+only to reach out a hand, take the flap, insert, smooth-down, and be-
+fore the lined bottle had had time to travel out of reach along the end-
+less band, whizz, click! another flap of peritoneum had shot up from
+the depths, ready to be slipped into yet another bottle, the next of that
+slow interminable procession on the band.
+
+Next to the Liners stood the Matriculators. The procession advanced;
+one by one the eggs were transferred from their test-tubes to the
+larger containers; deftly the peritoneal lining was slit, the morula
+dropped into place, the saline solution poured in ... and already the
+bottle had passed, and it was the turn of the labellers. Heredity, date
+of fertilization, membership of Bokanovsky Group-details were trans-
+ferred from test-tube to bottle. No longer anonymous, but named,
+identified, the procession marched slowly on; on through an opening in
+the wall, slowly on into the Social Predestination Room.
+"Eighty-eight cubic metres of card-index," said Mr. Foster with relish,
+as they entered."""
+
+
+def create_setup_and_compute(
+    model_names: List[str],
+    gpu: bool = True,
+    tensorflow: bool = False,
+    average_over: int = 3,
+    torchscript: bool = False,
+    xla: bool = False,
+    amp: bool = False,
+    fp16: bool = False,
+    save_to_csv: bool = False,
+    csv_filename: str = f"results_{round(time())}.csv",
+):
+    if xla:
+        tf.config.optimizer.set_jit(True)
+    if amp:
+        tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})
+
+    if tensorflow:
+        dictionary = {model_name: {} for model_name in model_names}
+        results = _compute_tensorflow(model_names, dictionary, average_over, amp)
+    else:
+        device = "cuda" if (gpu and torch.cuda.is_available()) else "cpu"
+        dictionary = {model_name: {} for model_name in model_names}
+        results = _compute_pytorch(model_names, dictionary, average_over, device, torchscript, fp16)
+
+    print("=========== RESULTS ===========")
+    for model_name in model_names:
+        print("\t" + f"======= MODEL CHECKPOINT: {model_name} =======")
+        for batch_size in results[model_name]["bs"]:
+            print("\t\t" + f"===== BATCH SIZE: {batch_size} =====")
+            for slice_size in results[model_name]["ss"]:
+                result = results[model_name]["results"][batch_size][slice_size]
+                if isinstance(result, str):
+                    print(f"\t\t{model_name}/{batch_size}/{slice_size}: " f"{result}")
+                else:
+                    print(f"\t\t{model_name}/{batch_size}/{slice_size}: " f"{(round(1000 * result) / 1000)}" f"s")
+
+    if save_to_csv:
+        with open(csv_filename, mode="w") as csv_file:
+            fieldnames = [
+                "model",
+                "1x8",
+                "1x64",
+                "1x128",
+                "1x256",
+                "1x512",
+                "1x1024",
+                "2x8",
+                "2x64",
+                "2x128",
+                "2x256",
+                "2x512",
+                "2x1024",
+                "4x8",
+                "4x64",
+                "4x128",
+                "4x256",
+                "4x512",
+                "4x1024",
+                "8x8",
+                "8x64",
+                "8x128",
+                "8x256",
+                "8x512",
+                "8x1024",
+            ]
+
+            writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
+            writer.writeheader()
+
+            for model_name in model_names:
+                model_results = {
+                    f"{bs}x{ss}": results[model_name]["results"][bs][ss]
+                    for bs in results[model_name]["results"]
+                    for ss in results[model_name]["results"][bs]
+                }
+                writer.writerow({"model": model_name, **model_results})
+
+
+def _compute_pytorch(model_names, dictionary, average_over, device, torchscript, fp16):
+    for c, model_name in enumerate(model_names):
+        print(f"{c + 1} / {len(model_names)}")
+        config = AutoConfig.from_pretrained(model_name, torchscript=torchscript)
+        model = AutoModel.from_pretrained(model_name, config=config)
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+        tokenized_sequence = tokenizer.encode(input_text, add_special_tokens=False)
+
+        max_input_size = tokenizer.max_model_input_sizes[model_name]
+        batch_sizes = [1, 2, 4, 8]
+        slice_sizes = [8, 64, 128, 256, 512, 1024]
+
+        dictionary[model_name] = {"bs": batch_sizes, "ss": slice_sizes, "results": {}}
+        dictionary[model_name]["results"] = {i: {} for i in batch_sizes}
+
+        for batch_size in batch_sizes:
+            if fp16:
+                model.half()
+            model.to(device)
+            model.eval()
+            for slice_size in slice_sizes:
+                if max_input_size is not None and slice_size > max_input_size:
+                    dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
+                else:
+                    sequence = torch.tensor(tokenized_sequence[:slice_size], device=device).repeat(batch_size, 1)
+                    try:
+                        if torchscript:
+                            print("Tracing model with sequence size", sequence.shape)
+                            inference = torch.jit.trace(model, sequence)
+                            inference(sequence)
+                        else:
+                            inference = model
+                            inference(sequence)
+
+                        print("Going through model with sequence of shape", sequence.shape)
+                        runtimes = timeit.repeat(lambda: inference(sequence), repeat=average_over, number=3)
+                        average_time = sum(runtimes) / float(len(runtimes)) / 3.0
+                        dictionary[model_name]["results"][batch_size][slice_size] = average_time
+                    except RuntimeError as e:
+                        print("Doesn't fit on GPU.", e)
+                        torch.cuda.empty_cache()
+                        dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
+    return dictionary
+
+
+def _compute_tensorflow(model_names, dictionary, average_over, amp):
+    for c, model_name in enumerate(model_names):
+        print(f"{c + 1} / {len(model_names)}")
+        config = AutoConfig.from_pretrained(model_name)
+        model = TFAutoModel.from_pretrained(model_name, config=config)
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+        tokenized_sequence = tokenizer.encode(input_text, add_special_tokens=False)
+
+        max_input_size = tokenizer.max_model_input_sizes[model_name]
+        batch_sizes = [1, 2, 4, 8]
+        slice_sizes = [8, 64, 128, 256, 512, 1024]
+
+        dictionary[model_name] = {"bs": batch_sizes, "ss": slice_sizes, "results": {}}
+        dictionary[model_name]["results"] = {i: {} for i in batch_sizes}
+
+        print("Using model", model)
+
+        @tf.function
+        def inference(inputs):
+            return model(inputs)
+
+        for batch_size in batch_sizes:
+            for slice_size in slice_sizes:
+                if max_input_size is not None and slice_size > max_input_size:
+                    dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
+                else:
+                    sequence = tf.stack(
+                        [tf.squeeze(tf.constant(tokenized_sequence[:slice_size])[None, :])] * batch_size
+                    )
+
+                    try:
+                        print("Going through model with sequence of shape", sequence.shape)
+                        # To make sure that the model is traced + that the tensors are on the appropriate device
+                        inference(sequence)
+
+                        runtimes = timeit.repeat(lambda: inference(sequence), repeat=average_over, number=3)
+                        average_time = sum(runtimes) / float(len(runtimes)) / 3.0
+                        dictionary[model_name]["results"][batch_size][slice_size] = average_time
+                    except tf.errors.ResourceExhaustedError as e:
+                        print("Doesn't fit on GPU.", e)
+                        torch.cuda.empty_cache()
+                        dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
+    return dictionary
+
+
+def main():
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--models",
+        required=False,
+        type=str,
+        default="all",
+        help="Model checkpoints to be provided "
+        "to the AutoModel classes. Leave "
+        "blank to benchmark the base version "
+        "of all available model "
+        "architectures.",
+    )
+    parser.add_argument(
+        "--torch", required=False, action="store_true", help="Benchmark the Pytorch version of the " "models"
+    )
+    parser.add_argument(
+        "--torch_cuda", required=False, action="store_true", help="Pytorch only: run on available " "cuda devices"
+    )
+    parser.add_argument(
+        "--torchscript",
+        required=False,
+        action="store_true",
+        help="Pytorch only: trace the models " "using torchscript",
+    )
+    parser.add_argument(
+        "--tensorflow",
+        required=False,
+        action="store_true",
+        help="Benchmark the TensorFlow version "
+        "of the models. Will run on GPU if "
+        "the correct dependencies are "
+        "installed",
+    )
+    parser.add_argument("--xla", required=False, action="store_true", help="TensorFlow only: use XLA acceleration.")
+    parser.add_argument(
+        "--amp",
+        required=False,
+        action="store_true",
+        help="TensorFlow only: use automatic mixed precision acceleration.",
+    )
+    parser.add_argument(
+        "--fp16", required=False, action="store_true", help="PyTorch only: use FP16 to accelerate inference."
+    )
+    parser.add_argument(
+        "--keras_predict",
+        required=False,
+        action="store_true",
+        help="Whether to use model.predict " "instead of model() to do a " "forward pass.",
+    )
+    parser.add_argument("--save_to_csv", required=False, action="store_true", help="Save to a CSV file.")
+    parser.add_argument(
+        "--csv_filename", required=False, default=None, help="CSV filename used if saving results to csv."
+    )
+    parser.add_argument(
+        "--average_over", required=False, default=30, type=int, help="Times an experiment will be run."
+    )
+
+    args = parser.parse_args()
+    if args.models == "all":
+        args.models = [
+            "gpt2",
+            "bert-base-cased",
+            "xlnet-base-cased",
+            "xlm-mlm-en-2048",
+            "transfo-xl-wt103",
+            "openai-gpt",
+            "distilbert-base-uncased",
+            "distilgpt2",
+            "roberta-base",
+            "ctrl",
+        ]
+    else:
+        args.models = args.models.split()
+
+    print("Running with arguments", args)
+
+    if args.torch:
+        if is_torch_available():
+            create_setup_and_compute(
+                model_names=args.models,
+                tensorflow=False,
+                gpu=args.torch_cuda,
+                torchscript=args.torchscript,
+                fp16=args.fp16,
+                save_to_csv=args.save_to_csv,
+                csv_filename=args.csv_filename,
+                average_over=args.average_over,
+            )
+        else:
+            raise ImportError("Trying to run a PyTorch benchmark but PyTorch was not found in the environment.")
+
+    if args.tensorflow:
+        if is_tf_available():
+            create_setup_and_compute(
+                model_names=args.models,
+                tensorflow=True,
+                xla=args.xla,
+                amp=args.amp,
+                save_to_csv=args.save_to_csv,
+                csv_filename=args.csv_filename,
+                average_over=args.average_over,
+            )
+        else:
+            raise ImportError("Trying to run a TensorFlow benchmark but TensorFlow was not found in the environment.")
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/contrib/run_camembert.py
+++ b/examples/contrib/run_camembert.py
@@ -0,0 +1,43 @@
+import torch
+
+from transformers.modeling_camembert import CamembertForMaskedLM
+from transformers.tokenization_camembert import CamembertTokenizer
+
+
+def fill_mask(masked_input, model, tokenizer, topk=5):
+    # Adapted from https://github.com/pytorch/fairseq/blob/master/fairseq/models/roberta/hub_interface.py
+    assert masked_input.count("<mask>") == 1
+    input_ids = torch.tensor(tokenizer.encode(masked_input, add_special_tokens=True)).unsqueeze(0)  # Batch size 1
+    logits = model(input_ids)[0]  # The last hidden-state is the first element of the output tuple
+    masked_index = (input_ids.squeeze() == tokenizer.mask_token_id).nonzero().item()
+    logits = logits[0, masked_index, :]
+    prob = logits.softmax(dim=0)
+    values, indices = prob.topk(k=topk, dim=0)
+    topk_predicted_token_bpe = " ".join(
+        [tokenizer.convert_ids_to_tokens(indices[i].item()) for i in range(len(indices))]
+    )
+    masked_token = tokenizer.mask_token
+    topk_filled_outputs = []
+    for index, predicted_token_bpe in enumerate(topk_predicted_token_bpe.split(" ")):
+        predicted_token = predicted_token_bpe.replace("\u2581", " ")
+        if " {0}".format(masked_token) in masked_input:
+            topk_filled_outputs.append(
+                (
+                    masked_input.replace(" {0}".format(masked_token), predicted_token),
+                    values[index].item(),
+                    predicted_token,
+                )
+            )
+        else:
+            topk_filled_outputs.append(
+                (masked_input.replace(masked_token, predicted_token), values[index].item(), predicted_token,)
+            )
+    return topk_filled_outputs
+
+
+tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
+model = CamembertForMaskedLM.from_pretrained("camembert-base")
+model.eval()
+
+masked_input = "Le camembert est <mask> :)"
+print(fill_mask(masked_input, model, tokenizer, topk=3))
--- a/examples/contrib/run_openai_gpt.py
+++ b/examples/contrib/run_openai_gpt.py
@@ -22,48 +22,54 @@
          --model_name openai-gpt \
          --do_train \
          --do_eval \
-          --train_dataset $ROC_STORIES_DIR/cloze_test_val__spring2016\ -\ cloze_test_ALL_val.csv \
-          --eval_dataset $ROC_STORIES_DIR/cloze_test_test__spring2016\ -\ cloze_test_ALL_test.csv \
+          --train_dataset "$ROC_STORIES_DIR/cloze_test_val__spring2016 - cloze_test_ALL_val.csv" \
+          --eval_dataset "$ROC_STORIES_DIR/cloze_test_test__spring2016 - cloze_test_ALL_test.csv" \
          --output_dir ../log \
          --train_batch_size 16 \
 """
 import argparse
-import os
 import csv
-import random
 import logging
-from tqdm import tqdm, trange
+import os
+import random

 import numpy as np
 import torch
-from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
-                              TensorDataset)
+from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
+from tqdm import tqdm, trange

-from transformers import (OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer,
-                                     AdamW, cached_path, WEIGHTS_NAME, CONFIG_NAME,
-                                     WarmupLinearSchedule)
+from transformers import (
+    CONFIG_NAME,
+    WEIGHTS_NAME,
+    AdamW,
+    OpenAIGPTDoubleHeadsModel,
+    OpenAIGPTTokenizer,
+    get_linear_schedule_with_warmup,
+)

-ROCSTORIES_URL = "https://s3.amazonaws.com/datasets.huggingface.co/ROCStories.tar.gz"

-logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
-                    datefmt = '%m/%d/%Y %H:%M:%S',
-                    level = logging.INFO)
+logging.basicConfig(
+    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO
+)
 logger = logging.getLogger(__name__)

+
 def accuracy(out, labels):
    outputs = np.argmax(out, axis=1)
    return np.sum(outputs == labels)

+
 def load_rocstories_dataset(dataset_path):
    """ Output a list of tuples(story, 1st continuation, 2nd continuation, label) """
-    with open(dataset_path, encoding='utf_8') as f:
+    with open(dataset_path, encoding="utf_8") as f:
        f = csv.reader(f)
        output = []
-        next(f) # skip the first line
+        next(f)  # skip the first line
        for line in tqdm(f):
-            output.append((' '.join(line[1:5]), line[5], line[6], int(line[-1])-1))
+            output.append((" ".join(line[1:5]), line[5], line[6], int(line[-1]) - 1))
    return output

+
 def pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, delimiter_token, clf_token):
    """ Pre-process datasets containing lists of tuples(story, 1st continuation, 2nd continuation, label)

@@ -75,61 +81,73 @@ def pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, d
        n_batch = len(dataset)
        input_ids = np.zeros((n_batch, 2, input_len), dtype=np.int64)
        mc_token_ids = np.zeros((n_batch, 2), dtype=np.int64)
-        lm_labels = np.full((n_batch, 2, input_len), fill_value=-1, dtype=np.int64)
+        lm_labels = np.full((n_batch, 2, input_len), fill_value=-100, dtype=np.int64)
        mc_labels = np.zeros((n_batch,), dtype=np.int64)
        for i, (story, cont1, cont2, mc_label), in enumerate(dataset):
            with_cont1 = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token]
            with_cont2 = [start_token] + story[:cap_length] + [delimiter_token] + cont2[:cap_length] + [clf_token]
-            input_ids[i, 0, :len(with_cont1)] = with_cont1
-            input_ids[i, 1, :len(with_cont2)] = with_cont2
+            input_ids[i, 0, : len(with_cont1)] = with_cont1
+            input_ids[i, 1, : len(with_cont2)] = with_cont2
            mc_token_ids[i, 0] = len(with_cont1) - 1
            mc_token_ids[i, 1] = len(with_cont2) - 1
-            lm_labels[i, 0, :len(with_cont1)] = with_cont1
-            lm_labels[i, 1, :len(with_cont2)] = with_cont2
+            lm_labels[i, 0, : len(with_cont1)] = with_cont1
+            lm_labels[i, 1, : len(with_cont2)] = with_cont2
            mc_labels[i] = mc_label
        all_inputs = (input_ids, mc_token_ids, lm_labels, mc_labels)
        tensor_datasets.append(tuple(torch.tensor(t) for t in all_inputs))
    return tensor_datasets

+
 def main():
    parser = argparse.ArgumentParser()
-    parser.add_argument('--model_name', type=str, default='openai-gpt',
-                        help='pretrained model name')
-    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
-    parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.")
-    parser.add_argument("--output_dir", default=None, type=str, required=True,
-                        help="The output directory where the model predictions and checkpoints will be written.")
-    parser.add_argument('--train_dataset', type=str, default='')
-    parser.add_argument('--eval_dataset', type=str, default='')
-    parser.add_argument('--seed', type=int, default=42)
-    parser.add_argument('--num_train_epochs', type=int, default=3)
-    parser.add_argument('--train_batch_size', type=int, default=8)
-    parser.add_argument('--eval_batch_size', type=int, default=16)
-    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
-                        help="Epsilon for Adam optimizer.")
-    parser.add_argument('--max_grad_norm', type=int, default=1)
-    parser.add_argument("--max_steps", default=-1, type=int,
-                        help="If > 0: set total number of training \
-                        steps to perform. Override num_train_epochs.")
-    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
-                        help="Number of updates steps to accumulate before\
-                        performing a backward/update pass.")
-    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
-    parser.add_argument("--warmup_steps", default=0, type=int,
-                        help="Linear warmup over warmup_steps.")
-    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
-    parser.add_argument('--weight_decay', type=float, default=0.01)
-    parser.add_argument('--lm_coef', type=float, default=0.9)
-    parser.add_argument('--n_valid', type=int, default=374)
+    parser.add_argument("--model_name", type=str, default="openai-gpt", help="pretrained model name")
+    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
+    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
+    parser.add_argument(
+        "--output_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument("--train_dataset", type=str, default="")
+    parser.add_argument("--eval_dataset", type=str, default="")
+    parser.add_argument("--seed", type=int, default=42)
+    parser.add_argument("--num_train_epochs", type=int, default=3)
+    parser.add_argument("--train_batch_size", type=int, default=8)
+    parser.add_argument("--eval_batch_size", type=int, default=16)
+    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
+    parser.add_argument("--max_grad_norm", type=int, default=1)
+    parser.add_argument(
+        "--max_steps",
+        default=-1,
+        type=int,
+        help="If > 0: set total number of training \
+                        steps to perform. Override num_train_epochs.",
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before\
+                        performing a backward/update pass.",
+    )
+    parser.add_argument("--learning_rate", type=float, default=6.25e-5)
+    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
+    parser.add_argument("--lr_schedule", type=str, default="warmup_linear")
+    parser.add_argument("--weight_decay", type=float, default=0.01)
+    parser.add_argument("--lm_coef", type=float, default=0.9)
+    parser.add_argument("--n_valid", type=int, default=374)

-    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
-    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
+    parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
+    parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
    args = parser.parse_args()
    print(args)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
+
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()
@@ -152,7 +170,7 @@ def main():
    # Load tokenizer and model
    # This loading functions also add new tokens and embeddings called `special tokens`
    # These new embeddings will be fine-tuned on the RocStories dataset
-    special_tokens = ['_start_', '_delimiter_', '_classify_']
+    special_tokens = ["_start_", "_delimiter_", "_classify_"]
    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name)
    tokenizer.add_tokens(special_tokens)
    special_tokens_ids = tokenizer.convert_tokens_to_ids(special_tokens)
@@ -161,8 +179,6 @@ def main():
    model.to(device)

    # Load and encode the datasets
-    if not args.train_dataset and not args.eval_dataset:
-        roc_stories = cached_path(ROCSTORIES_URL)
    def tokenize_and_encode(obj):
        """ Tokenize and encode a nested object """
        if isinstance(obj, str):
@@ -170,6 +186,7 @@ def main():
        elif isinstance(obj, int):
            return obj
        return list(tokenize_and_encode(o) for o in obj)
+
    logger.info("Encoding dataset...")
    train_dataset = load_rocstories_dataset(args.train_dataset)
    eval_dataset = load_rocstories_dataset(args.eval_dataset)
@@ -178,8 +195,11 @@ def main():

    # Compute the max input length for the Transformer
    max_length = model.config.n_positions // 2 - 2
-    input_length = max(len(story[:max_length]) + max(len(cont1[:max_length]), len(cont2[:max_length])) + 3  \
-                           for dataset in encoded_datasets for story, cont1, cont2, _ in dataset)
+    input_length = max(
+        len(story[:max_length]) + max(len(cont1[:max_length]), len(cont2[:max_length])) + 3
+        for dataset in encoded_datasets
+        for story, cont1, cont2, _ in dataset
+    )
    input_length = min(input_length, model.config.n_positions)  # Max size of input for the pre-trained model

    # Prepare inputs tensors and dataloaders
@@ -198,20 +218,23 @@ def main():
    if args.do_train:
        if args.max_steps > 0:
            t_total = args.max_steps
-            args.num_train_epochs = args.max_steps //\
-                (len(train_dataloader) // args.gradient_accumulation_steps) + 1
+            args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
        else:
-            t_total = len(train_dataloader)\
-                // args.gradient_accumulation_steps * args.num_train_epochs
+            t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

        param_optimizer = list(model.named_parameters())
-        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
+        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
-            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
-            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-            ]
+            {
+                "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
+                "weight_decay": args.weight_decay,
+            },
+            {"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
+        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
+        scheduler = get_linear_schedule_with_warmup(
+            optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
+        )

    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
@@ -230,14 +253,16 @@ def main():
                optimizer.step()
                optimizer.zero_grad()
                tr_loss += loss.item()
-                exp_average_loss = loss.item() if exp_average_loss is None else 0.7*exp_average_loss+0.3*loss.item()
+                exp_average_loss = (
+                    loss.item() if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item()
+                )
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(exp_average_loss, scheduler.get_lr()[0])

    # Save a trained model
    if args.do_train:
        # Save a trained model, configuration and tokenizer
-        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
+        model_to_save = model.module if hasattr(model, "module") else model  # Only save the model itself

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
@@ -260,10 +285,12 @@ def main():
            batch = tuple(t.to(device) for t in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels = batch
            with torch.no_grad():
-               _, mc_loss, _, mc_logits = model(input_ids, mc_token_ids=mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels)
+                _, mc_loss, _, mc_logits = model(
+                    input_ids, mc_token_ids=mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels
+                )

            mc_logits = mc_logits.detach().cpu().numpy()
-            mc_labels = mc_labels.to('cpu').numpy()
+            mc_labels = mc_labels.to("cpu").numpy()
            tmp_eval_accuracy = accuracy(mc_logits, mc_labels)

            eval_loss += mc_loss.mean().item()
@@ -274,10 +301,8 @@ def main():

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
-        train_loss = tr_loss/nb_tr_steps if args.do_train else None
-        result = {'eval_loss': eval_loss,
-                  'eval_accuracy': eval_accuracy,
-                  'train_loss': train_loss}
+        train_loss = tr_loss / nb_tr_steps if args.do_train else None
+        result = {"eval_loss": eval_loss, "eval_accuracy": eval_accuracy, "train_loss": train_loss}

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
@@ -286,5 +311,6 @@ def main():
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

-if __name__ == '__main__':
+
+if __name__ == "__main__":
    main()
--- a/examples/contrib/run_swag.py
+++ b/examples/contrib/run_swag.py
@@ -16,50 +16,50 @@
 """BERT finetuning runner.
   Finetuning the library models for multiple choice on SWAG (Bert).
 """
-from __future__ import absolute_import, division, print_function
+

 import argparse
-import logging
 import csv
+import glob
+import logging
 import os
 import random
-import sys
-import glob

 import numpy as np
 import torch
-from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
-                              TensorDataset)
+from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
 from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm, trange

-from tensorboardX import SummaryWriter
+from transformers import (
+    WEIGHTS_NAME,
+    AdamW,
+    BertConfig,
+    BertForMultipleChoice,
+    BertTokenizer,
+    get_linear_schedule_with_warmup,
+)

-from transformers import (WEIGHTS_NAME, BertConfig,
-                                  BertForMultipleChoice, BertTokenizer)

-from transformers import AdamW, WarmupLinearSchedule
+try:
+    from torch.utils.tensorboard import SummaryWriter
+except ImportError:
+    from tensorboardX import SummaryWriter
+

 logger = logging.getLogger(__name__)

-ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) \
-                  for conf in [BertConfig]), ())
+ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in [BertConfig]), ())

 MODEL_CLASSES = {
-    'bert': (BertConfig, BertForMultipleChoice, BertTokenizer),
+    "bert": (BertConfig, BertForMultipleChoice, BertTokenizer),
 }

+
 class SwagExample(object):
    """A single training/test example for the SWAG dataset."""
-    def __init__(self,
-                 swag_id,
-                 context_sentence,
-                 start_ending,
-                 ending_0,
-                 ending_1,
-                 ending_2,
-                 ending_3,
-                 label = None):
+
+    def __init__(self, swag_id, context_sentence, start_ending, ending_0, ending_1, ending_2, ending_3, label=None):
        self.swag_id = swag_id
        self.context_sentence = context_sentence
        self.start_ending = start_ending
@@ -75,7 +75,7 @@ class SwagExample(object):
        return self.__repr__()

    def __repr__(self):
-        l = [
+        attributes = [
            "swag_id: {}".format(self.swag_id),
            "context_sentence: {}".format(self.context_sentence),
            "start_ending: {}".format(self.start_ending),
@@ -86,61 +86,48 @@ class SwagExample(object):
        ]

        if self.label is not None:
-            l.append("label: {}".format(self.label))
+            attributes.append("label: {}".format(self.label))
+
+        return ", ".join(attributes)

-        return ", ".join(l)

 class InputFeatures(object):
-    def __init__(self,
-                 example_id,
-                 choices_features,
-                 label
-
-    ):
+    def __init__(self, example_id, choices_features, label):
        self.example_id = example_id
        self.choices_features = [
-            {
-                'input_ids': input_ids,
-                'input_mask': input_mask,
-                'segment_ids': segment_ids
-            }
+            {"input_ids": input_ids, "input_mask": input_mask, "segment_ids": segment_ids}
            for _, input_ids, input_mask, segment_ids in choices_features
        ]
        self.label = label

-def read_swag_examples(input_file, is_training=True):
-    with open(input_file, 'r', encoding='utf-8') as f:
-        reader = csv.reader(f)
-        lines = []
-        for line in reader:
-            if sys.version_info[0] == 2:
-                line = list(unicode(cell, 'utf-8') for cell in line)
-            lines.append(line)

-    if is_training and lines[0][-1] != 'label':
-        raise ValueError(
-            "For training, the input file must contain a label column."
-        )
+def read_swag_examples(input_file, is_training=True):
+    with open(input_file, "r", encoding="utf-8") as f:
+        lines = list(csv.reader(f))
+
+    if is_training and lines[0][-1] != "label":
+        raise ValueError("For training, the input file must contain a label column.")

    examples = [
        SwagExample(
-            swag_id = line[2],
-            context_sentence = line[4],
-            start_ending = line[5], # in the swag dataset, the
-                                         # common beginning of each
-                                         # choice is stored in "sent2".
-            ending_0 = line[7],
-            ending_1 = line[8],
-            ending_2 = line[9],
-            ending_3 = line[10],
-            label = int(line[11]) if is_training else None
-        ) for line in lines[1:] # we skip the line with the column names
+            swag_id=line[2],
+            context_sentence=line[4],
+            start_ending=line[5],  # in the swag dataset, the
+            # common beginning of each
+            # choice is stored in "sent2".
+            ending_0=line[7],
+            ending_1=line[8],
+            ending_2=line[9],
+            ending_3=line[10],
+            label=int(line[11]) if is_training else None,
+        )
+        for line in lines[1:]  # we skip the line with the column names
    ]

    return examples

-def convert_examples_to_features(examples, tokenizer, max_seq_length,
-                                 is_training):
+
+def convert_examples_to_features(examples, tokenizer, max_seq_length, is_training):
    """Loads a data file into a list of `InputBatch`s."""

    # Swag is a multiple choice task. To perform this task using Bert,
@@ -200,23 +187,18 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
            logger.info("swag_id: {}".format(example.swag_id))
            for choice_idx, (tokens, input_ids, input_mask, segment_ids) in enumerate(choices_features):
                logger.info("choice: {}".format(choice_idx))
-                logger.info("tokens: {}".format(' '.join(tokens)))
-                logger.info("input_ids: {}".format(' '.join(map(str, input_ids))))
-                logger.info("input_mask: {}".format(' '.join(map(str, input_mask))))
-                logger.info("segment_ids: {}".format(' '.join(map(str, segment_ids))))
+                logger.info("tokens: {}".format(" ".join(tokens)))
+                logger.info("input_ids: {}".format(" ".join(map(str, input_ids))))
+                logger.info("input_mask: {}".format(" ".join(map(str, input_mask))))
+                logger.info("segment_ids: {}".format(" ".join(map(str, segment_ids))))
            if is_training:
                logger.info("label: {}".format(label))

-        features.append(
-            InputFeatures(
-                example_id = example.swag_id,
-                choices_features = choices_features,
-                label = label
-            )
-        )
+        features.append(InputFeatures(example_id=example.swag_id, choices_features=choices_features, label=label))

    return features

+
 def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""

@@ -233,18 +215,14 @@ def _truncate_seq_pair(tokens_a, tokens_b, max_length):
        else:
            tokens_b.pop()

+
 def accuracy(out, labels):
    outputs = np.argmax(out, axis=1)
    return np.sum(outputs == labels)

+
 def select_field(features, field):
-    return [
-        [
-            choice[field]
-            for choice in feature.choices_features
-        ]
-        for feature in features
-    ]
+    return [[choice[field] for choice in feature.choices_features] for feature in features]


 def set_seed(args):
@@ -254,24 +232,28 @@ def set_seed(args):
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

+
 def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

    # Load data features from cache or dataset file
    input_file = args.predict_file if evaluate else args.train_file
-    cached_features_file = os.path.join(os.path.dirname(input_file), 'cached_{}_{}_{}'.format(
-        'dev' if evaluate else 'train',
-        list(filter(None, args.model_name_or_path.split('/'))).pop(),
-        str(args.max_seq_length)))
+    cached_features_file = os.path.join(
+        os.path.dirname(input_file),
+        "cached_{}_{}_{}".format(
+            "dev" if evaluate else "train",
+            list(filter(None, args.model_name_or_path.split("/"))).pop(),
+            str(args.max_seq_length),
+        ),
+    )
    if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", input_file)
        examples = read_swag_examples(input_file)
-        features = convert_examples_to_features(
-            examples, tokenizer, args.max_seq_length, not evaluate)
+        features = convert_examples_to_features(examples, tokenizer, args.max_seq_length, not evaluate)

        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
@@ -281,21 +263,21 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

    # Convert to Tensors and build dataset
-    all_input_ids = torch.tensor(select_field(features, 'input_ids'), dtype=torch.long)
-    all_input_mask = torch.tensor(select_field(features, 'input_mask'), dtype=torch.long)
-    all_segment_ids = torch.tensor(select_field(features, 'segment_ids'), dtype=torch.long)
+    all_input_ids = torch.tensor(select_field(features, "input_ids"), dtype=torch.long)
+    all_input_mask = torch.tensor(select_field(features, "input_mask"), dtype=torch.long)
+    all_segment_ids = torch.tensor(select_field(features, "segment_ids"), dtype=torch.long)
    all_label = torch.tensor([f.label for f in features], dtype=torch.long)

    if evaluate:
-        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
-                                all_label)
+        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
    else:
-        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
-                                all_label)
+        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)

    if output_examples:
        return dataset, examples, features
    return dataset
+
+
 def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
@@ -312,13 +294,18 @@ def train(args, train_dataset, model, tokenizer):
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
-    no_decay = ['bias', 'LayerNorm.weight']
+    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
-        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
-        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-        ]
+        {
+            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+            "weight_decay": args.weight_decay,
+        },
+        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
+    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
+    scheduler = get_linear_schedule_with_warmup(
+        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
+    )
    if args.fp16:
        try:
            from apex import amp
@@ -332,17 +319,21 @@ def train(args, train_dataset, model, tokenizer):

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
-                                                          output_device=args.local_rank,
-                                                          find_unused_parameters=True)
+        model = torch.nn.parallel.DistributedDataParallel(
+            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
+        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
-    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
-                   args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
+    logger.info(
+        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
+        args.train_batch_size
+        * args.gradient_accumulation_steps
+        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
+    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

@@ -350,17 +341,19 @@ def train(args, train_dataset, model, tokenizer):
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
-    set_seed(args)  # Added here for reproductibility (even between python 2 and 3)
+    set_seed(args)  # Added here for reproductibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
-            inputs = {'input_ids':       batch[0],
-                      'attention_mask':  batch[1],
-                      #'token_type_ids':  None if args.model_type == 'xlm' else batch[2],
-                      'token_type_ids': batch[2],
-                      'labels':         batch[3]}
+            inputs = {
+                "input_ids": batch[0],
+                "attention_mask": batch[1],
+                # 'token_type_ids':  None if args.model_type == 'xlm' else batch[2],
+                "token_type_ids": batch[2],
+                "labels": batch[3],
+            }
            # if args.model_type in ['xlnet', 'xlm']:
            #     inputs.update({'cls_index': batch[5],
            #                    'p_mask':       batch[6]})
@@ -368,7 +361,7 @@ def train(args, train_dataset, model, tokenizer):
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
-                loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training
+                loss = loss.mean()  # mean() to average on multi-gpu parallel (not distributed) training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

@@ -389,23 +382,27 @@ def train(args, train_dataset, model, tokenizer):

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
-                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
+                    if (
+                        args.local_rank == -1 and args.evaluate_during_training
+                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
-                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
-                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
-                    tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
+                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
+                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
+                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
-                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
+                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
-                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+                    model_to_save = (
+                        model.module if hasattr(model, "module") else model
+                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_vocabulary(output_dir)
-                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
+                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
@@ -420,6 +417,7 @@ def train(args, train_dataset, model, tokenizer):

    return global_step, tr_loss / global_step

+
 def evaluate(args, model, tokenizer, prefix=""):
    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)

@@ -436,7 +434,6 @@ def evaluate(args, model, tokenizer, prefix=""):
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

-
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

@@ -444,11 +441,13 @@ def evaluate(args, model, tokenizer, prefix=""):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
-            inputs = {'input_ids':      batch[0],
-                      'attention_mask': batch[1],
-                      # 'token_type_ids': None if args.model_type == 'xlm' else batch[2]  # XLM don't use segment_ids
-                      'token_type_ids': batch[2],
-                      'labels':         batch[3]}
+            inputs = {
+                "input_ids": batch[0],
+                "attention_mask": batch[1],
+                # 'token_type_ids': None if args.model_type == 'xlm' else batch[2]  # XLM don't use segment_ids
+                "token_type_ids": batch[2],
+                "labels": batch[3],
+            }

            # if args.model_type in ['xlnet', 'xlm']:
            #     inputs.update({'cls_index': batch[4],
@@ -458,17 +457,16 @@ def evaluate(args, model, tokenizer, prefix=""):
            eval_loss += tmp_eval_loss.mean().item()

        logits = logits.detach().cpu().numpy()
-        label_ids = inputs['labels'].to('cpu').numpy()
+        label_ids = inputs["labels"].to("cpu").numpy()
        tmp_eval_accuracy = accuracy(logits, label_ids)
        eval_accuracy += tmp_eval_accuracy

        nb_eval_steps += 1
-        nb_eval_examples += inputs['input_ids'].size(0)
+        nb_eval_examples += inputs["input_ids"].size(0)

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_examples
-    result = {'eval_loss': eval_loss,
-              'eval_accuracy': eval_accuracy}
+    result = {"eval_loss": eval_loss, "eval_accuracy": eval_accuracy}

    output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
@@ -479,92 +477,144 @@ def evaluate(args, model, tokenizer, prefix=""):

    return result

+
 def main():
    parser = argparse.ArgumentParser()

-    ## Required parameters
-    parser.add_argument("--train_file", default=None, type=str, required=True,
-                        help="SWAG csv for training. E.g., train.csv")
-    parser.add_argument("--predict_file", default=None, type=str, required=True,
-                        help="SWAG csv for predictions. E.g., val.csv or test.csv")
-    parser.add_argument("--model_type", default=None, type=str, required=True,
-                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
-    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
-                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
-    parser.add_argument("--output_dir", default=None, type=str, required=True,
-                        help="The output directory where the model checkpoints and predictions will be written.")
+    # Required parameters
+    parser.add_argument(
+        "--train_file", default=None, type=str, required=True, help="SWAG csv for training. E.g., train.csv"
+    )
+    parser.add_argument(
+        "--predict_file",
+        default=None,
+        type=str,
+        required=True,
+        help="SWAG csv for predictions. E.g., val.csv or test.csv",
+    )
+    parser.add_argument(
+        "--model_type",
+        default=None,
+        type=str,
+        required=True,
+        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
+    )
+    parser.add_argument(
+        "--model_name_or_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
+    )
+    parser.add_argument(
+        "--output_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The output directory where the model checkpoints and predictions will be written.",
+    )

-    ## Other parameters
-    parser.add_argument("--config_name", default="", type=str,
-                        help="Pretrained config name or path if not the same as model_name")
-    parser.add_argument("--tokenizer_name", default="", type=str,
-                        help="Pretrained tokenizer name or path if not the same as model_name")
-    parser.add_argument("--max_seq_length", default=384, type=int,
-                        help="The maximum total input sequence length after tokenization. Sequences "
-                             "longer than this will be truncated, and sequences shorter than this will be padded.")
-    parser.add_argument("--do_train", action='store_true',
-                        help="Whether to run training.")
-    parser.add_argument("--do_eval", action='store_true',
-                        help="Whether to run eval on the dev set.")
-    parser.add_argument("--evaluate_during_training", action='store_true',
-                        help="Rul evaluation during training at each logging step.")
-    parser.add_argument("--do_lower_case", action='store_true',
-                        help="Set this flag if you are using an uncased model.")
+    # Other parameters
+    parser.add_argument(
+        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        default="",
+        type=str,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--max_seq_length",
+        default=384,
+        type=int,
+        help="The maximum total input sequence length after tokenization. Sequences "
+        "longer than this will be truncated, and sequences shorter than this will be padded.",
+    )
+    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
+    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
+    parser.add_argument(
+        "--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step."
+    )
+    parser.add_argument(
+        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
+    )

-    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
-                        help="Batch size per GPU/CPU for training.")
-    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
-                        help="Batch size per GPU/CPU for evaluation.")
-    parser.add_argument("--learning_rate", default=5e-5, type=float,
-                        help="The initial learning rate for Adam.")
-    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
-                        help="Number of updates steps to accumulate before performing a backward/update pass.")
-    parser.add_argument("--weight_decay", default=0.0, type=float,
-                        help="Weight deay if we apply some.")
-    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
-                        help="Epsilon for Adam optimizer.")
-    parser.add_argument("--max_grad_norm", default=1.0, type=float,
-                        help="Max gradient norm.")
-    parser.add_argument("--num_train_epochs", default=3.0, type=float,
-                        help="Total number of training epochs to perform.")
-    parser.add_argument("--max_steps", default=-1, type=int,
-                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
-    parser.add_argument("--warmup_steps", default=0, type=int,
-                        help="Linear warmup over warmup_steps.")
+    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
+    parser.add_argument(
+        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
+    )
+    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.")
+    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument(
+        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
+    )
+    parser.add_argument(
+        "--max_steps",
+        default=-1,
+        type=int,
+        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
+    )
+    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")

-    parser.add_argument('--logging_steps', type=int, default=50,
-                        help="Log every X updates steps.")
-    parser.add_argument('--save_steps', type=int, default=50,
-                        help="Save checkpoint every X updates steps.")
-    parser.add_argument("--eval_all_checkpoints", action='store_true',
-                        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
-    parser.add_argument("--no_cuda", action='store_true',
-                        help="Whether not to use CUDA when available")
-    parser.add_argument('--overwrite_output_dir', action='store_true',
-                        help="Overwrite the content of the output directory")
-    parser.add_argument('--overwrite_cache', action='store_true',
-                        help="Overwrite the cached training and evaluation sets")
-    parser.add_argument('--seed', type=int, default=42,
-                        help="random seed for initialization")
+    parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.")
+    parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.")
+    parser.add_argument(
+        "--eval_all_checkpoints",
+        action="store_true",
+        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
+    )
+    parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available")
+    parser.add_argument(
+        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
+    )
+    parser.add_argument(
+        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
+    )
+    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")

-    parser.add_argument("--local_rank", type=int, default=-1,
-                        help="local_rank for distributed training on gpus")
-    parser.add_argument('--fp16', action='store_true',
-                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
-    parser.add_argument('--fp16_opt_level', type=str, default='O1',
-                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
-                             "See details at https://nvidia.github.io/apex/amp.html")
-    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
-    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
+    parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
+    parser.add_argument(
+        "--fp16",
+        action="store_true",
+        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
+    )
+    parser.add_argument(
+        "--fp16_opt_level",
+        type=str,
+        default="O1",
+        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
+        "See details at https://nvidia.github.io/apex/amp.html",
+    )
+    parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
+    parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
    args = parser.parse_args()

-    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
-        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
+    if (
+        os.path.exists(args.output_dir)
+        and os.listdir(args.output_dir)
+        and args.do_train
+        and not args.overwrite_output_dir
+    ):
+        raise ValueError(
+            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
+                args.output_dir
+            )
+        )

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
+
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()
@@ -576,16 +626,24 @@ def main():
    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
-        torch.distributed.init_process_group(backend='nccl')
+        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
-    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
-                        datefmt = '%m/%d/%Y %H:%M:%S',
-                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
-    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
-                    args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
+    )
+    logger.warning(
+        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+        args.local_rank,
+        device,
+        args.n_gpu,
+        bool(args.local_rank != -1),
+        args.fp16,
+    )

    # Set seed
    set_seed(args)
@@ -597,8 +655,12 @@ def main():
    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
-    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case)
-    model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config)
+    tokenizer = tokenizer_class.from_pretrained(
+        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case
+    )
+    model = model_class.from_pretrained(
+        args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config
+    )

    if args.local_rank == 0:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
@@ -613,7 +675,6 @@ def main():
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

-
    # Save the trained model and the tokenizer
    if args.local_rank == -1 or torch.distributed.get_rank() == 0:
        # Create output directory if needed
@@ -623,19 +684,20 @@ def main():
        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
-        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+        model_to_save = (
+            model.module if hasattr(model, "module") else model
+        )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
-        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
+        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
        model = model_class.from_pretrained(args.output_dir)
        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
        model.to(args.device)

-
    # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
@@ -646,14 +708,16 @@ def main():
            checkpoints = [args.model_name_or_path]

        if args.eval_all_checkpoints:
-            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
+            checkpoints = list(
+                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
+            )
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce model loading logs

        logger.info("Evaluate the following checkpoints: %s", checkpoints)

        for checkpoint in checkpoints:
            # Reload the model
-            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
+            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
            model = model_class.from_pretrained(checkpoint)
            tokenizer = tokenizer_class.from_pretrained(checkpoint)
            model.to(args.device)
@@ -661,7 +725,7 @@ def main():
            # Evaluate
            result = evaluate(args, model, tokenizer, prefix=global_step)

-            result = dict((k + ('_{}'.format(global_step) if global_step else ''), v) for k, v in result.items())
+            result = dict((k + ("_{}".format(global_step) if global_step else ""), v) for k, v in result.items())
            results.update(result)

    logger.info("Results: {}".format(results))
--- a/examples/contrib/run_transfo_xl.py
+++ b/examples/contrib/run_transfo_xl.py
@@ -19,55 +19,48 @@

    This script with default values evaluates a pretrained Transformer-XL on WikiText 103
 """
-from __future__ import absolute_import, division, print_function, unicode_literals
+

 import argparse
 import logging
-import time
 import math
+import time

 import torch

-from transformers import TransfoXLLMHeadModel, TransfoXLCorpus, TransfoXLTokenizer
+from transformers import TransfoXLCorpus, TransfoXLLMHeadModel

-logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
-                    datefmt = '%m/%d/%Y %H:%M:%S',
-                    level = logging.INFO)
+
+logging.basicConfig(
+    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO
+)
 logger = logging.getLogger(__name__)

+
 def main():
-    parser = argparse.ArgumentParser(description='PyTorch Transformer Language Model')
-    parser.add_argument('--model_name', type=str, default='transfo-xl-wt103',
-                        help='pretrained model name')
-    parser.add_argument('--split', type=str, default='test',
-                        choices=['all', 'valid', 'test'],
-                        help='which split to evaluate')
-    parser.add_argument('--batch_size', type=int, default=10,
-                        help='batch size')
-    parser.add_argument('--tgt_len', type=int, default=128,
-                        help='number of tokens to predict')
-    parser.add_argument('--ext_len', type=int, default=0,
-                        help='length of the extended context')
-    parser.add_argument('--mem_len', type=int, default=1600,
-                        help='length of the retained previous heads')
-    parser.add_argument('--clamp_len', type=int, default=1000,
-                        help='max positional embedding index')
-    parser.add_argument('--no_cuda', action='store_true',
-                        help='Do not use CUDA even though CUA is available')
-    parser.add_argument('--work_dir', type=str, required=True,
-                        help='path to the work_dir')
-    parser.add_argument('--no_log', action='store_true',
-                        help='do not log the eval result')
-    parser.add_argument('--same_length', action='store_true',
-                        help='set same length attention with masking')
-    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
-    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
+    parser = argparse.ArgumentParser(description="PyTorch Transformer Language Model")
+    parser.add_argument("--model_name", type=str, default="transfo-xl-wt103", help="pretrained model name")
+    parser.add_argument(
+        "--split", type=str, default="test", choices=["all", "valid", "test"], help="which split to evaluate"
+    )
+    parser.add_argument("--batch_size", type=int, default=10, help="batch size")
+    parser.add_argument("--tgt_len", type=int, default=128, help="number of tokens to predict")
+    parser.add_argument("--ext_len", type=int, default=0, help="length of the extended context")
+    parser.add_argument("--mem_len", type=int, default=1600, help="length of the retained previous heads")
+    parser.add_argument("--clamp_len", type=int, default=1000, help="max positional embedding index")
+    parser.add_argument("--no_cuda", action="store_true", help="Do not use CUDA even though CUA is available")
+    parser.add_argument("--work_dir", type=str, required=True, help="path to the work_dir")
+    parser.add_argument("--no_log", action="store_true", help="do not log the eval result")
+    parser.add_argument("--same_length", action="store_true", help="set same length attention with masking")
+    parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
+    parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
    args = parser.parse_args()
-    assert args.ext_len >= 0, 'extended context length must be non-negative'
+    assert args.ext_len >= 0, "extended context length must be non-negative"

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
+
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()
@@ -80,21 +73,20 @@ def main():
    # The pre-processing involve computing word frequencies to prepare the Adaptive input and SoftMax
    # and tokenizing the dataset
    # The pre-processed corpus is a convertion (using the conversion script )
-    tokenizer = TransfoXLTokenizer.from_pretrained(args.model_name)
    corpus = TransfoXLCorpus.from_pretrained(args.model_name)
-    ntokens = len(corpus.vocab)

-    va_iter = corpus.get_iterator('valid', args.batch_size, args.tgt_len,
-        device=device, ext_len=args.ext_len)
-    te_iter = corpus.get_iterator('test', args.batch_size, args.tgt_len,
-        device=device, ext_len=args.ext_len)
+    va_iter = corpus.get_iterator("valid", args.batch_size, args.tgt_len, device=device, ext_len=args.ext_len)
+    te_iter = corpus.get_iterator("test", args.batch_size, args.tgt_len, device=device, ext_len=args.ext_len)

    # Load a pre-trained model
    model = TransfoXLLMHeadModel.from_pretrained(args.model_name)
    model = model.to(device)

-    logger.info('Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}'.format(
-        args.batch_size, args.tgt_len, args.ext_len, args.mem_len, args.clamp_len))
+    logger.info(
+        "Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}".format(
+            args.batch_size, args.tgt_len, args.ext_len, args.mem_len, args.clamp_len
+        )
+    )

    model.reset_length(args.tgt_len, args.ext_len, args.mem_len)
    if args.clamp_len > 0:
@@ -108,7 +100,7 @@ def main():
    def evaluate(eval_iter):
        # Turn on evaluation mode which disables dropout.
        model.eval()
-        total_len, total_loss = 0, 0.
+        total_len, total_loss = 0, 0.0
        start_time = time.time()
        with torch.no_grad():
            mems = None
@@ -119,35 +111,34 @@ def main():
                total_loss += seq_len * loss.item()
                total_len += seq_len
            total_time = time.time() - start_time
-        logger.info('Time : {:.2f}s, {:.2f}ms/segment'.format(
-                total_time, 1000 * total_time / (idx+1)))
+        logger.info("Time : {:.2f}s, {:.2f}ms/segment".format(total_time, 1000 * total_time / (idx + 1)))
        return total_loss / total_len

    # Run on test data.
-    if args.split == 'all':
+    if args.split == "all":
        test_loss = evaluate(te_iter)
        valid_loss = evaluate(va_iter)
-    elif args.split == 'valid':
+    elif args.split == "valid":
        valid_loss = evaluate(va_iter)
        test_loss = None
-    elif args.split == 'test':
+    elif args.split == "test":
        test_loss = evaluate(te_iter)
        valid_loss = None

    def format_log(loss, split):
-        log_str = '| {0} loss {1:5.2f} | {0} ppl {2:9.3f} '.format(
-            split, loss, math.exp(loss))
+        log_str = "| {0} loss {1:5.2f} | {0} ppl {2:9.3f} ".format(split, loss, math.exp(loss))
        return log_str

-    log_str = ''
+    log_str = ""
    if valid_loss is not None:
-        log_str += format_log(valid_loss, 'valid')
+        log_str += format_log(valid_loss, "valid")
    if test_loss is not None:
-        log_str += format_log(test_loss, 'test')
+        log_str += format_log(test_loss, "test")

-    logger.info('=' * 100)
+    logger.info("=" * 100)
    logger.info(log_str)
-    logger.info('=' * 100)
+    logger.info("=" * 100)

-if __name__ == '__main__':
+
+if __name__ == "__main__":
    main()
--- a/examples/distillation/README.md
+++ b/examples/distillation/README.md
@@ -1,53 +1,95 @@
 # Distil*

-This folder contains the original code used to train Distil* as well as examples showcasing how to use DistilBERT and DistilGPT2.
+This folder contains the original code used to train Distil* as well as examples showcasing how to use DistilBERT, DistilRoBERTa and DistilGPT2.

-**2019, October 3rd - Update** We release our [NeurIPS workshop paper](https://arxiv.org/abs/1910.01108) explaining our approach on **DistilBERT**. It includes updated results and further experiments. We applied the same method to GPT2 and release the weights of **DistilGPT2**. DistilGPT2 is two times faster and 33% smaller than GPT2.
+**January 20, 2020 - Bug fixing** We have recently discovered and fixed [a bug](https://github.com/huggingface/transformers/commit/48cbf267c988b56c71a2380f748a3e6092ccaed3) in the evaluation of our `run_*.py` scripts that caused the reported metrics to be over-estimated on average. We have updated all the metrics with the latest runs.
+
+**December 6, 2019 - Update** We release **DistilmBERT**: 92% of `bert-base-multilingual-cased` on XNLI. The model supports 104 different languages listed [here](https://github.com/google-research/bert/blob/master/multilingual.md#list-of-languages).
+
+**November 19, 2019 - Update** We release German **DistilBERT**: 98.8% of `bert-base-german-dbmdz-cased` on NER tasks.
+
+**October 23, 2019 - Update** We release **DistilRoBERTa**: 95% of `RoBERTa-base`'s performance on GLUE, twice as fast as RoBERTa while being 35% smaller.
+
+**October 3, 2019 - Update** We release our [NeurIPS workshop paper](https://arxiv.org/abs/1910.01108) explaining our approach on **DistilBERT**. It includes updated results and further experiments. We applied the same method to GPT2 and release the weights of **DistilGPT2**. DistilGPT2 is two times faster and 33% smaller than GPT2. **The paper superseeds our [previous blogpost](https://medium.com/huggingface/distilbert-8cf3380435b5) with a different distillation loss and better performances. Please use the paper as a reference when comparing/reporting results on DistilBERT.**
+
+**September 19, 2019 - Update:** We fixed bugs in the code and released an upadted version of the weights trained with a modification of the distillation loss. DistilBERT now reaches 99% of `BERT-base`'s performance on GLUE, and 86.9 F1 score on SQuAD v1.1 dev set (compared to 88.5 for `BERT-base`). We will publish a formal write-up of our approach in the near future!

-**2019, September 19th - Update:** We fixed bugs in the code and released an upadted version of the weights trained with a modification of the distillation loss. DistilBERT now reaches 97% of `BERT-base`'s performance on GLUE, and 86.9 F1 score on SQuAD v1.1 dev set (compared to 88.5 for `BERT-base`). We will publish a formal write-up of our approach in the near future!

 ## What is Distil*

-Distil* is a class of compressed models that started with DistilBERT. DistilBERT stands for Distillated-BERT. DistilBERT is a small, fast, cheap and light Transformer model based on Bert architecture. It has 40% less parameters than `bert-base-uncased`, runs 60% faster while preserving 97% of BERT's performances as measured on the GLUE language understanding benchmark. DistilBERT is trained using knowledge distillation, a technique to compress a large model called the teacher into a smaller model called the student. By distillating Bert, we obtain a smaller Transformer model that bears a lot of similarities with the original BERT model while being lighter, smaller and faster to run. DistilBERT is thus an interesting option to put large-scaled trained Transformer model into production.
+Distil* is a class of compressed models that started with DistilBERT. DistilBERT stands for Distillated-BERT. DistilBERT is a small, fast, cheap and light Transformer model based on Bert architecture. It has 40% less parameters than `bert-base-uncased`, runs 60% faster while preserving 99% of BERT's performances as measured on the GLUE language understanding benchmark. DistilBERT is trained using knowledge distillation, a technique to compress a large model called the teacher into a smaller model called the student. By distillating Bert, we obtain a smaller Transformer model that bears a lot of similarities with the original BERT model while being lighter, smaller and faster to run. DistilBERT is thus an interesting option to put large-scaled trained Transformer model into production.

-We have applied the same method to GPT2 and release the weights of the compressed model. On the [WikiText-103](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/) benchmark, GPT2 reaches a perplexity on the test set of 15.0 compared to 18.5 for DistilGPT2 (after fine-tuning on the train set).
+We have applied the same method to other Transformer architectures and released the weights:
+- GPT2: on the [WikiText-103](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/) benchmark, GPT2 reaches a perplexity on the test set of 16.3 compared to 21.1 for **DistilGPT2** (after fine-tuning on the train set).
+- RoBERTa: **DistilRoBERTa** reaches 95% of `RoBERTa-base`'s performance on GLUE while being twice faster and 35% smaller.
+- German BERT: **German DistilBERT** reaches 99% of `bert-base-german-dbmdz-cased`'s performance on German NER (CoNLL-2003).
+- Multilingual BERT: **DistilmBERT** reaches 92% of Multilingual BERT's performance on XNLI while being twice faster and 25% smaller. The model supports 104 languages listed [here](https://github.com/google-research/bert/blob/master/multilingual.md#list-of-languages).

-For more information on DistilBERT, please refer to our [NeurIPS workshop paper](https://arxiv.org/abs/1910.01108). The paper superseeds our [previous blogpost](https://medium.com/huggingface/distilbert-8cf3380435b5) with a different distillation loss and better performances.
+For more information on DistilBERT, please refer to our [NeurIPS workshop paper](https://arxiv.org/abs/1910.01108).

 Here are the results on the dev sets of GLUE:

-| Model      | Macro-score | CoLA | MNLI | MRPC | QNLI | QQP  | RTE  | SST-2| STS-B| WNLI |
-| :---:      |    :---:    | :---:| :---:| :---:| :---:| :---:| :---:| :---:| :---:| :---:|
-| BERT-base  |  **77.6**   | 48.9 | 84.3 | 88.6 | 89.3 | 89.5 | 71.3 | 91.7 | 91.2 | 43.7 |
-| DistilBERT |  **76.8**   | 49.1 | 81.8 | 90.2 | 90.2 | 89.2 | 62.9 | 92.7 | 90.7 | 44.4 |
+| Model                     | Macro-score                    | CoLA | MNLI | MRPC | QNLI | QQP  | RTE  | SST-2| STS-B| WNLI              |
+| :---:                     |    :---:                       | :---:| :---:| :---:| :---:| :---:| :---:| :---:| :---:| :---:             |
+| BERT-base-uncased         |  **74.9**                      | 49.2 | 80.8 | 87.4 | 87.5 | 86.4 | 61.7 | 92.0 | 83.8 | 45.1              |
+| DistilBERT-base-uncased   |  **74.3**                      | 43.6 | 79.0 | 87.5 | 85.3 | 84.9 | 59.9 | 90.7 | 81.2 | 56.3              |
+| BERT-base-cased           |  **78.2**                      | 58.2 | 83.9 | 87.8 | 91.0 | 89.2 | 66.1 | 91.7 | 89.2 | 46.5              |
+| DistilBERT-base-cased     |  **75.9**                      | 47.2 | 81.5 | 85.6 | 88.2 | 87.8 | 60.6 | 90.4 | 85.5 | 56.3              |
+| ---                       |    ---                         |  --- |  --- |  --- |  --- |  --- |  --- |  --- |  --- |  ---              |
+| RoBERTa-base (reported)   |  **83.2**/**86.4**<sup>2</sup> | 63.6 | 87.6 | 90.2 | 92.8 | 91.9 | 78.7 | 94.8 | 91.2 | 57.7<sup>3</sup>  |
+| DistilRoBERTa<sup>1</sup> |  **79.0**/**82.3**<sup>2</sup> | 59.3 | 84.0 | 86.6 | 90.8 | 89.4 | 67.9 | 92.5 | 88.3 | 52.1              |
+
+<sup>1</sup> We did not use the MNLI checkpoint for fine-tuning but directy perform transfer learning on the pre-trained DistilRoBERTa.
+
+<sup>2</sup> Macro-score computed without WNLI.
+
+<sup>3</sup> We compute this score ourselves for completeness.
+
+Here are the results on the *test* sets for 6 of the languages available in XNLI. The results are computed in the zero shot setting (trained on the English portion and evaluated on the target language portion):
+
+| Model                        | English | Spanish | Chinese | German | Arabic  | Urdu |
+| :---:                        | :---:   | :---:   | :---:   | :---:  | :---:   | :---:|
+| mBERT base cased (computed)  | 82.1    | 74.6    | 69.1    | 72.3   | 66.4    | 58.5 |
+| mBERT base uncased (reported)| 81.4    | 74.3    | 63.8    | 70.5   | 62.1    | 58.3 |
+| DistilmBERT                  | 78.2    | 69.1    | 64.0    | 66.3   | 59.1    | 54.7 |

 ## Setup

 This part of the library has only be tested with Python3.6+. There are few specific dependencies to install before launching a distillation, you can install them with the command `pip install -r requirements.txt`. 

-**Important note:** The training scripts have been updated to support PyTorch v1.2.0 (there are breakings changes compared to v1.1.0). It is important to note that there is a small internal bug in the current version of PyTorch available on pip that causes a memory leak in our training/distillation. It has been recently fixed and will likely be integrated into the next release. For the moment, we recommend to [compile PyTorch from source](https://github.com/pytorch/pytorch#from-source). Please refer to [issue 1179](https://github.com/huggingface/transformers/issues/1179) for more details.
+**Important note:** The training scripts have been updated to support PyTorch v1.2.0 (there are breakings changes compared to v1.1.0).
+

 ## How to use DistilBERT

-Transformers includes two pre-trained Distil* models, currently only provided for English (we are investigating the possibility to train and release a multilingual version of DistilBERT):
+Transformers includes five pre-trained Distil* models, currently only provided for English and German (we are investigating the possibility to train and release a multilingual version of DistilBERT):

 - `distilbert-base-uncased`: DistilBERT English language model pretrained on the same data used to pretrain Bert (concatenation of the Toronto Book Corpus and full English Wikipedia) using distillation with the supervision of the `bert-base-uncased` version of Bert. The model has 6 layers, 768 dimension and 12 heads, totalizing 66M parameters.
- `distilbert-base-uncased-distilled-squad`: A finetuned version of `distilbert-base-uncased` finetuned using (a second step of) knwoledge distillation on SQuAD 1.0. This model reaches a F1 score of 86.9 on the dev set (for comparison, Bert `bert-base-uncased` version reaches a 88.5 F1 score).
- `distilgpt2`: DistilGPT2 English language model pretrained with the supervision of `gpt2` (the smallest version of GPT2) on [OpenWebTextCorpus](https://skylion007.github.io/OpenWebTextCorpus/), a reproduction of OpenAI's WebText dataset and . The model has 6 layers, 768 dimension and 12 heads, totalizing 82M (compared to 124M parameters for GPT2). On average, DistilGPT2 is two times faster than GPT2.
- and more to come! 🤗🤗🤗
+- `distilbert-base-uncased-distilled-squad`: A finetuned version of `distilbert-base-uncased` finetuned using (a second step of) knwoledge distillation on SQuAD 1.0. This model reaches a F1 score of 79.8 on the dev set (for comparison, Bert `bert-base-uncased` version reaches a 82.3 F1 score).
+- `distilbert-base-cased`: DistilBERT English language model pretrained on the same data used to pretrain Bert (concatenation of the Toronto Book Corpus and full English Wikipedia) using distillation with the supervision of the `bert-base-cased` version of Bert. The model has 6 layers, 768 dimension and 12 heads, totalizing 65M parameters.
+- `distilbert-base-cased-distilled-squad`: A finetuned version of `distilbert-base-cased` finetuned using (a second step of) knwoledge distillation on SQuAD 1.0. This model reaches a F1 score of 87.1 on the dev set (for comparison, Bert `bert-base-cased` version reaches a 88.7 F1 score).
+- `distilbert-base-german-cased`: DistilBERT German language model pretrained on 1/2 of the data used to pretrain Bert using distillation with the supervision of the `bert-base-german-dbmdz-cased` version of German DBMDZ Bert. For NER tasks the model reaches a F1 score of 83.49 on the CoNLL-2003 test set (for comparison, `bert-base-german-dbmdz-cased` reaches a 84.52 F1 score), and a F1 score of 85.23 on the GermEval 2014 test set (`bert-base-german-dbmdz-cased` reaches a 86.89 F1 score).
+- `distilgpt2`: DistilGPT2 English language model pretrained with the supervision of `gpt2` (the smallest version of GPT2) on [OpenWebTextCorpus](https://skylion007.github.io/OpenWebTextCorpus/), a reproduction of OpenAI's WebText dataset. The model has 6 layers, 768 dimension and 12 heads, totalizing 82M parameters (compared to 124M parameters for GPT2). On average, DistilGPT2 is two times faster than GPT2.
+- `distilroberta-base`: DistilRoBERTa English language model pretrained with the supervision of `roberta-base` solely on [OpenWebTextCorpus](https://skylion007.github.io/OpenWebTextCorpus/), a reproduction of OpenAI's WebText dataset (it is ~4 times less training data than the teacher RoBERTa). The model has 6 layers, 768 dimension and 12 heads, totalizing 82M parameters (compared to 125M parameters for RoBERTa-base). On average DistilRoBERTa is twice as fast as Roberta-base.
+- `distilbert-base-multilingual-cased`: DistilmBERT multilingual model pretrained with the supervision of `bert-base-multilingual-cased` on the concatenation of Wikipedia in 104 different languages. The model supports the 104 languages listed [here](https://github.com/google-research/bert/blob/master/multilingual.md#list-of-languages). The model has 6 layers, 768 dimension and 12 heads, totalizing 134M parameters (compared to 177M parameters for mBERT-base). On average DistilmBERT is twice as fast as mBERT-base.

 Using DistilBERT is very similar to using BERT. DistilBERT share the same tokenizer as BERT's `bert-base-uncased` even though we provide a link to this tokenizer under the `DistilBertTokenizer` name to have a consistent naming between the library models.

 ```python
-tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
-model = DistilBertModel.from_pretrained('distilbert-base-uncased')
+tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
+model = DistilBertModel.from_pretrained('distilbert-base-cased')

 input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)
 outputs = model(input_ids)
 last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 ```

-Similarly, using DistilGPT2 simply consists in calling the GPT2 classes from a different pretrained checkpoint: `model = GPT2Model.from_pretrained('distilgpt2')`.
+Similarly, using the other Distil* models simply consists in calling the base classes with a different pretrained checkpoint:
+- DistilBERT uncased: `model = DistilBertModel.from_pretrained('distilbert-base-uncased')`
+- DistilGPT2: `model = GPT2Model.from_pretrained('distilgpt2')`
+- DistilRoBERTa: `model = RobertaModel.from_pretrained('distilroberta-base')`
+- DistilmBERT: `model = DistilBertModel.from_pretrained('distilbert-base-multilingual-cased')`
+

 ## How to train Distil*

@@ -88,7 +130,7 @@ python train.py \
    --student_config training_configs/distilbert-base-uncased.json \
    --teacher_type bert \
    --teacher_name bert-base-uncased \
-    --alpha_ce 5.0 --alpha_mlm 2.0 --alpha_cos 1.0 --mlm \
+    --alpha_ce 5.0 --alpha_mlm 2.0 --alpha_cos 1.0 --alpha_clm 0.0 --mlm \
    --freeze_pos_embs \
    --dump_path serialization_dir/my_first_training \
    --data_file data/binarized_text.bert-base-uncased.pickle \
@@ -124,7 +166,7 @@ python -m torch.distributed.launch \
        --student_config training_configs/distilbert-base-uncased.json \
        --teacher_type bert \
        --teacher_name bert-base-uncased \
-        --alpha_ce 0.33 --alpha_mlm 0.33 --alpha_cos 0.33 --mlm \
+        --alpha_ce 0.33 --alpha_mlm 0.33 --alpha_cos 0.33 --alpha_clm 0.0 --mlm \
        --freeze_pos_embs \
        --dump_path serialization_dir/my_first_training \
        --data_file data/binarized_text.bert-base-uncased.pickle \
@@ -134,3 +176,16 @@ python -m torch.distributed.launch \
 **Tips:** Starting distillated training with good initialization of the model weights is crucial to reach decent performance. In our experiments, we initialized our model from a few layers of the teacher (Bert) itself! Please refer to `scripts/extract.py` and `scripts/extract_distilbert.py` to create a valid initialization checkpoint and use `--student_pretrained_weights` argument to use this initialization for the distilled training!

 Happy distillation!
+
+## Citation
+
+If you find the resource useful, you should cite the following paper:
+
+```
+@inproceedings{sanh2019distilbert,
+  title={DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter},
+  author={Sanh, Victor and Debut, Lysandre and Chaumond, Julien and Wolf, Thomas},
+  booktitle={NeurIPS EMC^2 Workshop},
+  year={2019}
+}
+```
--- a/examples/distillation/distiller.py
+++ b/examples/distillation/distiller.py
@@ -15,36 +15,36 @@
 """ The distiller to distil the student.
    Adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM)
 """
-import os
 import math
-import psutil
+import os
 import time
-from tensorboardX import SummaryWriter
-from tqdm import trange, tqdm
-import numpy as np
-import psutil

+import psutil
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from torch.optim import AdamW
+from torch.utils.data import BatchSampler, DataLoader, RandomSampler
 from torch.utils.data.distributed import DistributedSampler
-from torch.utils.data import RandomSampler, BatchSampler, DataLoader
+from tqdm import tqdm

-from transformers import WarmupLinearSchedule
-
-from utils import logger
-from lm_seqs_dataset import LmSeqsDataset
 from grouped_batch_sampler import GroupedBatchSampler, create_lengths_groups
+from lm_seqs_dataset import LmSeqsDataset
+from transformers import get_linear_schedule_with_warmup
+from utils import logger
+
+
+try:
+    from torch.utils.tensorboard import SummaryWriter
+except ImportError:
+    from tensorboardX import SummaryWriter
+

 class Distiller:
-    def __init__(self,
-                 params: dict,
-                 dataset: LmSeqsDataset,
-                 token_probs: torch.tensor,
-                 student: nn.Module,
-                 teacher: nn.Module):
-        logger.info('Initializing Distiller')
+    def __init__(
+        self, params: dict, dataset: LmSeqsDataset, token_probs: torch.tensor, student: nn.Module, teacher: nn.Module
+    ):
+        logger.info("Initializing Distiller")
        self.params = params
        self.dump_path = params.dump_path
        self.multi_gpu = params.multi_gpu
@@ -67,12 +67,10 @@ class Distiller:
        else:
            sampler = BatchSampler(sampler=sampler, batch_size=params.batch_size, drop_last=False)

-        self.dataloader = DataLoader(dataset=dataset,
-                                     batch_sampler=sampler,
-                                     collate_fn=dataset.batch_sequences)
+        self.dataloader = DataLoader(dataset=dataset, batch_sampler=sampler, collate_fn=dataset.batch_sequences)

        self.temperature = params.temperature
-        assert self.temperature > 0.
+        assert self.temperature > 0.0

        self.alpha_ce = params.alpha_ce
        self.alpha_mlm = params.alpha_mlm
@@ -82,18 +80,18 @@ class Distiller:

        self.mlm = params.mlm
        if self.mlm:
-            logger.info(f'Using MLM loss for LM step.')
+            logger.info(f"Using MLM loss for LM step.")
            self.mlm_mask_prop = params.mlm_mask_prop
            assert 0.0 <= self.mlm_mask_prop <= 1.0
            assert params.word_mask + params.word_keep + params.word_rand == 1.0
            self.pred_probs = torch.FloatTensor([params.word_mask, params.word_keep, params.word_rand])
-            self.pred_probs = self.pred_probs.to(f'cuda:{params.local_rank}') if params.n_gpu > 0 else self.pred_probs
-            self.token_probs = token_probs.to(f'cuda:{params.local_rank}') if params.n_gpu > 0 else token_probs
+            self.pred_probs = self.pred_probs.to(f"cuda:{params.local_rank}") if params.n_gpu > 0 else self.pred_probs
+            self.token_probs = token_probs.to(f"cuda:{params.local_rank}") if params.n_gpu > 0 else token_probs
            if self.fp16:
                self.pred_probs = self.pred_probs.half()
                self.token_probs = self.token_probs.half()
        else:
-            logger.info(f'Using CLM loss for LM step.')
+            logger.info(f"Using CLM loss for LM step.")

        self.epoch = 0
        self.n_iter = 0
@@ -104,38 +102,54 @@ class Distiller:
        self.last_loss_ce = 0
        self.last_loss_mlm = 0
        self.last_loss_clm = 0
-        if self.alpha_mse > 0.: self.last_loss_mse = 0
-        if self.alpha_cos > 0.: self.last_loss_cos = 0
+        if self.alpha_mse > 0.0:
+            self.last_loss_mse = 0
+        if self.alpha_cos > 0.0:
+            self.last_loss_cos = 0
        self.last_log = 0

-        self.ce_loss_fct = nn.KLDivLoss(reduction='batchmean')
-        self.lm_loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
-        if self.alpha_mse > 0.:
-            self.mse_loss_fct = nn.MSELoss(reduction='sum')
-        if self.alpha_cos > 0.:
-            self.cosine_loss_fct = nn.CosineEmbeddingLoss(reduction='mean')
+        self.ce_loss_fct = nn.KLDivLoss(reduction="batchmean")
+        self.lm_loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
+        if self.alpha_mse > 0.0:
+            self.mse_loss_fct = nn.MSELoss(reduction="sum")
+        if self.alpha_cos > 0.0:
+            self.cosine_loss_fct = nn.CosineEmbeddingLoss(reduction="mean")

-        logger.info('--- Initializing model optimizer')
+        logger.info("--- Initializing model optimizer")
        assert params.gradient_accumulation_steps >= 1
        self.num_steps_epoch = len(self.dataloader)
-        num_train_optimization_steps = int(self.num_steps_epoch / params.gradient_accumulation_steps * params.n_epoch) + 1
+        num_train_optimization_steps = (
+            int(self.num_steps_epoch / params.gradient_accumulation_steps * params.n_epoch) + 1
+        )

-        no_decay = ['bias', 'LayerNorm.weight']
+        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
-            {'params': [p for n, p in student.named_parameters() if not any(nd in n for nd in no_decay) and p.requires_grad], 'weight_decay': params.weight_decay},
-            {'params': [p for n, p in student.named_parameters() if any(nd in n for nd in no_decay) and p.requires_grad], 'weight_decay': 0.0}
+            {
+                "params": [
+                    p for n, p in student.named_parameters() if not any(nd in n for nd in no_decay) and p.requires_grad
+                ],
+                "weight_decay": params.weight_decay,
+            },
+            {
+                "params": [
+                    p for n, p in student.named_parameters() if any(nd in n for nd in no_decay) and p.requires_grad
+                ],
+                "weight_decay": 0.0,
+            },
        ]
-        logger.info("------ Number of trainable parameters (student): %i" % sum([p.numel() for p in self.student.parameters() if p.requires_grad]))
+        logger.info(
+            "------ Number of trainable parameters (student): %i"
+            % sum([p.numel() for p in self.student.parameters() if p.requires_grad])
+        )
        logger.info("------ Number of parameters (student): %i" % sum([p.numel() for p in self.student.parameters()]))
-        self.optimizer = AdamW(optimizer_grouped_parameters,
-                               lr=params.learning_rate,
-                               eps=params.adam_epsilon,
-                               betas=(0.9, 0.98))
+        self.optimizer = AdamW(
+            optimizer_grouped_parameters, lr=params.learning_rate, eps=params.adam_epsilon, betas=(0.9, 0.98)
+        )

        warmup_steps = math.ceil(num_train_optimization_steps * params.warmup_prop)
-        self.scheduler = WarmupLinearSchedule(self.optimizer,
-                                                warmup_steps=warmup_steps,
-                                                t_total=num_train_optimization_steps)
+        self.scheduler = get_linear_schedule_with_warmup(
+            self.optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_train_optimization_steps
+        )

        if self.fp16:
            try:
@@ -143,33 +157,36 @@ class Distiller:
            except ImportError:
                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
            logger.info(f"Using fp16 training: {self.params.fp16_opt_level} level")
-            self.student, self.optimizer = amp.initialize(self.student,
-                                                          self.optimizer,
-                                                          opt_level=self.params.fp16_opt_level)
+            self.student, self.optimizer = amp.initialize(
+                self.student, self.optimizer, opt_level=self.params.fp16_opt_level
+            )
            self.teacher = self.teacher.half()

        if self.multi_gpu:
            if self.fp16:
                from apex.parallel import DistributedDataParallel
+
                logger.info("Using apex.parallel.DistributedDataParallel for distributed training.")
                self.student = DistributedDataParallel(self.student)
            else:
                from torch.nn.parallel import DistributedDataParallel
+
                logger.info("Using nn.parallel.DistributedDataParallel for distributed training.")
-                self.student = DistributedDataParallel(self.student,
-                                                       device_ids=[params.local_rank],
-                                                       output_device=params.local_rank,
-                                                       find_unused_parameters=True)
+                self.student = DistributedDataParallel(
+                    self.student,
+                    device_ids=[params.local_rank],
+                    output_device=params.local_rank,
+                    find_unused_parameters=True,
+                )

        self.is_master = params.is_master
        if self.is_master:
-            logger.info('--- Initializing Tensorboard')
-            self.tensorboard = SummaryWriter(log_dir=os.path.join(self.dump_path, 'log', 'train'))
-            self.tensorboard.add_text(tag='config/training', text_string=str(self.params), global_step=0)
-            self.tensorboard.add_text(tag='config/student', text_string=str(self.student_config), global_step=0)
+            logger.info("--- Initializing Tensorboard")
+            self.tensorboard = SummaryWriter(log_dir=os.path.join(self.dump_path, "log", "train"))
+            self.tensorboard.add_text(tag="config/training", text_string=str(self.params), global_step=0)
+            self.tensorboard.add_text(tag="config/student", text_string=str(self.student_config), global_step=0)

-    def prepare_batch_mlm(self,
-                          batch):
+    def prepare_batch_mlm(self, batch):
        """
        Prepare the batch: from the token_ids and the lenghts, compute the attention mask and the masked label for MLM.

@@ -183,13 +200,13 @@ class Distiller:
        -------
            token_ids: `torch.tensor(bs, seq_length)` - The token ids after the modifications for MLM.
            attn_mask: `torch.tensor(bs, seq_length)` - The attention mask for the self-attention.
-            mlm_labels: `torch.tensor(bs, seq_length)` - The masked languge modeling labels. There is a -1 where there is nothing to predict.
+            mlm_labels: `torch.tensor(bs, seq_length)` - The masked languge modeling labels. There is a -100 where there is nothing to predict.
        """
        token_ids, lengths = batch
        token_ids, lengths = self.round_batch(x=token_ids, lengths=lengths)
        assert token_ids.size(0) == lengths.size(0)

-        attn_mask = (torch.arange(token_ids.size(1), dtype=torch.long, device=lengths.device) < lengths[:, None])
+        attn_mask = torch.arange(token_ids.size(1), dtype=torch.long, device=lengths.device) < lengths[:, None]

        bs, max_seq_len = token_ids.size()
        mlm_labels = token_ids.new(token_ids.size()).copy_(token_ids)
@@ -197,11 +214,13 @@ class Distiller:
        x_prob = self.token_probs[token_ids.flatten()]
        n_tgt = math.ceil(self.mlm_mask_prop * lengths.sum().item())
        tgt_ids = torch.multinomial(x_prob / x_prob.sum(), n_tgt, replacement=False)
-        pred_mask = torch.zeros(bs * max_seq_len, dtype=torch.bool, device=token_ids.device) # previously `dtype=torch.uint8`, cf pytorch 1.2.0 compatibility
+        pred_mask = torch.zeros(
+            bs * max_seq_len, dtype=torch.bool, device=token_ids.device
+        )  # previously `dtype=torch.uint8`, cf pytorch 1.2.0 compatibility
        pred_mask[tgt_ids] = 1
        pred_mask = pred_mask.view(bs, max_seq_len)

-        pred_mask[token_ids == self.params.special_tok_ids['pad_token']] = 0
+        pred_mask[token_ids == self.params.special_tok_ids["pad_token"]] = 0

        # mask a number of words == 0 [8] (faster with fp16)
        if self.fp16:
@@ -210,26 +229,29 @@ class Distiller:
                pred_mask = pred_mask.view(-1)
                n2 = max(n1 % 8, 8 * (n1 // 8))
                if n2 != n1:
-                    pred_mask[torch.nonzero(pred_mask).view(-1)[:n1-n2]] = 0
+                    pred_mask[torch.nonzero(pred_mask).view(-1)[: n1 - n2]] = 0
                pred_mask = pred_mask.view(bs, max_seq_len)
                assert pred_mask.sum().item() % 8 == 0, pred_mask.sum().item()

        _token_ids_real = token_ids[pred_mask]
        _token_ids_rand = _token_ids_real.clone().random_(self.vocab_size)
-        _token_ids_mask = _token_ids_real.clone().fill_(self.params.special_tok_ids['mask_token'])
+        _token_ids_mask = _token_ids_real.clone().fill_(self.params.special_tok_ids["mask_token"])
        probs = torch.multinomial(self.pred_probs, len(_token_ids_real), replacement=True)
-        _token_ids = _token_ids_mask * (probs == 0).long() + _token_ids_real * (probs == 1).long() + _token_ids_rand * (probs == 2).long()
+        _token_ids = (
+            _token_ids_mask * (probs == 0).long()
+            + _token_ids_real * (probs == 1).long()
+            + _token_ids_rand * (probs == 2).long()
+        )
        token_ids = token_ids.masked_scatter(pred_mask, _token_ids)

-        mlm_labels[~pred_mask] = -1 # previously `mlm_labels[1-pred_mask] = -1`, cf pytorch 1.2.0 compatibility
+        mlm_labels[~pred_mask] = -100  # previously `mlm_labels[1-pred_mask] = -1`, cf pytorch 1.2.0 compatibility

        # sanity checks
        assert 0 <= token_ids.min() <= token_ids.max() < self.vocab_size

        return token_ids, attn_mask, mlm_labels

-    def prepare_batch_clm(self,
-                          batch):
+    def prepare_batch_clm(self, batch):
        """
        Prepare the batch: from the token_ids and the lenghts, compute the attention mask and the labels for CLM.

@@ -243,24 +265,22 @@ class Distiller:
        -------
            token_ids: `torch.tensor(bs, seq_length)` - The token ids after the modifications for MLM.
            attn_mask: `torch.tensor(bs, seq_length)` - The attention mask for the self-attention.
-            clm_labels: `torch.tensor(bs, seq_length)` - The causal languge modeling labels. There is a -1 where there is nothing to predict.
+            clm_labels: `torch.tensor(bs, seq_length)` - The causal languge modeling labels. There is a -100 where there is nothing to predict.
        """
        token_ids, lengths = batch
        token_ids, lengths = self.round_batch(x=token_ids, lengths=lengths)
        assert token_ids.size(0) == lengths.size(0)

-        attn_mask = (torch.arange(token_ids.size(1), dtype=torch.long, device=lengths.device) < lengths[:, None])
+        attn_mask = torch.arange(token_ids.size(1), dtype=torch.long, device=lengths.device) < lengths[:, None]
        clm_labels = token_ids.new(token_ids.size()).copy_(token_ids)
-        clm_labels[~attn_mask] = -1 # previously `clm_labels[1-attn_mask] = -1`, cf pytorch 1.2.0 compatibility
+        clm_labels[~attn_mask] = -100  # previously `clm_labels[1-attn_mask] = -1`, cf pytorch 1.2.0 compatibility

        # sanity checks
        assert 0 <= token_ids.min() <= token_ids.max() < self.vocab_size

        return token_ids, attn_mask, clm_labels

-    def round_batch(self,
-                    x: torch.tensor,
-                    lengths: torch.tensor):
+    def round_batch(self, x: torch.tensor, lengths: torch.tensor):
        """
        For float16 only.
        Sub-sample sentences in a batch, and add padding, so that each dimension is a multiple of 8.
@@ -296,9 +316,9 @@ class Distiller:
            pad = 8 - (ml1 % 8)
            ml2 = ml1 + pad
            if self.mlm:
-                pad_id = self.params.special_tok_ids['pad_token']
+                pad_id = self.params.special_tok_ids["pad_token"]
            else:
-                pad_id = self.params.special_tok_ids['unk_token']
+                pad_id = self.params.special_tok_ids["unk_token"]
            padding_tensor = torch.zeros(bs2, pad, dtype=torch.long, device=x.device).fill_(pad_id)
            x = torch.cat([x, padding_tensor], 1)
            assert x.size() == (bs2, ml2)
@@ -311,20 +331,22 @@ class Distiller:
        """
        The real training loop.
        """
-        if self.is_master: logger.info('Starting training')
+        if self.is_master:
+            logger.info("Starting training")
        self.last_log = time.time()
        self.student.train()
        self.teacher.eval()

        for _ in range(self.params.n_epoch):
-            if self.is_master: logger.info(f'--- Starting epoch {self.epoch}/{self.params.n_epoch-1}')
+            if self.is_master:
+                logger.info(f"--- Starting epoch {self.epoch}/{self.params.n_epoch-1}")
            if self.multi_gpu:
                torch.distributed.barrier()

            iter_bar = tqdm(self.dataloader, desc="-Iter", disable=self.params.local_rank not in [-1, 0])
            for batch in iter_bar:
                if self.params.n_gpu > 0:
-                    batch = tuple(t.to(f'cuda:{self.params.local_rank}') for t in batch)
+                    batch = tuple(t.to(f"cuda:{self.params.local_rank}") for t in batch)

                if self.mlm:
                    token_ids, attn_mask, lm_labels = self.prepare_batch_mlm(batch=batch)
@@ -333,22 +355,21 @@ class Distiller:
                self.step(input_ids=token_ids, attention_mask=attn_mask, lm_labels=lm_labels)

                iter_bar.update()
-                iter_bar.set_postfix({'Last_loss': f'{self.last_loss:.2f}',
-                                      'Avg_cum_loss': f'{self.total_loss_epoch/self.n_iter:.2f}'})
+                iter_bar.set_postfix(
+                    {"Last_loss": f"{self.last_loss:.2f}", "Avg_cum_loss": f"{self.total_loss_epoch/self.n_iter:.2f}"}
+                )
            iter_bar.close()

-            if self.is_master: logger.info(f'--- Ending epoch {self.epoch}/{self.params.n_epoch-1}')
+            if self.is_master:
+                logger.info(f"--- Ending epoch {self.epoch}/{self.params.n_epoch-1}")
            self.end_epoch()

        if self.is_master:
-            logger.info(f'Save very last checkpoint as `pytorch_model.bin`.')
-            self.save_checkpoint(checkpoint_name=f'pytorch_model.bin')
-            logger.info('Training is finished')
+            logger.info(f"Save very last checkpoint as `pytorch_model.bin`.")
+            self.save_checkpoint(checkpoint_name=f"pytorch_model.bin")
+            logger.info("Training is finished")

-    def step(self,
-             input_ids: torch.tensor,
-             attention_mask: torch.tensor,
-             lm_labels: torch.tensor):
+    def step(self, input_ids: torch.tensor, attention_mask: torch.tensor, lm_labels: torch.tensor):
        """
        One optimization step: forward of student AND teacher, backward on the loss (for gradient accumulation),
        and possibly a parameter update (depending on the gradient accumulation).
@@ -360,78 +381,91 @@ class Distiller:
        lm_labels: `torch.tensor(bs, seq_length)` - The language modeling labels (mlm labels for MLM and clm labels for CLM).
        """
        if self.mlm:
-            s_logits, s_hidden_states = self.student(input_ids=input_ids, attention_mask=attention_mask)     # (bs, seq_length, voc_size)
+            s_logits, s_hidden_states = self.student(
+                input_ids=input_ids, attention_mask=attention_mask
+            )  # (bs, seq_length, voc_size)
            with torch.no_grad():
-                t_logits, t_hidden_states = self.teacher(input_ids=input_ids, attention_mask=attention_mask) # (bs, seq_length, voc_size)
+                t_logits, t_hidden_states = self.teacher(
+                    input_ids=input_ids, attention_mask=attention_mask
+                )  # (bs, seq_length, voc_size)
        else:
-            s_logits, _, s_hidden_states = self.student(input_ids=input_ids, attention_mask=None)            # (bs, seq_length, voc_size)
+            s_logits, _, s_hidden_states = self.student(
+                input_ids=input_ids, attention_mask=None
+            )  # (bs, seq_length, voc_size)
            with torch.no_grad():
-                t_logits, _, t_hidden_states = self.teacher(input_ids=input_ids, attention_mask=None)           # (bs, seq_length, voc_size)
+                t_logits, _, t_hidden_states = self.teacher(
+                    input_ids=input_ids, attention_mask=None
+                )  # (bs, seq_length, voc_size)
        assert s_logits.size() == t_logits.size()

-        #https://github.com/peterliht/knowledge-distillation-pytorch/blob/master/model/net.py#L100
-        #https://github.com/peterliht/knowledge-distillation-pytorch/issues/2
+        # https://github.com/peterliht/knowledge-distillation-pytorch/blob/master/model/net.py#L100
+        # https://github.com/peterliht/knowledge-distillation-pytorch/issues/2
        if self.params.restrict_ce_to_mask:
-            mask = (lm_labels>-1).unsqueeze(-1).expand_as(s_logits)    # (bs, seq_lenth, voc_size)
+            mask = (lm_labels > -1).unsqueeze(-1).expand_as(s_logits)  # (bs, seq_lenth, voc_size)
        else:
-            mask = attention_mask.unsqueeze(-1).expand_as(s_logits)    # (bs, seq_lenth, voc_size)
-        s_logits_slct = torch.masked_select(s_logits, mask)            # (bs * seq_length * voc_size) modulo the 1s in mask
-        s_logits_slct = s_logits_slct.view(-1, s_logits.size(-1))      # (bs * seq_length, voc_size) modulo the 1s in mask
-        t_logits_slct = torch.masked_select(t_logits, mask)            # (bs * seq_length * voc_size) modulo the 1s in mask
-        t_logits_slct = t_logits_slct.view(-1, s_logits.size(-1))      # (bs * seq_length, voc_size) modulo the 1s in mask
+            mask = attention_mask.unsqueeze(-1).expand_as(s_logits)  # (bs, seq_lenth, voc_size)
+        s_logits_slct = torch.masked_select(s_logits, mask)  # (bs * seq_length * voc_size) modulo the 1s in mask
+        s_logits_slct = s_logits_slct.view(-1, s_logits.size(-1))  # (bs * seq_length, voc_size) modulo the 1s in mask
+        t_logits_slct = torch.masked_select(t_logits, mask)  # (bs * seq_length * voc_size) modulo the 1s in mask
+        t_logits_slct = t_logits_slct.view(-1, s_logits.size(-1))  # (bs * seq_length, voc_size) modulo the 1s in mask
        assert t_logits_slct.size() == s_logits_slct.size()

-        loss_ce = self.ce_loss_fct(F.log_softmax(s_logits_slct/self.temperature, dim=-1),
-                                   F.softmax(t_logits_slct/self.temperature, dim=-1)) * (self.temperature)**2
-        loss = self.alpha_ce*loss_ce
+        loss_ce = (
+            self.ce_loss_fct(
+                F.log_softmax(s_logits_slct / self.temperature, dim=-1),
+                F.softmax(t_logits_slct / self.temperature, dim=-1),
+            )
+            * (self.temperature) ** 2
+        )
+        loss = self.alpha_ce * loss_ce

-        if self.alpha_mlm > 0.:
+        if self.alpha_mlm > 0.0:
            loss_mlm = self.lm_loss_fct(s_logits.view(-1, s_logits.size(-1)), lm_labels.view(-1))
            loss += self.alpha_mlm * loss_mlm
-        if self.alpha_clm > 0.:
+        if self.alpha_clm > 0.0:
            shift_logits = s_logits[..., :-1, :].contiguous()
            shift_labels = lm_labels[..., 1:].contiguous()
-            loss_clm = self.lm_loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
-                                        shift_labels.view(-1))
+            loss_clm = self.lm_loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
            loss += self.alpha_clm * loss_clm

-        if self.alpha_mse > 0.:
-            loss_mse = self.mse_loss_fct(s_logits_slct, t_logits_slct)/s_logits_slct.size(0) # Reproducing batchmean reduction
+        if self.alpha_mse > 0.0:
+            loss_mse = self.mse_loss_fct(s_logits_slct, t_logits_slct) / s_logits_slct.size(
+                0
+            )  # Reproducing batchmean reduction
            loss += self.alpha_mse * loss_mse
-        if self.alpha_cos > 0.:
-            s_hidden_states = s_hidden_states[-1]                              # (bs, seq_length, dim)
-            t_hidden_states = t_hidden_states[-1]                              # (bs, seq_length, dim)
-            mask = attention_mask.unsqueeze(-1).expand_as(s_hidden_states)     # (bs, seq_length, dim)
+        if self.alpha_cos > 0.0:
+            s_hidden_states = s_hidden_states[-1]  # (bs, seq_length, dim)
+            t_hidden_states = t_hidden_states[-1]  # (bs, seq_length, dim)
+            mask = attention_mask.unsqueeze(-1).expand_as(s_hidden_states)  # (bs, seq_length, dim)
            assert s_hidden_states.size() == t_hidden_states.size()
            dim = s_hidden_states.size(-1)
-            
-            s_hidden_states_slct = torch.masked_select(s_hidden_states, mask)        # (bs * seq_length * dim)
-            s_hidden_states_slct = s_hidden_states_slct.view(-1, dim)                # (bs * seq_length, dim)
-            t_hidden_states_slct = torch.masked_select(t_hidden_states, mask)        # (bs * seq_length * dim)
-            t_hidden_states_slct = t_hidden_states_slct.view(-1, dim)                # (bs * seq_length, dim)
-        
-            target = s_hidden_states_slct.new(s_hidden_states_slct.size(0)).fill_(1) # (bs * seq_length,)
+
+            s_hidden_states_slct = torch.masked_select(s_hidden_states, mask)  # (bs * seq_length * dim)
+            s_hidden_states_slct = s_hidden_states_slct.view(-1, dim)  # (bs * seq_length, dim)
+            t_hidden_states_slct = torch.masked_select(t_hidden_states, mask)  # (bs * seq_length * dim)
+            t_hidden_states_slct = t_hidden_states_slct.view(-1, dim)  # (bs * seq_length, dim)
+
+            target = s_hidden_states_slct.new(s_hidden_states_slct.size(0)).fill_(1)  # (bs * seq_length,)
            loss_cos = self.cosine_loss_fct(s_hidden_states_slct, t_hidden_states_slct, target)
            loss += self.alpha_cos * loss_cos

        self.total_loss_epoch += loss.item()
        self.last_loss = loss.item()
        self.last_loss_ce = loss_ce.item()
-        if self.alpha_mlm > 0.:
+        if self.alpha_mlm > 0.0:
            self.last_loss_mlm = loss_mlm.item()
-        if self.alpha_clm > 0.:
+        if self.alpha_clm > 0.0:
            self.last_loss_clm = loss_clm.item()
-        if self.alpha_mse > 0.:
+        if self.alpha_mse > 0.0:
            self.last_loss_mse = loss_mse.item()
-        if self.alpha_cos > 0.:
+        if self.alpha_cos > 0.0:
            self.last_loss_cos = loss_cos.item()

        self.optimize(loss)

        self.n_sequences_epoch += input_ids.size(0)

-    def optimize(self,
-                 loss):
+    def optimize(self, loss):
        """
        Normalization on the loss (gradient accumulation or distributed training), followed by
        backward pass on the loss, possibly followed by a parameter update (depending on the gradient accumulation).
@@ -439,7 +473,7 @@ class Distiller:
        """
        # Check for NaN
        if (loss != loss).data.any():
-            logger.error('NaN detected')
+            logger.error("NaN detected")
            exit()

        if self.multi_gpu:
@@ -449,6 +483,7 @@ class Distiller:

        if self.fp16:
            from apex import amp
+
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
@@ -485,53 +520,84 @@ class Distiller:
            return

        for param_name, param in self.student.named_parameters():
-            self.tensorboard.add_scalar(tag='parameter_mean/' + param_name, scalar_value=param.data.mean(), global_step=self.n_total_iter)
-            self.tensorboard.add_scalar(tag='parameter_std/' + param_name, scalar_value=param.data.std(), global_step=self.n_total_iter)
+            self.tensorboard.add_scalar(
+                tag="parameter_mean/" + param_name, scalar_value=param.data.mean(), global_step=self.n_total_iter
+            )
+            self.tensorboard.add_scalar(
+                tag="parameter_std/" + param_name, scalar_value=param.data.std(), global_step=self.n_total_iter
+            )
            if param.grad is None:
                continue
-            self.tensorboard.add_scalar(tag="grad_mean/" + param_name, scalar_value=param.grad.data.mean(),global_step=self.n_total_iter)
-            self.tensorboard.add_scalar(tag="grad_std/" + param_name, scalar_value=param.grad.data.std(), global_step=self.n_total_iter)
+            self.tensorboard.add_scalar(
+                tag="grad_mean/" + param_name, scalar_value=param.grad.data.mean(), global_step=self.n_total_iter
+            )
+            self.tensorboard.add_scalar(
+                tag="grad_std/" + param_name, scalar_value=param.grad.data.std(), global_step=self.n_total_iter
+            )

-        self.tensorboard.add_scalar(tag="losses/cum_avg_loss_epoch", scalar_value=self.total_loss_epoch/self.n_iter, global_step=self.n_total_iter)
+        self.tensorboard.add_scalar(
+            tag="losses/cum_avg_loss_epoch",
+            scalar_value=self.total_loss_epoch / self.n_iter,
+            global_step=self.n_total_iter,
+        )
        self.tensorboard.add_scalar(tag="losses/loss", scalar_value=self.last_loss, global_step=self.n_total_iter)
-        self.tensorboard.add_scalar(tag="losses/loss_ce", scalar_value=self.last_loss_ce, global_step=self.n_total_iter)
-        if self.alpha_mlm > 0.:
-            self.tensorboard.add_scalar(tag="losses/loss_mlm", scalar_value=self.last_loss_mlm, global_step=self.n_total_iter)
-        if self.alpha_clm > 0.:
-            self.tensorboard.add_scalar(tag="losses/loss_clm", scalar_value=self.last_loss_clm, global_step=self.n_total_iter)
-        if self.alpha_mse > 0.:
-            self.tensorboard.add_scalar(tag="losses/loss_mse", scalar_value=self.last_loss_mse, global_step=self.n_total_iter)
-        if self.alpha_cos > 0.:
-            self.tensorboard.add_scalar(tag="losses/loss_cos", scalar_value=self.last_loss_cos, global_step=self.n_total_iter)
-        self.tensorboard.add_scalar(tag="learning_rate/lr", scalar_value=self.scheduler.get_lr()[0], global_step=self.n_total_iter)
-        
-        self.tensorboard.add_scalar(tag="global/memory_usage", scalar_value=psutil.virtual_memory()._asdict()['used']/1_000_000, global_step=self.n_total_iter)
-        self.tensorboard.add_scalar(tag="global/speed", scalar_value=time.time()-self.last_log, global_step=self.n_total_iter)
+        self.tensorboard.add_scalar(
+            tag="losses/loss_ce", scalar_value=self.last_loss_ce, global_step=self.n_total_iter
+        )
+        if self.alpha_mlm > 0.0:
+            self.tensorboard.add_scalar(
+                tag="losses/loss_mlm", scalar_value=self.last_loss_mlm, global_step=self.n_total_iter
+            )
+        if self.alpha_clm > 0.0:
+            self.tensorboard.add_scalar(
+                tag="losses/loss_clm", scalar_value=self.last_loss_clm, global_step=self.n_total_iter
+            )
+        if self.alpha_mse > 0.0:
+            self.tensorboard.add_scalar(
+                tag="losses/loss_mse", scalar_value=self.last_loss_mse, global_step=self.n_total_iter
+            )
+        if self.alpha_cos > 0.0:
+            self.tensorboard.add_scalar(
+                tag="losses/loss_cos", scalar_value=self.last_loss_cos, global_step=self.n_total_iter
+            )
+        self.tensorboard.add_scalar(
+            tag="learning_rate/lr", scalar_value=self.scheduler.get_lr()[0], global_step=self.n_total_iter
+        )
+
+        self.tensorboard.add_scalar(
+            tag="global/memory_usage",
+            scalar_value=psutil.virtual_memory()._asdict()["used"] / 1_000_000,
+            global_step=self.n_total_iter,
+        )
+        self.tensorboard.add_scalar(
+            tag="global/speed", scalar_value=time.time() - self.last_log, global_step=self.n_total_iter
+        )

    def end_epoch(self):
        """
        Finally arrived at the end of epoch (full pass on dataset).
        Do some tensorboard logging and checkpoint saving.
        """
-        logger.info(f'{self.n_sequences_epoch} sequences have been trained during this epoch.')
+        logger.info(f"{self.n_sequences_epoch} sequences have been trained during this epoch.")

        if self.is_master:
-            self.save_checkpoint(checkpoint_name=f'model_epoch_{self.epoch}.pth')
-            self.tensorboard.add_scalar(tag='epoch/loss', scalar_value=self.total_loss_epoch/self.n_iter, global_step=self.epoch)
+            self.save_checkpoint(checkpoint_name=f"model_epoch_{self.epoch}.pth")
+            self.tensorboard.add_scalar(
+                tag="epoch/loss", scalar_value=self.total_loss_epoch / self.n_iter, global_step=self.epoch
+            )

        self.epoch += 1
        self.n_sequences_epoch = 0
        self.n_iter = 0
        self.total_loss_epoch = 0

-    def save_checkpoint(self,
-                        checkpoint_name: str = 'checkpoint.pth'):
+    def save_checkpoint(self, checkpoint_name: str = "checkpoint.pth"):
        """
        Save the current state. Only by the master process.
        """
        if not self.is_master:
            return
-        mdl_to_save = self.student.module if hasattr(self.student, 'module') else self.student
+        mdl_to_save = self.student.module if hasattr(self.student, "module") else self.student
        mdl_to_save.config.save_pretrained(self.dump_path)
        state_dict = mdl_to_save.state_dict()
        torch.save(state_dict, os.path.join(self.dump_path, checkpoint_name))
--- a/examples/distillation/grouped_batch_sampler.py
+++ b/examples/distillation/grouped_batch_sampler.py
@@ -17,18 +17,20 @@
 import bisect
 import copy
 from collections import defaultdict
-import numpy as np

+import numpy as np
 from torch.utils.data.sampler import BatchSampler, Sampler

 from utils import logger

+
 def _quantize(x, bins):
    bins = copy.deepcopy(bins)
    bins = sorted(bins)
    quantized = list(map(lambda y: bisect.bisect_right(bins, y), x))
    return quantized

+
 def create_lengths_groups(lengths, k=0):
    bins = np.arange(start=3, stop=k, step=4).tolist() if k > 0 else [10]
    groups = _quantize(lengths, bins)
@@ -39,6 +41,7 @@ def create_lengths_groups(lengths, k=0):
    logger.info("Count of instances per bin: {}".format(counts))
    return groups

+
 class GroupedBatchSampler(BatchSampler):
    """
    Wraps another sampler to yield a mini-batch of indices.
@@ -53,11 +56,11 @@ class GroupedBatchSampler(BatchSampler):
            0, i.e. they must be in the range [0, num_groups).
        batch_size (int): Size of mini-batch.
    """
+
    def __init__(self, sampler, group_ids, batch_size):
        if not isinstance(sampler, Sampler):
            raise ValueError(
-                "sampler should be an instance of "
-                "torch.utils.data.Sampler, but got sampler={}".format(sampler)
+                "sampler should be an instance of " "torch.utils.data.Sampler, but got sampler={}".format(sampler)
            )
        self.sampler = sampler
        self.group_ids = group_ids
@@ -73,7 +76,7 @@ class GroupedBatchSampler(BatchSampler):
            buffer_per_group[group_id].append(idx)
            samples_per_group[group_id].append(idx)
            if len(buffer_per_group[group_id]) == self.batch_size:
-                yield buffer_per_group[group_id] #TODO
+                yield buffer_per_group[group_id]  # TODO
                num_batches += 1
                del buffer_per_group[group_id]
            assert len(buffer_per_group[group_id]) < self.batch_size
@@ -90,8 +93,8 @@ class GroupedBatchSampler(BatchSampler):
            for group_id, idxs in sorted(buffer_per_group.items(), key=lambda x: x[0]):
                batch_idx.extend(idxs)
                if len(batch_idx) >= self.batch_size:
-                    yield batch_idx[:self.batch_size]
-                    batch_idx = batch_idx[self.batch_size:]
+                    yield batch_idx[: self.batch_size]
+                    batch_idx = batch_idx[self.batch_size :]
                    num_remaining -= 1
            if len(batch_idx) > 0:
                yield batch_idx
--- a/examples/distillation/lm_seqs_dataset.py
+++ b/examples/distillation/lm_seqs_dataset.py
@@ -15,12 +15,13 @@
 """ Dataset to distilled models
    adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM)
 """
+import numpy as np
 import torch
 from torch.utils.data import Dataset

-import numpy as np
 from utils import logger

+
 class LmSeqsDataset(Dataset):
    """Custom Dataset wrapping language modeling sequences.

@@ -32,9 +33,7 @@ class LmSeqsDataset(Dataset):
        data: `List[np.array[int]]
    """

-    def __init__(self,
-                 params,
-                 data):
+    def __init__(self, params, data):
        self.params = params

        self.token_ids = np.array(data)
@@ -43,6 +42,7 @@ class LmSeqsDataset(Dataset):
        self.check()
        self.remove_long_sequences()
        self.remove_empty_sequences()
+        self.remove_unknown_sequences()
        self.check()
        self.print_statistics()

@@ -57,7 +57,7 @@ class LmSeqsDataset(Dataset):
        Some sanity checks
        """
        assert len(self.token_ids) == len(self.lengths)
-        assert all(self.lengths[i] == len(self.token_ids[i]) for i in range(len(self.lengths))) 
+        assert all(self.lengths[i] == len(self.token_ids[i]) for i in range(len(self.lengths)))

    def remove_long_sequences(self):
        """
@@ -65,17 +65,17 @@ class LmSeqsDataset(Dataset):
        """
        max_len = self.params.max_model_input_size
        indices = self.lengths > max_len
-        logger.info(f'Splitting {sum(indices)} too long sequences.')
+        logger.info(f"Splitting {sum(indices)} too long sequences.")

        def divide_chunks(l, n):
-            return [l[i:i + n] for i in range(0, len(l), n)]
+            return [l[i : i + n] for i in range(0, len(l), n)]

        new_tok_ids = []
        new_lengths = []
        if self.params.mlm:
-            cls_id, sep_id = self.params.special_tok_ids['cls_token'], self.params.special_tok_ids['sep_token']
+            cls_id, sep_id = self.params.special_tok_ids["cls_token"], self.params.special_tok_ids["sep_token"]
        else:
-            cls_id, sep_id = self.params.special_tok_ids['bos_token'], self.params.special_tok_ids['eos_token']
+            cls_id, sep_id = self.params.special_tok_ids["bos_token"], self.params.special_tok_ids["eos_token"]

        for seq_, len_ in zip(self.token_ids, self.lengths):
            assert (seq_[0] == cls_id) and (seq_[-1] == sep_id), seq_
@@ -84,7 +84,7 @@ class LmSeqsDataset(Dataset):
                new_lengths.append(len_)
            else:
                sub_seqs = []
-                for sub_s in divide_chunks(seq_, max_len-2):
+                for sub_s in divide_chunks(seq_, max_len - 2):
                    if sub_s[0] != cls_id:
                        sub_s = np.insert(sub_s, 0, cls_id)
                    if sub_s[-1] != sep_id:
@@ -108,7 +108,23 @@ class LmSeqsDataset(Dataset):
        self.token_ids = self.token_ids[indices]
        self.lengths = self.lengths[indices]
        new_size = len(self)
-        logger.info(f'Remove {init_size - new_size} too short (<=11 tokens) sequences.')
+        logger.info(f"Remove {init_size - new_size} too short (<=11 tokens) sequences.")
+
+    def remove_unknown_sequences(self):
+        """
+        Remove sequences with a (too) high level of unknown tokens.
+        """
+        if "unk_token" not in self.params.special_tok_ids:
+            return
+        else:
+            unk_token_id = self.params.special_tok_ids["unk_token"]
+        init_size = len(self)
+        unk_occs = np.array([np.count_nonzero(a == unk_token_id) for a in self.token_ids])
+        indices = (unk_occs / self.lengths) < 0.5
+        self.token_ids = self.token_ids[indices]
+        self.lengths = self.lengths[indices]
+        new_size = len(self)
+        logger.info(f"Remove {init_size - new_size} sequences with a high level of unknown tokens (50%).")

    def print_statistics(self):
        """
@@ -116,7 +132,7 @@ class LmSeqsDataset(Dataset):
        """
        if not self.params.is_master:
            return
-        logger.info(f'{len(self)} sequences')
+        logger.info(f"{len(self)} sequences")
        # data_len = sum(self.lengths)
        # nb_unique_tokens = len(Counter(list(chain(*self.token_ids))))
        # logger.info(f'{data_len} tokens ({nb_unique_tokens} unique)')
@@ -125,8 +141,7 @@ class LmSeqsDataset(Dataset):
        # nb_unkown = sum([(t==unk_idx).sum() for t in self.token_ids])
        # logger.info(f'{nb_unkown} unknown tokens (covering {100*nb_unkown/data_len:.2f}% of the data)')

-    def batch_sequences(self,
-                        batch):
+    def batch_sequences(self, batch):
        """
        Do the padding and transform into torch.tensor.
        """
@@ -139,13 +154,13 @@ class LmSeqsDataset(Dataset):

        # Pad token ids
        if self.params.mlm:
-            pad_idx = self.params.special_tok_ids['pad_token']
+            pad_idx = self.params.special_tok_ids["pad_token"]
        else:
-            pad_idx = self.params.special_tok_ids['unk_token']
-        tk_ = [list(t.astype(int)) + [pad_idx]*(max_seq_len_-len(t)) for t in token_ids]
+            pad_idx = self.params.special_tok_ids["unk_token"]
+        tk_ = [list(t.astype(int)) + [pad_idx] * (max_seq_len_ - len(t)) for t in token_ids]
        assert len(tk_) == len(token_ids)
        assert all(len(t) == max_seq_len_ for t in tk_)

-        tk_t = torch.tensor(tk_)      # (bs, max_seq_len_)
+        tk_t = torch.tensor(tk_)  # (bs, max_seq_len_)
        lg_t = torch.tensor(lengths)  # (bs)
        return tk_t, lg_t
--- a/examples/distillation/requirements.txt
+++ b/examples/distillation/requirements.txt
@@ -1,6 +1,7 @@
+transformers
+
 gitpython==3.0.2
 tensorboard>=1.14.0
 tensorboardX==1.8
 psutil==5.6.3
 scipy==1.3.1
-transformers==2.0.0
--- a/examples/distillation/run_squad_w_distillation.py
+++ b/examples/distillation/run_squad_w_distillation.py
--- a/examples/distillation/scripts/binarized_data.py
+++ b/examples/distillation/scripts/binarized_data.py
@@ -16,75 +16,79 @@
 Preprocessing script before distillation.
 """
 import argparse
+import logging
 import pickle
 import random
 import time
-import numpy as np
-from transformers import BertTokenizer, RobertaTokenizer, GPT2Tokenizer
-import logging

-logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
-                    datefmt = '%m/%d/%Y %H:%M:%S',
-                    level = logging.INFO)
+import numpy as np
+
+from transformers import BertTokenizer, GPT2Tokenizer, RobertaTokenizer
+
+
+logging.basicConfig(
+    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO
+)
 logger = logging.getLogger(__name__)

+
 def main():
-    parser = argparse.ArgumentParser(description="Preprocess the data to avoid re-doing it several times by (tokenization + token_to_ids).")
-    parser.add_argument('--file_path', type=str, default='data/dump.txt',
-                        help='The path to the data.')
-    parser.add_argument('--tokenizer_type', type=str, default='bert', choices=['bert', 'roberta', 'gpt2'])
-    parser.add_argument('--tokenizer_name', type=str, default='bert-base-uncased',
-                        help="The tokenizer to use.")
-    parser.add_argument('--dump_file', type=str, default='data/dump',
-                        help='The dump file prefix.')
+    parser = argparse.ArgumentParser(
+        description="Preprocess the data to avoid re-doing it several times by (tokenization + token_to_ids)."
+    )
+    parser.add_argument("--file_path", type=str, default="data/dump.txt", help="The path to the data.")
+    parser.add_argument("--tokenizer_type", type=str, default="bert", choices=["bert", "roberta", "gpt2"])
+    parser.add_argument("--tokenizer_name", type=str, default="bert-base-uncased", help="The tokenizer to use.")
+    parser.add_argument("--dump_file", type=str, default="data/dump", help="The dump file prefix.")
    args = parser.parse_args()

-
-    logger.info(f'Loading Tokenizer ({args.tokenizer_name})')
-    if args.tokenizer_type == 'bert':
+    logger.info(f"Loading Tokenizer ({args.tokenizer_name})")
+    if args.tokenizer_type == "bert":
        tokenizer = BertTokenizer.from_pretrained(args.tokenizer_name)
-        bos = tokenizer.special_tokens_map['cls_token'] # `[CLS]`
-        sep = tokenizer.special_tokens_map['sep_token'] # `[SEP]`
-    elif args.tokenizer_type == 'roberta':
+        bos = tokenizer.special_tokens_map["cls_token"]  # `[CLS]`
+        sep = tokenizer.special_tokens_map["sep_token"]  # `[SEP]`
+    elif args.tokenizer_type == "roberta":
        tokenizer = RobertaTokenizer.from_pretrained(args.tokenizer_name)
-        bos = tokenizer.special_tokens_map['cls_token'] # `<s>`
-        sep = tokenizer.special_tokens_map['sep_token'] # `</s>`
-    elif args.tokenizer_type == 'gpt2':
+        bos = tokenizer.special_tokens_map["cls_token"]  # `<s>`
+        sep = tokenizer.special_tokens_map["sep_token"]  # `</s>`
+    elif args.tokenizer_type == "gpt2":
        tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer_name)
-        bos = tokenizer.special_tokens_map['bos_token'] # `<|endoftext|>`
-        sep = tokenizer.special_tokens_map['eos_token'] # `<|endoftext|>`    
+        bos = tokenizer.special_tokens_map["bos_token"]  # `<|endoftext|>`
+        sep = tokenizer.special_tokens_map["eos_token"]  # `<|endoftext|>`

-    logger.info(f'Loading text from {args.file_path}')
-    with open(args.file_path, 'r', encoding='utf8') as fp:
+    logger.info(f"Loading text from {args.file_path}")
+    with open(args.file_path, "r", encoding="utf8") as fp:
        data = fp.readlines()

-
-    logger.info(f'Start encoding')
-    logger.info(f'{len(data)} examples to process.')
+    logger.info(f"Start encoding")
+    logger.info(f"{len(data)} examples to process.")

    rslt = []
    iter = 0
    interval = 10000
    start = time.time()
    for text in data:
-        text = f'{bos} {text.strip()} {sep}'
-        token_ids = tokenizer.encode(text)
+        text = f"{bos} {text.strip()} {sep}"
+        token_ids = tokenizer.encode(text, add_special_tokens=False)
        rslt.append(token_ids)

        iter += 1
        if iter % interval == 0:
            end = time.time()
-            logger.info(f'{iter} examples processed. - {(end-start)/interval:.2f}s/expl')
+            logger.info(f"{iter} examples processed. - {(end-start):.2f}s/{interval}expl")
            start = time.time()
-    logger.info('Finished binarization')
-    logger.info(f'{len(data)} examples processed.')
+    logger.info("Finished binarization")
+    logger.info(f"{len(data)} examples processed.")

-
-    dp_file = f'{args.dump_file}.{args.tokenizer_name}.pickle'
-    rslt_ = [np.uint16(d) for d in rslt]
+    dp_file = f"{args.dump_file}.{args.tokenizer_name}.pickle"
+    vocab_size = tokenizer.vocab_size
+    if vocab_size < (1 << 16):
+        rslt_ = [np.uint16(d) for d in rslt]
+    else:
+        rslt_ = [np.int32(d) for d in rslt]
    random.shuffle(rslt_)
-    logger.info(f'Dump to {dp_file}')
-    with open(dp_file, 'wb') as handle:
+    logger.info(f"Dump to {dp_file}")
+    with open(dp_file, "wb") as handle:
        pickle.dump(rslt_, handle, protocol=pickle.HIGHEST_PROTOCOL)


--- a/examples/distillation/scripts/extract.py
+++ b/examples/distillation/scripts/extract.py
@@ -16,74 +16,87 @@
 Preprocessing script before training the distilled model.
 Specific to RoBERTa -> DistilRoBERTa and GPT2 -> DistilGPT2.
 """
-from transformers import BertForMaskedLM, RobertaForMaskedLM, GPT2LMHeadModel
-import torch
 import argparse

-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description="Extraction some layers of the full RobertaForMaskedLM or GPT2LMHeadModel for Transfer Learned Distillation")
+import torch
+
+from transformers import GPT2LMHeadModel, RobertaForMaskedLM
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Extraction some layers of the full RobertaForMaskedLM or GPT2LMHeadModel for Transfer Learned Distillation"
+    )
    parser.add_argument("--model_type", default="roberta", choices=["roberta", "gpt2"])
-    parser.add_argument("--model_name", default='roberta-large', type=str)
-    parser.add_argument("--dump_checkpoint", default='serialization_dir/tf_roberta_048131723.pth', type=str)
-    parser.add_argument("--vocab_transform", action='store_true')
+    parser.add_argument("--model_name", default="roberta-large", type=str)
+    parser.add_argument("--dump_checkpoint", default="serialization_dir/tf_roberta_048131723.pth", type=str)
+    parser.add_argument("--vocab_transform", action="store_true")
    args = parser.parse_args()

-
-    if args.model_type == 'roberta':
+    if args.model_type == "roberta":
        model = RobertaForMaskedLM.from_pretrained(args.model_name)
-        prefix = 'roberta'
-    elif args.model_type == 'gpt2':
+        prefix = "roberta"
+    elif args.model_type == "gpt2":
        model = GPT2LMHeadModel.from_pretrained(args.model_name)
-        prefix = 'transformer'
+        prefix = "transformer"

    state_dict = model.state_dict()
    compressed_sd = {}

-    ### Embeddings ###
-    if args.model_type == 'gpt2':
-        for param_name in ['wte.weight', 'wpe.weight']:
-            compressed_sd[f'{prefix}.{param_name}'] = state_dict[f'{prefix}.{param_name}']
+    # Embeddings #
+    if args.model_type == "gpt2":
+        for param_name in ["wte.weight", "wpe.weight"]:
+            compressed_sd[f"{prefix}.{param_name}"] = state_dict[f"{prefix}.{param_name}"]
    else:
-        for w in ['word_embeddings', 'position_embeddings', 'token_type_embeddings']:
-            param_name = f'{prefix}.embeddings.{w}.weight'
+        for w in ["word_embeddings", "position_embeddings", "token_type_embeddings"]:
+            param_name = f"{prefix}.embeddings.{w}.weight"
            compressed_sd[param_name] = state_dict[param_name]
-        for w in ['weight', 'bias']:
-            param_name = f'{prefix}.embeddings.LayerNorm.{w}'
+        for w in ["weight", "bias"]:
+            param_name = f"{prefix}.embeddings.LayerNorm.{w}"
            compressed_sd[param_name] = state_dict[param_name]

-    ### Transformer Blocks ###
+    # Transformer Blocks #
    std_idx = 0
    for teacher_idx in [0, 2, 4, 7, 9, 11]:
-        if args.model_type == 'gpt2':
-            for layer in ['ln_1', 'attn.c_attn', 'attn.c_proj', 'ln_2', 'mlp.c_fc', 'mlp.c_proj']:
-                for w in ['weight', 'bias']:
-                    compressed_sd[f'{prefix}.h.{std_idx}.{layer}.{w}'] = \
-                        state_dict[f'{prefix}.h.{teacher_idx}.{layer}.{w}']
-            compressed_sd[f'{prefix}.h.{std_idx}.attn.bias'] = state_dict[f'{prefix}.h.{teacher_idx}.attn.bias']
+        if args.model_type == "gpt2":
+            for layer in ["ln_1", "attn.c_attn", "attn.c_proj", "ln_2", "mlp.c_fc", "mlp.c_proj"]:
+                for w in ["weight", "bias"]:
+                    compressed_sd[f"{prefix}.h.{std_idx}.{layer}.{w}"] = state_dict[
+                        f"{prefix}.h.{teacher_idx}.{layer}.{w}"
+                    ]
+            compressed_sd[f"{prefix}.h.{std_idx}.attn.bias"] = state_dict[f"{prefix}.h.{teacher_idx}.attn.bias"]
        else:
-            for layer in ['attention.self.query', 'attention.self.key', 'attention.self.value',
-                        'attention.output.dense', 'attention.output.LayerNorm',
-                        'intermediate.dense', 'output.dense', 'output.LayerNorm']:
-                for w in ['weight', 'bias']:
-                    compressed_sd[f'{prefix}.encoder.layer.{std_idx}.{layer}.{w}'] = \
-                        state_dict[f'{prefix}.encoder.layer.{teacher_idx}.{layer}.{w}']
+            for layer in [
+                "attention.self.query",
+                "attention.self.key",
+                "attention.self.value",
+                "attention.output.dense",
+                "attention.output.LayerNorm",
+                "intermediate.dense",
+                "output.dense",
+                "output.LayerNorm",
+            ]:
+                for w in ["weight", "bias"]:
+                    compressed_sd[f"{prefix}.encoder.layer.{std_idx}.{layer}.{w}"] = state_dict[
+                        f"{prefix}.encoder.layer.{teacher_idx}.{layer}.{w}"
+                    ]
        std_idx += 1

-    ### Language Modeling Head ###s
-    if args.model_type == 'roberta':
-        for layer in ['lm_head.decoder.weight', 'lm_head.bias']:
-            compressed_sd[f'{layer}'] = state_dict[f'{layer}']
+    # Language Modeling Head ###s
+    if args.model_type == "roberta":
+        for layer in ["lm_head.decoder.weight", "lm_head.bias"]:
+            compressed_sd[f"{layer}"] = state_dict[f"{layer}"]
        if args.vocab_transform:
-            for w in ['weight', 'bias']:
-                compressed_sd[f'lm_head.dense.{w}'] = state_dict[f'lm_head.dense.{w}']
-                compressed_sd[f'lm_head.layer_norm.{w}'] = state_dict[f'lm_head.layer_norm.{w}']
-    elif args.model_type == 'gpt2':
-        for w in ['weight', 'bias']:
-            compressed_sd[f'{prefix}.ln_f.{w}'] = state_dict[f'{prefix}.ln_f.{w}']
-        compressed_sd[f'lm_head.weight'] = state_dict[f'lm_head.weight']
+            for w in ["weight", "bias"]:
+                compressed_sd[f"lm_head.dense.{w}"] = state_dict[f"lm_head.dense.{w}"]
+                compressed_sd[f"lm_head.layer_norm.{w}"] = state_dict[f"lm_head.layer_norm.{w}"]
+    elif args.model_type == "gpt2":
+        for w in ["weight", "bias"]:
+            compressed_sd[f"{prefix}.ln_f.{w}"] = state_dict[f"{prefix}.ln_f.{w}"]
+        compressed_sd[f"lm_head.weight"] = state_dict[f"lm_head.weight"]

-    print(f'N layers selected for distillation: {std_idx}')
-    print(f'Number of params transfered for distillation: {len(compressed_sd.keys())}')
+    print(f"N layers selected for distillation: {std_idx}")
+    print(f"Number of params transfered for distillation: {len(compressed_sd.keys())}")

-    print(f'Save transfered checkpoint to {args.dump_checkpoint}.')
+    print(f"Save transfered checkpoint to {args.dump_checkpoint}.")
    torch.save(compressed_sd, args.dump_checkpoint)
--- a/examples/distillation/scripts/extract_distilbert.py
+++ b/examples/distillation/scripts/extract_distilbert.py
@@ -16,67 +16,77 @@
 Preprocessing script before training DistilBERT.
 Specific to BERT -> DistilBERT.
 """
-from transformers import BertForMaskedLM, RobertaForMaskedLM
-import torch
 import argparse

-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description="Extraction some layers of the full BertForMaskedLM or RObertaForMaskedLM for Transfer Learned Distillation")
+import torch
+
+from transformers import BertForMaskedLM
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Extraction some layers of the full BertForMaskedLM or RObertaForMaskedLM for Transfer Learned Distillation"
+    )
    parser.add_argument("--model_type", default="bert", choices=["bert"])
-    parser.add_argument("--model_name", default='bert-base-uncased', type=str)
-    parser.add_argument("--dump_checkpoint", default='serialization_dir/tf_bert-base-uncased_0247911.pth', type=str)
-    parser.add_argument("--vocab_transform", action='store_true')
+    parser.add_argument("--model_name", default="bert-base-uncased", type=str)
+    parser.add_argument("--dump_checkpoint", default="serialization_dir/tf_bert-base-uncased_0247911.pth", type=str)
+    parser.add_argument("--vocab_transform", action="store_true")
    args = parser.parse_args()

-
-    if args.model_type == 'bert':
+    if args.model_type == "bert":
        model = BertForMaskedLM.from_pretrained(args.model_name)
-        prefix = 'bert'
+        prefix = "bert"
    else:
        raise ValueError(f'args.model_type should be "bert".')

    state_dict = model.state_dict()
    compressed_sd = {}

-    for w in ['word_embeddings', 'position_embeddings']:
-        compressed_sd[f'distilbert.embeddings.{w}.weight'] = \
-            state_dict[f'{prefix}.embeddings.{w}.weight']
-    for w in ['weight', 'bias']:
-        compressed_sd[f'distilbert.embeddings.LayerNorm.{w}'] = \
-            state_dict[f'{prefix}.embeddings.LayerNorm.{w}']
+    for w in ["word_embeddings", "position_embeddings"]:
+        compressed_sd[f"distilbert.embeddings.{w}.weight"] = state_dict[f"{prefix}.embeddings.{w}.weight"]
+    for w in ["weight", "bias"]:
+        compressed_sd[f"distilbert.embeddings.LayerNorm.{w}"] = state_dict[f"{prefix}.embeddings.LayerNorm.{w}"]

    std_idx = 0
    for teacher_idx in [0, 2, 4, 7, 9, 11]:
-        for w in ['weight', 'bias']:
-            compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.q_lin.{w}'] = \
-                state_dict[f'{prefix}.encoder.layer.{teacher_idx}.attention.self.query.{w}']
-            compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.k_lin.{w}'] = \
-                state_dict[f'{prefix}.encoder.layer.{teacher_idx}.attention.self.key.{w}']
-            compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.v_lin.{w}'] = \
-                state_dict[f'{prefix}.encoder.layer.{teacher_idx}.attention.self.value.{w}']
+        for w in ["weight", "bias"]:
+            compressed_sd[f"distilbert.transformer.layer.{std_idx}.attention.q_lin.{w}"] = state_dict[
+                f"{prefix}.encoder.layer.{teacher_idx}.attention.self.query.{w}"
+            ]
+            compressed_sd[f"distilbert.transformer.layer.{std_idx}.attention.k_lin.{w}"] = state_dict[
+                f"{prefix}.encoder.layer.{teacher_idx}.attention.self.key.{w}"
+            ]
+            compressed_sd[f"distilbert.transformer.layer.{std_idx}.attention.v_lin.{w}"] = state_dict[
+                f"{prefix}.encoder.layer.{teacher_idx}.attention.self.value.{w}"
+            ]

-            compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.out_lin.{w}'] = \
-                state_dict[f'{prefix}.encoder.layer.{teacher_idx}.attention.output.dense.{w}']
-            compressed_sd[f'distilbert.transformer.layer.{std_idx}.sa_layer_norm.{w}'] = \
-                state_dict[f'{prefix}.encoder.layer.{teacher_idx}.attention.output.LayerNorm.{w}']
+            compressed_sd[f"distilbert.transformer.layer.{std_idx}.attention.out_lin.{w}"] = state_dict[
+                f"{prefix}.encoder.layer.{teacher_idx}.attention.output.dense.{w}"
+            ]
+            compressed_sd[f"distilbert.transformer.layer.{std_idx}.sa_layer_norm.{w}"] = state_dict[
+                f"{prefix}.encoder.layer.{teacher_idx}.attention.output.LayerNorm.{w}"
+            ]

-            compressed_sd[f'distilbert.transformer.layer.{std_idx}.ffn.lin1.{w}'] = \
-                state_dict[f'{prefix}.encoder.layer.{teacher_idx}.intermediate.dense.{w}']
-            compressed_sd[f'distilbert.transformer.layer.{std_idx}.ffn.lin2.{w}'] = \
-                state_dict[f'{prefix}.encoder.layer.{teacher_idx}.output.dense.{w}']
-            compressed_sd[f'distilbert.transformer.layer.{std_idx}.output_layer_norm.{w}'] = \
-                state_dict[f'{prefix}.encoder.layer.{teacher_idx}.output.LayerNorm.{w}']
+            compressed_sd[f"distilbert.transformer.layer.{std_idx}.ffn.lin1.{w}"] = state_dict[
+                f"{prefix}.encoder.layer.{teacher_idx}.intermediate.dense.{w}"
+            ]
+            compressed_sd[f"distilbert.transformer.layer.{std_idx}.ffn.lin2.{w}"] = state_dict[
+                f"{prefix}.encoder.layer.{teacher_idx}.output.dense.{w}"
+            ]
+            compressed_sd[f"distilbert.transformer.layer.{std_idx}.output_layer_norm.{w}"] = state_dict[
+                f"{prefix}.encoder.layer.{teacher_idx}.output.LayerNorm.{w}"
+            ]
        std_idx += 1

-    compressed_sd[f'vocab_projector.weight'] = state_dict[f'cls.predictions.decoder.weight']
-    compressed_sd[f'vocab_projector.bias'] = state_dict[f'cls.predictions.bias']
+    compressed_sd[f"vocab_projector.weight"] = state_dict[f"cls.predictions.decoder.weight"]
+    compressed_sd[f"vocab_projector.bias"] = state_dict[f"cls.predictions.bias"]
    if args.vocab_transform:
-        for w in ['weight', 'bias']:
-            compressed_sd[f'vocab_transform.{w}'] = state_dict[f'cls.predictions.transform.dense.{w}']
-            compressed_sd[f'vocab_layer_norm.{w}'] = state_dict[f'cls.predictions.transform.LayerNorm.{w}']
+        for w in ["weight", "bias"]:
+            compressed_sd[f"vocab_transform.{w}"] = state_dict[f"cls.predictions.transform.dense.{w}"]
+            compressed_sd[f"vocab_layer_norm.{w}"] = state_dict[f"cls.predictions.transform.LayerNorm.{w}"]

-    print(f'N layers selected for distillation: {std_idx}')
-    print(f'Number of params transfered for distillation: {len(compressed_sd.keys())}')
+    print(f"N layers selected for distillation: {std_idx}")
+    print(f"Number of params transfered for distillation: {len(compressed_sd.keys())}")

-    print(f'Save transfered checkpoint to {args.dump_checkpoint}.')
+    print(f"Save transfered checkpoint to {args.dump_checkpoint}.")
    torch.save(compressed_sd, args.dump_checkpoint)
--- a/examples/distillation/scripts/token_counts.py
+++ b/examples/distillation/scripts/token_counts.py
@@ -15,37 +15,42 @@
 """
 Preprocessing script before training the distilled model.
 """
-from collections import Counter
 import argparse
-import pickle
 import logging
+import pickle
+from collections import Counter

-logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
-                    datefmt = '%m/%d/%Y %H:%M:%S',
-                    level = logging.INFO)
+
+logging.basicConfig(
+    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO
+)
 logger = logging.getLogger(__name__)

-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description="Token Counts for smoothing the masking probabilities in MLM (cf XLM/word2vec)")
-    parser.add_argument("--data_file", type=str, default="data/dump.bert-base-uncased.pickle",
-                        help="The binarized dataset.")
-    parser.add_argument("--token_counts_dump", type=str, default="data/token_counts.bert-base-uncased.pickle",
-                        help="The dump file.")
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Token Counts for smoothing the masking probabilities in MLM (cf XLM/word2vec)"
+    )
+    parser.add_argument(
+        "--data_file", type=str, default="data/dump.bert-base-uncased.pickle", help="The binarized dataset."
+    )
+    parser.add_argument(
+        "--token_counts_dump", type=str, default="data/token_counts.bert-base-uncased.pickle", help="The dump file."
+    )
    parser.add_argument("--vocab_size", default=30522, type=int)
    args = parser.parse_args()

-    logger.info(f'Loading data from {args.data_file}')
-    with open(args.data_file, 'rb') as fp:
+    logger.info(f"Loading data from {args.data_file}")
+    with open(args.data_file, "rb") as fp:
        data = pickle.load(fp)

-    logger.info('Counting occurences for MLM.')
+    logger.info("Counting occurences for MLM.")
    counter = Counter()
    for tk_ids in data:
        counter.update(tk_ids)
-    counts = [0]*args.vocab_size
+    counts = [0] * args.vocab_size
    for k, v in counter.items():
        counts[k] = v

-    logger.info(f'Dump to {args.token_counts_dump}')
-    with open(args.token_counts_dump, 'wb') as handle:
+    logger.info(f"Dump to {args.token_counts_dump}")
+    with open(args.token_counts_dump, "wb") as handle:
        pickle.dump(counts, handle, protocol=pickle.HIGHEST_PROTOCOL)
--- a/examples/distillation/train.py
+++ b/examples/distillation/train.py
@@ -16,272 +16,304 @@
 Training the distilled model.
 Supported architectures include: BERT -> DistilBERT, RoBERTa -> DistilRoBERTa, GPT2 -> DistilGPT2.
 """
-import os
 import argparse
-import pickle
 import json
+import os
+import pickle
 import shutil
+
 import numpy as np
 import torch

-from transformers import BertConfig, BertForMaskedLM, BertTokenizer
-from transformers import RobertaConfig, RobertaForMaskedLM, RobertaTokenizer
-from transformers import DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer
-from transformers import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer
-
 from distiller import Distiller
-from utils import git_log, logger, init_gpu_params, set_seed
 from lm_seqs_dataset import LmSeqsDataset
+from transformers import (
+    BertConfig,
+    BertForMaskedLM,
+    BertTokenizer,
+    DistilBertConfig,
+    DistilBertForMaskedLM,
+    DistilBertTokenizer,
+    GPT2Config,
+    GPT2LMHeadModel,
+    GPT2Tokenizer,
+    RobertaConfig,
+    RobertaForMaskedLM,
+    RobertaTokenizer,
+)
+from utils import git_log, init_gpu_params, logger, set_seed


 MODEL_CLASSES = {
-    'distilbert': (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer),
-    'roberta': (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer),
-    'bert': (BertConfig, BertForMaskedLM, BertTokenizer),
-    'gpt2': (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer)
+    "distilbert": (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer),
+    "roberta": (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer),
+    "bert": (BertConfig, BertForMaskedLM, BertTokenizer),
+    "gpt2": (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer),
 }

+
 def sanity_checks(args):
    """
    A bunch of args sanity checks to perform even starting...
    """
-    assert (args.mlm and args.alpha_mlm > 0.) or (not args.mlm and args.alpha_mlm == 0.)
-    assert (args.alpha_mlm > 0. and args.alpha_clm == 0.) or (args.alpha_mlm == 0. and args.alpha_clm > 0.)
+    assert (args.mlm and args.alpha_mlm > 0.0) or (not args.mlm and args.alpha_mlm == 0.0)
+    assert (args.alpha_mlm > 0.0 and args.alpha_clm == 0.0) or (args.alpha_mlm == 0.0 and args.alpha_clm > 0.0)
    if args.mlm:
        assert os.path.isfile(args.token_counts)
-        assert (args.student_type in ['roberta', 'distilbert']) and (args.teacher_type in ['roberta', 'bert'])
+        assert (args.student_type in ["roberta", "distilbert"]) and (args.teacher_type in ["roberta", "bert"])
    else:
-        assert (args.student_type in ['gpt2']) and (args.teacher_type in ['gpt2'])
+        assert (args.student_type in ["gpt2"]) and (args.teacher_type in ["gpt2"])

-    assert args.teacher_type == args.student_type or (args.student_type=='distilbert' and args.teacher_type=='bert')
+    assert args.teacher_type == args.student_type or (
+        args.student_type == "distilbert" and args.teacher_type == "bert"
+    )
    assert os.path.isfile(args.student_config)
    if args.student_pretrained_weights is not None:
        assert os.path.isfile(args.student_pretrained_weights)

-    if args.freeze_token_type_embds: assert args.student_type in ['roberta']
+    if args.freeze_token_type_embds:
+        assert args.student_type in ["roberta"]
+
+    assert args.alpha_ce >= 0.0
+    assert args.alpha_mlm >= 0.0
+    assert args.alpha_clm >= 0.0
+    assert args.alpha_mse >= 0.0
+    assert args.alpha_cos >= 0.0
+    assert args.alpha_ce + args.alpha_mlm + args.alpha_clm + args.alpha_mse + args.alpha_cos > 0.0

-    assert args.alpha_ce >= 0.
-    assert args.alpha_mlm >= 0.
-    assert args.alpha_clm >= 0.
-    assert args.alpha_mse >= 0.
-    assert args.alpha_cos >= 0.
-    assert args.alpha_ce + args.alpha_mlm + args.alpha_clm + args.alpha_mse + args.alpha_cos > 0.

 def freeze_pos_embeddings(student, args):
-    if args.student_type == 'roberta':
+    if args.student_type == "roberta":
        student.roberta.embeddings.position_embeddings.weight.requires_grad = False
-    elif args.student_type == 'gpt2':
+    elif args.student_type == "gpt2":
        student.transformer.wpe.weight.requires_grad = False

+
 def freeze_token_type_embeddings(student, args):
-    if args.student_type == 'roberta':
+    if args.student_type == "roberta":
        student.roberta.embeddings.token_type_embeddings.weight.requires_grad = False

+
 def main():
    parser = argparse.ArgumentParser(description="Training")
-    parser.add_argument("--force", action='store_true',
-                        help="Overwrite dump_path if it already exists.")
+    parser.add_argument("--force", action="store_true", help="Overwrite dump_path if it already exists.")

-    parser.add_argument("--dump_path", type=str, required=True,
-                        help="The output directory (log, checkpoints, parameters, etc.)")
-    parser.add_argument("--data_file", type=str, required=True,
-                        help="The binarized file (tokenized + tokens_to_ids) and grouped by sequence.")
+    parser.add_argument(
+        "--dump_path", type=str, required=True, help="The output directory (log, checkpoints, parameters, etc.)"
+    )
+    parser.add_argument(
+        "--data_file",
+        type=str,
+        required=True,
+        help="The binarized file (tokenized + tokens_to_ids) and grouped by sequence.",
+    )

-    parser.add_argument("--student_type", type=str, choices=["distilbert", "roberta", "gpt2"], required=True,
-                        help="The student type (DistilBERT, RoBERTa).")
-    parser.add_argument("--student_config", type=str, required=True,
-                        help="Path to the student configuration.")
-    parser.add_argument("--student_pretrained_weights", default=None, type=str,
-                        help="Load student initialization checkpoint.")
+    parser.add_argument(
+        "--student_type",
+        type=str,
+        choices=["distilbert", "roberta", "gpt2"],
+        required=True,
+        help="The student type (DistilBERT, RoBERTa).",
+    )
+    parser.add_argument("--student_config", type=str, required=True, help="Path to the student configuration.")
+    parser.add_argument(
+        "--student_pretrained_weights", default=None, type=str, help="Load student initialization checkpoint."
+    )

-    parser.add_argument("--teacher_type", choices=["bert", "roberta", "gpt2"], required=True,
-                        help="Teacher type (BERT, RoBERTa).")
-    parser.add_argument("--teacher_name", type=str, required=True,
-                        help="The teacher model.")
+    parser.add_argument(
+        "--teacher_type", choices=["bert", "roberta", "gpt2"], required=True, help="Teacher type (BERT, RoBERTa)."
+    )
+    parser.add_argument("--teacher_name", type=str, required=True, help="The teacher model.")

-    parser.add_argument("--temperature", default=2., type=float,
-                        help="Temperature for the softmax temperature.")
-    parser.add_argument("--alpha_ce", default=0.5, type=float,
-                        help="Linear weight for the distillation loss. Must be >=0.")
-    parser.add_argument("--alpha_mlm", default=0.0, type=float,
-                        help="Linear weight for the MLM loss. Must be >=0. Should be used in coonjunction with `mlm` flag.")
-    parser.add_argument("--alpha_clm", default=0.5, type=float,
-                        help="Linear weight for the CLM loss. Must be >=0.")
-    parser.add_argument("--alpha_mse", default=0.0, type=float,
-                        help="Linear weight of the MSE loss. Must be >=0.")
-    parser.add_argument("--alpha_cos", default=0.0, type=float,
-                        help="Linear weight of the cosine embedding loss. Must be >=0.")
+    parser.add_argument("--temperature", default=2.0, type=float, help="Temperature for the softmax temperature.")
+    parser.add_argument(
+        "--alpha_ce", default=0.5, type=float, help="Linear weight for the distillation loss. Must be >=0."
+    )
+    parser.add_argument(
+        "--alpha_mlm",
+        default=0.0,
+        type=float,
+        help="Linear weight for the MLM loss. Must be >=0. Should be used in coonjunction with `mlm` flag.",
+    )
+    parser.add_argument("--alpha_clm", default=0.5, type=float, help="Linear weight for the CLM loss. Must be >=0.")
+    parser.add_argument("--alpha_mse", default=0.0, type=float, help="Linear weight of the MSE loss. Must be >=0.")
+    parser.add_argument(
+        "--alpha_cos", default=0.0, type=float, help="Linear weight of the cosine embedding loss. Must be >=0."
+    )

-    parser.add_argument("--mlm", action="store_true",
-                        help="The LM step: MLM or CLM. If `mlm` is True, the MLM is used over CLM.")
-    parser.add_argument("--mlm_mask_prop", default=0.15, type=float,
-                        help="Proportion of tokens for which we need to make a prediction.")
-    parser.add_argument("--word_mask", default=0.8, type=float,
-                        help="Proportion of tokens to mask out.")
-    parser.add_argument("--word_keep", default=0.1, type=float,
-                        help="Proportion of tokens to keep.")
-    parser.add_argument("--word_rand", default=0.1, type=float,
-                        help="Proportion of tokens to randomly replace.")
-    parser.add_argument("--mlm_smoothing", default=0.7, type=float,
-                        help="Smoothing parameter to emphasize more rare tokens (see XLM, similar to word2vec).")
-    parser.add_argument("--token_counts", type=str,
-                        help="The token counts in the data_file for MLM.")
+    parser.add_argument(
+        "--mlm", action="store_true", help="The LM step: MLM or CLM. If `mlm` is True, the MLM is used over CLM."
+    )
+    parser.add_argument(
+        "--mlm_mask_prop",
+        default=0.15,
+        type=float,
+        help="Proportion of tokens for which we need to make a prediction.",
+    )
+    parser.add_argument("--word_mask", default=0.8, type=float, help="Proportion of tokens to mask out.")
+    parser.add_argument("--word_keep", default=0.1, type=float, help="Proportion of tokens to keep.")
+    parser.add_argument("--word_rand", default=0.1, type=float, help="Proportion of tokens to randomly replace.")
+    parser.add_argument(
+        "--mlm_smoothing",
+        default=0.7,
+        type=float,
+        help="Smoothing parameter to emphasize more rare tokens (see XLM, similar to word2vec).",
+    )
+    parser.add_argument("--token_counts", type=str, help="The token counts in the data_file for MLM.")

-    parser.add_argument("--restrict_ce_to_mask", action='store_true',
-                        help="If true, compute the distilation loss only the [MLM] prediction distribution.")
-    parser.add_argument("--freeze_pos_embs", action="store_true",
-                        help="Freeze positional embeddings during distillation. For student_type in ['roberta', 'gpt2'] only.")
-    parser.add_argument("--freeze_token_type_embds", action="store_true",
-                        help="Freeze token type embeddings during distillation if existent. For student_type in ['roberta'] only.")
+    parser.add_argument(
+        "--restrict_ce_to_mask",
+        action="store_true",
+        help="If true, compute the distilation loss only the [MLM] prediction distribution.",
+    )
+    parser.add_argument(
+        "--freeze_pos_embs",
+        action="store_true",
+        help="Freeze positional embeddings during distillation. For student_type in ['roberta', 'gpt2'] only.",
+    )
+    parser.add_argument(
+        "--freeze_token_type_embds",
+        action="store_true",
+        help="Freeze token type embeddings during distillation if existent. For student_type in ['roberta'] only.",
+    )

-    parser.add_argument("--n_epoch", type=int, default=3,
-                        help="Number of pass on the whole dataset.")
-    parser.add_argument("--batch_size", type=int, default=5,
-                        help="Batch size (for each process).")
-    parser.add_argument("--group_by_size", action='store_false',
-                        help="If true, group sequences that have similar length into the same batch. Default is true.")
+    parser.add_argument("--n_epoch", type=int, default=3, help="Number of pass on the whole dataset.")
+    parser.add_argument("--batch_size", type=int, default=5, help="Batch size (for each process).")
+    parser.add_argument(
+        "--group_by_size",
+        action="store_false",
+        help="If true, group sequences that have similar length into the same batch. Default is true.",
+    )

-    parser.add_argument("--gradient_accumulation_steps", type=int, default=50,
-                        help="Gradient accumulation for larger training batches.")
-    parser.add_argument("--warmup_prop", default=0.05, type=float,
-                        help="Linear warmup proportion.")
-    parser.add_argument("--weight_decay", default=0.0, type=float,
-                        help="Weight deay if we apply some.")
-    parser.add_argument("--learning_rate", default=5e-4, type=float,
-                        help="The initial learning rate for Adam.")
-    parser.add_argument("--adam_epsilon", default=1e-6, type=float,
-                        help="Epsilon for Adam optimizer.")
-    parser.add_argument("--max_grad_norm", default=5.0, type=float,
-                        help="Max gradient norm.")
-    parser.add_argument("--initializer_range", default=0.02, type=float,
-                        help="Random initialization range.")
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=50,
+        help="Gradient accumulation for larger training batches.",
+    )
+    parser.add_argument("--warmup_prop", default=0.05, type=float, help="Linear warmup proportion.")
+    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.")
+    parser.add_argument("--learning_rate", default=5e-4, type=float, help="The initial learning rate for Adam.")
+    parser.add_argument("--adam_epsilon", default=1e-6, type=float, help="Epsilon for Adam optimizer.")
+    parser.add_argument("--max_grad_norm", default=5.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--initializer_range", default=0.02, type=float, help="Random initialization range.")

-    parser.add_argument('--fp16', action='store_true',
-                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
-    parser.add_argument('--fp16_opt_level', type=str, default='O1',
-                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
-                             "See details at https://nvidia.github.io/apex/amp.html")
-    parser.add_argument("--n_gpu", type=int, default=1,
-                        help="Number of GPUs in the node.")
-    parser.add_argument("--local_rank", type=int, default=-1,
-                        help="Distributed training - Local rank")
-    parser.add_argument("--seed", type=int, default=56,
-                        help="Random seed")
+    parser.add_argument(
+        "--fp16",
+        action="store_true",
+        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
+    )
+    parser.add_argument(
+        "--fp16_opt_level",
+        type=str,
+        default="O1",
+        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
+        "See details at https://nvidia.github.io/apex/amp.html",
+    )
+    parser.add_argument("--n_gpu", type=int, default=1, help="Number of GPUs in the node.")
+    parser.add_argument("--local_rank", type=int, default=-1, help="Distributed training - Local rank")
+    parser.add_argument("--seed", type=int, default=56, help="Random seed")

-    parser.add_argument("--log_interval", type=int, default=500,
-                        help="Tensorboard logging interval.")
-    parser.add_argument("--checkpoint_interval", type=int, default=4000,
-                        help="Checkpoint interval.")
+    parser.add_argument("--log_interval", type=int, default=500, help="Tensorboard logging interval.")
+    parser.add_argument("--checkpoint_interval", type=int, default=4000, help="Checkpoint interval.")
    args = parser.parse_args()
    sanity_checks(args)

-
-    ## ARGS ##
+    # ARGS #
    init_gpu_params(args)
    set_seed(args)
    if args.is_master:
        if os.path.exists(args.dump_path):
            if not args.force:
-                raise ValueError(f'Serialization dir {args.dump_path} already exists, but you have not precised wheter to overwrite it'
-                                   'Use `--force` if you want to overwrite it')
+                raise ValueError(
+                    f"Serialization dir {args.dump_path} already exists, but you have not precised wheter to overwrite it"
+                    "Use `--force` if you want to overwrite it"
+                )
            else:
                shutil.rmtree(args.dump_path)

        if not os.path.exists(args.dump_path):
            os.makedirs(args.dump_path)
-        logger.info(f'Experiment will be dumped and logged in {args.dump_path}')
+        logger.info(f"Experiment will be dumped and logged in {args.dump_path}")

-
-        ### SAVE PARAMS ###
-        logger.info(f'Param: {args}')
-        with open(os.path.join(args.dump_path, 'parameters.json'), 'w') as f:
+        # SAVE PARAMS #
+        logger.info(f"Param: {args}")
+        with open(os.path.join(args.dump_path, "parameters.json"), "w") as f:
            json.dump(vars(args), f, indent=4)
        git_log(args.dump_path)

    student_config_class, student_model_class, _ = MODEL_CLASSES[args.student_type]
    teacher_config_class, teacher_model_class, teacher_tokenizer_class = MODEL_CLASSES[args.teacher_type]

-    ### TOKENIZER ###
+    # TOKENIZER #
    tokenizer = teacher_tokenizer_class.from_pretrained(args.teacher_name)
    special_tok_ids = {}
    for tok_name, tok_symbol in tokenizer.special_tokens_map.items():
        idx = tokenizer.all_special_tokens.index(tok_symbol)
        special_tok_ids[tok_name] = tokenizer.all_special_ids[idx]
-    logger.info(f'Special tokens {special_tok_ids}')
+    logger.info(f"Special tokens {special_tok_ids}")
    args.special_tok_ids = special_tok_ids
    args.max_model_input_size = tokenizer.max_model_input_sizes[args.teacher_name]

-
-    ## DATA LOADER ##
-    logger.info(f'Loading data from {args.data_file}')
-    with open(args.data_file, 'rb') as fp:
+    # DATA LOADER #
+    logger.info(f"Loading data from {args.data_file}")
+    with open(args.data_file, "rb") as fp:
        data = pickle.load(fp)

-
    if args.mlm:
-        logger.info(f'Loading token counts from {args.token_counts} (already pre-computed)')
-        with open(args.token_counts, 'rb') as fp:
+        logger.info(f"Loading token counts from {args.token_counts} (already pre-computed)")
+        with open(args.token_counts, "rb") as fp:
            counts = pickle.load(fp)
-        
+
        token_probs = np.maximum(counts, 1) ** -args.mlm_smoothing
        for idx in special_tok_ids.values():
-            token_probs[idx] = 0.  # do not predict special tokens
+            token_probs[idx] = 0.0  # do not predict special tokens
        token_probs = torch.from_numpy(token_probs)
    else:
        token_probs = None

-
    train_lm_seq_dataset = LmSeqsDataset(params=args, data=data)
-    logger.info(f'Data loader created.')
+    logger.info(f"Data loader created.")

-
-    ## STUDENT ##
-    logger.info(f'Loading student config from {args.student_config}')
+    # STUDENT #
+    logger.info(f"Loading student config from {args.student_config}")
    stu_architecture_config = student_config_class.from_pretrained(args.student_config)
    stu_architecture_config.output_hidden_states = True

    if args.student_pretrained_weights is not None:
-        logger.info(f'Loading pretrained weights from {args.student_pretrained_weights}')
-        student = student_model_class.from_pretrained(args.student_pretrained_weights,
-                                                      config=stu_architecture_config)
+        logger.info(f"Loading pretrained weights from {args.student_pretrained_weights}")
+        student = student_model_class.from_pretrained(args.student_pretrained_weights, config=stu_architecture_config)
    else:
        student = student_model_class(stu_architecture_config)

-
    if args.n_gpu > 0:
-        student.to(f'cuda:{args.local_rank}')
-    logger.info(f'Student loaded.')
+        student.to(f"cuda:{args.local_rank}")
+    logger.info(f"Student loaded.")

-
-    ## TEACHER ##
+    # TEACHER #
    teacher = teacher_model_class.from_pretrained(args.teacher_name, output_hidden_states=True)
    if args.n_gpu > 0:
-        teacher.to(f'cuda:{args.local_rank}')
-    logger.info(f'Teacher loaded from {args.teacher_name}.')
+        teacher.to(f"cuda:{args.local_rank}")
+    logger.info(f"Teacher loaded from {args.teacher_name}.")

-
-    ## FREEZING ##
+    # FREEZING #
    if args.freeze_pos_embs:
        freeze_pos_embeddings(student, args)
    if args.freeze_token_type_embds:
        freeze_token_type_embeddings(student, args)

-
-    ## SANITY CHECKS ##
+    # SANITY CHECKS #
    assert student.config.vocab_size == teacher.config.vocab_size
    assert student.config.hidden_size == teacher.config.hidden_size
    assert student.config.max_position_embeddings == teacher.config.max_position_embeddings
    if args.mlm:
        assert token_probs.size(0) == stu_architecture_config.vocab_size

-
-    ## DISTILLER ##
+    # DISTILLER #
    torch.cuda.empty_cache()
-    distiller = Distiller(params=args,
-                          dataset=train_lm_seq_dataset,
-                          token_probs=token_probs,
-                          student=student,
-                          teacher=teacher)
+    distiller = Distiller(
+        params=args, dataset=train_lm_seq_dataset, token_probs=token_probs, student=student, teacher=teacher
+    )
    distiller.train()
    logger.info("Let's go get some drinks.")

--- a/examples/distillation/training_configs/distilbert-base-cased.json
+++ b/examples/distillation/training_configs/distilbert-base-cased.json
@@ -0,0 +1,15 @@
+{
+	"activation": "gelu",
+	"attention_dropout": 0.1,
+	"dim": 768,
+	"dropout": 0.1,
+	"hidden_dim": 3072,
+	"initializer_range": 0.02,
+	"max_position_embeddings": 512,
+	"n_heads": 12,
+	"n_layers": 6,
+	"sinusoidal_pos_embds": true,
+	"tie_weights_": true,
+	"vocab_size": 28996
+  }
+  
--- a/examples/distillation/training_configs/distilbert-base-multilingual-cased.json
+++ b/examples/distillation/training_configs/distilbert-base-multilingual-cased.json
@@ -0,0 +1,15 @@
+{
+	"activation": "gelu",
+	"attention_dropout": 0.1,
+	"dim": 768,
+	"dropout": 0.1,
+	"hidden_dim": 3072,
+	"initializer_range": 0.02,
+	"max_position_embeddings": 512,
+	"n_heads": 12,
+	"n_layers": 6,
+	"sinusoidal_pos_embds": true,
+	"tie_weights_": true,
+	"vocab_size": 119547
+  }
+  
--- a/examples/distillation/training_configs/distilroberta-base.json
+++ b/examples/distillation/training_configs/distilroberta-base.json
@@ -0,0 +1,14 @@
+{
+    "vocab_size": 50265,
+    "hidden_size": 768,
+    "num_hidden_layers": 6,
+    "num_attention_heads": 12,
+    "intermediate_size": 3072,
+    "hidden_act": "gelu",
+    "hidden_dropout_prob": 0.1,
+    "attention_probs_dropout_prob": 0.1,
+    "max_position_embeddings": 514,
+    "type_vocab_size": 1,
+    "initializer_range": 0.02,
+    "layer_norm_eps": 0.00001
+}
--- a/examples/distillation/utils.py
+++ b/examples/distillation/utils.py
@@ -15,17 +15,21 @@
 """ Utils to train DistilBERT
    adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM)
 """
-import git
 import json
+import logging
 import os
 import socket
-import torch
-import numpy as np

-import logging
-logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - PID: %(process)d -  %(message)s',
-                    datefmt = '%m/%d/%Y %H:%M:%S',
-                    level = logging.INFO)
+import git
+import numpy as np
+import torch
+
+
+logging.basicConfig(
+    format="%(asctime)s - %(levelname)s - %(name)s - PID: %(process)d -  %(message)s",
+    datefmt="%m/%d/%Y %H:%M:%S",
+    level=logging.INFO,
+)
 logger = logging.getLogger(__name__)


@@ -35,12 +39,12 @@ def git_log(folder_path: str):
    """
    repo = git.Repo(search_parent_directories=True)
    repo_infos = {
-        'repo_id': str(repo),
-        'repo_sha': str(repo.head.object.hexsha),
-        'repo_branch': str(repo.active_branch)
+        "repo_id": str(repo),
+        "repo_sha": str(repo.head.object.hexsha),
+        "repo_branch": str(repo.active_branch),
    }

-    with open(os.path.join(folder_path, 'git_log.json'), 'w') as f:
+    with open(os.path.join(folder_path, "git_log.json"), "w") as f:
        json.dump(repo_infos, f, indent=4)


@@ -57,21 +61,21 @@ def init_gpu_params(params):

    assert torch.cuda.is_available()

-    logger.info('Initializing GPUs')
+    logger.info("Initializing GPUs")
    if params.n_gpu > 1:
        assert params.local_rank != -1

-        params.world_size = int(os.environ['WORLD_SIZE'])
-        params.n_gpu_per_node = int(os.environ['N_GPU_NODE'])
-        params.global_rank = int(os.environ['RANK'])
+        params.world_size = int(os.environ["WORLD_SIZE"])
+        params.n_gpu_per_node = int(os.environ["N_GPU_NODE"])
+        params.global_rank = int(os.environ["RANK"])

        # number of nodes / node ID
        params.n_nodes = params.world_size // params.n_gpu_per_node
        params.node_id = params.global_rank // params.n_gpu_per_node
        params.multi_gpu = True

-        assert params.n_nodes == int(os.environ['N_NODES'])
-        assert params.node_id == int(os.environ['NODE_RANK'])
+        assert params.n_nodes == int(os.environ["N_NODES"])
+        assert params.node_id == int(os.environ["NODE_RANK"])

    # local job (single GPU)
    else:
@@ -114,8 +118,7 @@ def init_gpu_params(params):
    if params.multi_gpu:
        logger.info("Initializing PyTorch distributed")
        torch.distributed.init_process_group(
-            init_method='env://',
-            backend='nccl',
+            init_method="env://", backend="nccl",
        )


--- a/examples/hans/hans_processors.py
+++ b/examples/hans/hans_processors.py
@@ -0,0 +1,221 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" GLUE processors and helpers """
+
+import logging
+import os
+
+from transformers.file_utils import is_tf_available
+from utils_hans import DataProcessor, InputExample, InputFeatures
+
+
+if is_tf_available():
+    import tensorflow as tf
+
+logger = logging.getLogger(__name__)
+
+
+def hans_convert_examples_to_features(
+    examples,
+    tokenizer,
+    max_length=512,
+    task=None,
+    label_list=None,
+    output_mode=None,
+    pad_on_left=False,
+    pad_token=0,
+    pad_token_segment_id=0,
+    mask_padding_with_zero=True,
+):
+    """
+    Loads a data file into a list of ``InputFeatures``
+
+    Args:
+        examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples.
+        tokenizer: Instance of a tokenizer that will tokenize the examples
+        max_length: Maximum example length
+        task: HANS
+        label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method
+        output_mode: String indicating the output mode. Either ``regression`` or ``classification``
+        pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default)
+        pad_token: Padding token
+        pad_token_segment_id: The segment ID for the padding token (It is usually 0, but can vary such as for XLNet where it is 4)
+        mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values
+            and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for
+            actual values)
+
+    Returns:
+        If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset``
+        containing the task-specific features. If the input is a list of ``InputExamples``, will return
+        a list of task-specific ``InputFeatures`` which can be fed to the model.
+
+    """
+    is_tf_dataset = False
+    if is_tf_available() and isinstance(examples, tf.data.Dataset):
+        is_tf_dataset = True
+
+    if task is not None:
+        processor = glue_processors[task]()
+        if label_list is None:
+            label_list = processor.get_labels()
+            logger.info("Using label list %s for task %s" % (label_list, task))
+        if output_mode is None:
+            output_mode = glue_output_modes[task]
+            logger.info("Using output mode %s for task %s" % (output_mode, task))
+
+    label_map = {label: i for i, label in enumerate(label_list)}
+
+    features = []
+    for (ex_index, example) in enumerate(examples):
+        if ex_index % 10000 == 0:
+            logger.info("Writing example %d" % (ex_index))
+        if is_tf_dataset:
+            example = processor.get_example_from_tensor_dict(example)
+            example = processor.tfds_map(example)
+
+        inputs = tokenizer.encode_plus(example.text_a, example.text_b, add_special_tokens=True, max_length=max_length,)
+        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
+
+        # The mask has 1 for real tokens and 0 for padding tokens. Only real
+        # tokens are attended to.
+        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
+
+        # Zero-pad up to the sequence length.
+        padding_length = max_length - len(input_ids)
+        if pad_on_left:
+            input_ids = ([pad_token] * padding_length) + input_ids
+            attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
+            token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
+        else:
+            input_ids = input_ids + ([pad_token] * padding_length)
+            attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
+            token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)
+
+        assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
+        assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(
+            len(attention_mask), max_length
+        )
+        assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(
+            len(token_type_ids), max_length
+        )
+
+        if output_mode == "classification":
+            label = label_map[example.label] if example.label in label_map else 0
+        elif output_mode == "regression":
+            label = float(example.label)
+        else:
+            raise KeyError(output_mode)
+        pairID = str(example.pairID)
+
+        if ex_index < 10:
+            logger.info("*** Example ***")
+            logger.info("text_a: %s" % (example.text_a))
+            logger.info("text_b: %s" % (example.text_b))
+            logger.info("guid: %s" % (example.guid))
+            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
+            logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
+            logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))
+            logger.info("label: %s (id = %d)" % (example.label, label))
+
+        features.append(
+            InputFeatures(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                token_type_ids=token_type_ids,
+                label=label,
+                pairID=pairID,
+            )
+        )
+
+    if is_tf_available() and is_tf_dataset:
+
+        def gen():
+            for ex in features:
+                yield (
+                    {
+                        "input_ids": ex.input_ids,
+                        "attention_mask": ex.attention_mask,
+                        "token_type_ids": ex.token_type_ids,
+                    },
+                    ex.label,
+                )
+
+        return tf.data.Dataset.from_generator(
+            gen,
+            ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
+            (
+                {
+                    "input_ids": tf.TensorShape([None]),
+                    "attention_mask": tf.TensorShape([None]),
+                    "token_type_ids": tf.TensorShape([None]),
+                },
+                tf.TensorShape([]),
+            ),
+        )
+
+    return features
+
+
+class HansProcessor(DataProcessor):
+    """Processor for the HANS data set."""
+
+    def get_example_from_tensor_dict(self, tensor_dict):
+        """See base class."""
+        return InputExample(
+            tensor_dict["idx"].numpy(),
+            tensor_dict["premise"].numpy().decode("utf-8"),
+            tensor_dict["hypothesis"].numpy().decode("utf-8"),
+            str(tensor_dict["label"].numpy()),
+        )
+
+    def get_train_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "heuristics_train_set.txt")), "train")
+
+    def get_dev_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "heuristics_evaluation_set.txt")), "dev")
+
+    def get_labels(self):
+        """See base class."""
+        return ["contradiction", "entailment", "neutral"]
+
+    def _create_examples(self, lines, set_type):
+        """Creates examples for the training and dev sets."""
+        examples = []
+        for (i, line) in enumerate(lines):
+            if i == 0:
+                continue
+            guid = "%s-%s" % (set_type, line[0])
+            text_a = line[5]
+            text_b = line[6]
+            pairID = line[7][2:] if line[7].startswith("ex") else line[7]
+            label = line[-1]
+            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label, pairID=pairID))
+        return examples
+
+
+glue_tasks_num_labels = {
+    "hans": 3,
+}
+
+glue_processors = {
+    "hans": HansProcessor,
+}
+
+glue_output_modes = {
+    "hans": "classification",
+}
--- a/examples/hans/test_hans.py
+++ b/examples/hans/test_hans.py
@@ -0,0 +1,643 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Finetuning the library models for sequence classification on GLUE (Bert, XLM, XLNet, RoBERTa)."""
+
+from __future__ import absolute_import, division, print_function
+
+import argparse
+import glob
+import logging
+import os
+import random
+
+import numpy as np
+import torch
+from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
+from torch.utils.data.distributed import DistributedSampler
+from tqdm import tqdm, trange
+
+from hans_processors import glue_output_modes as output_modes
+from hans_processors import glue_processors as processors
+from hans_processors import hans_convert_examples_to_features as convert_examples_to_features
+from transformers import (
+    WEIGHTS_NAME,
+    AdamW,
+    AlbertConfig,
+    AlbertForSequenceClassification,
+    AlbertTokenizer,
+    BertConfig,
+    BertForSequenceClassification,
+    BertTokenizer,
+    DistilBertConfig,
+    DistilBertForSequenceClassification,
+    DistilBertTokenizer,
+    RobertaConfig,
+    RobertaForSequenceClassification,
+    RobertaTokenizer,
+    XLMConfig,
+    XLMForSequenceClassification,
+    XLMTokenizer,
+    XLNetConfig,
+    XLNetForSequenceClassification,
+    XLNetTokenizer,
+    get_linear_schedule_with_warmup,
+)
+
+
+try:
+    from torch.utils.tensorboard import SummaryWriter
+except ImportError:
+    from tensorboardX import SummaryWriter
+
+
+logger = logging.getLogger(__name__)
+
+ALL_MODELS = sum(
+    (
+        tuple(conf.pretrained_config_archive_map.keys())
+        for conf in (BertConfig, XLNetConfig, XLMConfig, RobertaConfig, DistilBertConfig)
+    ),
+    (),
+)
+
+MODEL_CLASSES = {
+    "bert": (BertConfig, BertForSequenceClassification, BertTokenizer),
+    "xlnet": (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
+    "xlm": (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
+    "roberta": (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
+    "distilbert": (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer),
+    "albert": (AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer),
+}
+
+
+def set_seed(args):
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+    if args.n_gpu > 0:
+        torch.cuda.manual_seed_all(args.seed)
+
+
+def train(args, train_dataset, model, tokenizer):
+    """ Train the model """
+    if args.local_rank in [-1, 0]:
+        tb_writer = SummaryWriter()
+
+    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
+    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
+    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
+
+    if args.max_steps > 0:
+        t_total = args.max_steps
+        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
+    else:
+        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
+
+    # Prepare optimizer and schedule (linear warmup and decay)
+    no_decay = ["bias", "LayerNorm.weight"]
+    optimizer_grouped_parameters = [
+        {
+            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+            "weight_decay": args.weight_decay,
+        },
+        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
+    ]
+
+    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
+    scheduler = get_linear_schedule_with_warmup(
+        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
+    )
+    if args.fp16:
+        try:
+            from apex import amp
+        except ImportError:
+            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
+        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
+
+    # multi-gpu training (should be after apex fp16 initialization)
+    if args.n_gpu > 1:
+        model = torch.nn.DataParallel(model)
+
+    # Distributed training (should be after apex fp16 initialization)
+    if args.local_rank != -1:
+        model = torch.nn.parallel.DistributedDataParallel(
+            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
+        )
+
+    # Train!
+    logger.info("***** Running training *****")
+    logger.info("  Num examples = %d", len(train_dataset))
+    logger.info("  Num Epochs = %d", args.num_train_epochs)
+    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
+    logger.info(
+        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
+        args.train_batch_size
+        * args.gradient_accumulation_steps
+        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
+    )
+    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
+    logger.info("  Total optimization steps = %d", t_total)
+
+    global_step = 0
+    tr_loss, logging_loss = 0.0, 0.0
+    model.zero_grad()
+    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
+    set_seed(args)  # Added here for reproductibility (even between python 2 and 3)
+    for _ in train_iterator:
+        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
+        for step, batch in enumerate(epoch_iterator):
+            model.train()
+            batch = tuple(t.to(args.device) for t in batch)
+            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
+            if args.model_type != "distilbert":
+                inputs["token_type_ids"] = (
+                    batch[2] if args.model_type in ["bert", "xlnet"] else None
+                )  # XLM, DistilBERT and RoBERTa don't use segment_ids
+            outputs = model(**inputs)
+            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
+
+            if args.n_gpu > 1:
+                loss = loss.mean()  # mean() to average on multi-gpu parallel training
+            if args.gradient_accumulation_steps > 1:
+                loss = loss / args.gradient_accumulation_steps
+
+            if args.fp16:
+                with amp.scale_loss(loss, optimizer) as scaled_loss:
+                    scaled_loss.backward()
+            else:
+                loss.backward()
+
+            tr_loss += loss.item()
+            if (step + 1) % args.gradient_accumulation_steps == 0:
+                if args.fp16:
+                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
+                else:
+                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+
+                optimizer.step()
+                scheduler.step()  # Update learning rate schedule
+                model.zero_grad()
+                global_step += 1
+
+                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
+                    logs = {}
+                    if (
+                        args.local_rank == -1 and args.evaluate_during_training
+                    ):  # Only evaluate when single GPU otherwise metrics may not average well
+                        results = evaluate(args, model, tokenizer)
+                        for key, value in results.items():
+                            eval_key = "eval_{}".format(key)
+                            logs[eval_key] = value
+
+                    loss_scalar = (tr_loss - logging_loss) / args.logging_steps
+                    learning_rate_scalar = scheduler.get_lr()[0]
+                    logs["learning_rate"] = learning_rate_scalar
+                    logs["loss"] = loss_scalar
+                    logging_loss = tr_loss
+
+                    for key, value in logs.items():
+                        tb_writer.add_scalar(key, value, global_step)
+                    # print(json.dumps({**logs, **{'step': global_step}}))
+
+                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
+                    # Save model checkpoint
+                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
+                    if not os.path.exists(output_dir):
+                        os.makedirs(output_dir)
+                    model_to_save = (
+                        model.module if hasattr(model, "module") else model
+                    )  # Take care of distributed/parallel training
+                    model_to_save.save_pretrained(output_dir)
+                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
+                    logger.info("Saving model checkpoint to %s", output_dir)
+
+            if args.max_steps > 0 and global_step > args.max_steps:
+                epoch_iterator.close()
+                break
+        if args.max_steps > 0 and global_step > args.max_steps:
+            train_iterator.close()
+            break
+
+    if args.local_rank in [-1, 0]:
+        tb_writer.close()
+
+    return global_step, tr_loss / global_step
+
+
+def evaluate(args, model, tokenizer, prefix=""):
+    # Loop to handle MNLI double evaluation (matched, mis-matched)
+    eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
+    eval_outputs_dirs = (args.output_dir, args.output_dir + "-MM") if args.task_name == "mnli" else (args.output_dir,)
+
+    results = {}
+    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
+        eval_dataset, label_list = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)
+
+        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
+            os.makedirs(eval_output_dir)
+
+        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
+        # Note that DistributedSampler samples randomly
+        eval_sampler = SequentialSampler(eval_dataset)
+        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
+
+        # multi-gpu eval
+        if args.n_gpu > 1:
+            model = torch.nn.DataParallel(model)
+
+        # Eval!
+        logger.info("***** Running evaluation {} *****".format(prefix))
+        logger.info("  Num examples = %d", len(eval_dataset))
+        logger.info("  Batch size = %d", args.eval_batch_size)
+        eval_loss = 0.0
+        nb_eval_steps = 0
+        preds = None
+        out_label_ids = None
+        for batch in tqdm(eval_dataloader, desc="Evaluating"):
+            model.eval()
+            batch = tuple(t.to(args.device) for t in batch)
+
+            with torch.no_grad():
+                inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
+                if args.model_type != "distilbert":
+                    inputs["token_type_ids"] = (
+                        batch[2] if args.model_type in ["bert", "xlnet"] else None
+                    )  # XLM, DistilBERT and RoBERTa don't use segment_ids
+                outputs = model(**inputs)
+                tmp_eval_loss, logits = outputs[:2]
+
+                eval_loss += tmp_eval_loss.mean().item()
+            nb_eval_steps += 1
+            if preds is None:
+                preds = logits.detach().cpu().numpy()
+                out_label_ids = inputs["labels"].detach().cpu().numpy()
+                pair_ids = batch[4].detach().cpu().numpy()
+            else:
+                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
+                out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)
+                pair_ids = np.append(pair_ids, batch[4].detach().cpu().numpy(), axis=0)
+
+        eval_loss = eval_loss / nb_eval_steps
+        if args.output_mode == "classification":
+            preds = np.argmax(preds, axis=1)
+        elif args.output_mode == "regression":
+            preds = np.squeeze(preds)
+
+        output_eval_file = os.path.join(eval_output_dir, "hans_predictions.txt")
+        with open(output_eval_file, "w") as writer:
+            writer.write("pairID,gld_label\n")
+            for pid, pred in zip(pair_ids, preds):
+                writer.write("ex" + str(pid) + "," + label_list[int(pred)] + "\n")
+
+    return results
+
+
+def load_and_cache_examples(args, task, tokenizer, evaluate=False):
+    if args.local_rank not in [-1, 0] and not evaluate:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
+
+    processor = processors[task]()
+    output_mode = output_modes[task]
+    # Load data features from cache or dataset file
+    cached_features_file = os.path.join(
+        args.data_dir,
+        "cached_{}_{}_{}_{}".format(
+            "dev" if evaluate else "train",
+            list(filter(None, args.model_name_or_path.split("/"))).pop(),
+            str(args.max_seq_length),
+            str(task),
+        ),
+    )
+
+    label_list = processor.get_labels()
+
+    if os.path.exists(cached_features_file) and not args.overwrite_cache:
+        logger.info("Loading features from cached file %s", cached_features_file)
+        features = torch.load(cached_features_file)
+    else:
+        logger.info("Creating features from dataset file at %s", args.data_dir)
+        if task in ["mnli", "mnli-mm"] and args.model_type in ["roberta"]:
+            # HACK(label indices are swapped in RoBERTa pretrained model)
+            label_list[1], label_list[2] = label_list[2], label_list[1]
+        examples = (
+            processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
+        )
+        features = convert_examples_to_features(
+            examples,
+            tokenizer,
+            label_list=label_list,
+            max_length=args.max_seq_length,
+            output_mode=output_mode,
+            pad_on_left=bool(args.model_type in ["xlnet"]),  # pad on the left for xlnet
+            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
+            pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
+        )
+        if args.local_rank in [-1, 0]:
+            logger.info("Saving features into cached file %s", cached_features_file)
+            torch.save(features, cached_features_file)
+
+    if args.local_rank == 0 and not evaluate:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
+
+    # Convert to Tensors and build dataset
+    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
+    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
+    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
+    if output_mode == "classification":
+        all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
+    elif output_mode == "regression":
+        all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
+    all_pair_ids = torch.tensor([int(f.pairID) for f in features], dtype=torch.long)
+
+    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels, all_pair_ids)
+    return dataset, label_list
+
+
+def main():
+    parser = argparse.ArgumentParser()
+
+    # Required parameters
+    parser.add_argument(
+        "--data_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The input data dir. Should contain the .tsv files (or other data files) for the task.",
+    )
+    parser.add_argument(
+        "--model_type",
+        default=None,
+        type=str,
+        required=True,
+        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
+    )
+    parser.add_argument(
+        "--model_name_or_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
+    )
+    parser.add_argument(
+        "--task_name",
+        default=None,
+        type=str,
+        required=True,
+        help="The name of the task to train selected in the list: " + ", ".join(processors.keys()),
+    )
+    parser.add_argument(
+        "--output_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+
+    # Other parameters
+    parser.add_argument(
+        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        default="",
+        type=str,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        default="",
+        type=str,
+        help="Where do you want to store the pre-trained models downloaded from s3",
+    )
+    parser.add_argument(
+        "--max_seq_length",
+        default=128,
+        type=int,
+        help="The maximum total input sequence length after tokenization. Sequences longer "
+        "than this will be truncated, sequences shorter will be padded.",
+    )
+    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
+    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
+    parser.add_argument(
+        "--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step."
+    )
+    parser.add_argument(
+        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
+    )
+
+    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
+    parser.add_argument(
+        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
+    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
+    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument(
+        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
+    )
+    parser.add_argument(
+        "--max_steps",
+        default=-1,
+        type=int,
+        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
+    )
+    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
+
+    parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.")
+    parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.")
+    parser.add_argument(
+        "--eval_all_checkpoints",
+        action="store_true",
+        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
+    )
+    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
+    parser.add_argument(
+        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
+    )
+    parser.add_argument(
+        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
+    )
+    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
+
+    parser.add_argument(
+        "--fp16",
+        action="store_true",
+        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
+    )
+    parser.add_argument(
+        "--fp16_opt_level",
+        type=str,
+        default="O1",
+        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
+        "See details at https://nvidia.github.io/apex/amp.html",
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
+    parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
+    args = parser.parse_args()
+
+    if (
+        os.path.exists(args.output_dir)
+        and os.listdir(args.output_dir)
+        and args.do_train
+        and not args.overwrite_output_dir
+    ):
+        raise ValueError(
+            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
+                args.output_dir
+            )
+        )
+
+    # Setup distant debugging if needed
+    if args.server_ip and args.server_port:
+        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
+        import ptvsd
+
+        print("Waiting for debugger attach")
+        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
+        ptvsd.wait_for_attach()
+
+    # Setup CUDA, GPU & distributed training
+    if args.local_rank == -1 or args.no_cuda:
+        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+        args.n_gpu = torch.cuda.device_count()
+    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
+        torch.cuda.set_device(args.local_rank)
+        device = torch.device("cuda", args.local_rank)
+        torch.distributed.init_process_group(backend="nccl")
+        args.n_gpu = 1
+    args.device = device
+
+    # Setup logging
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
+    )
+    logger.warning(
+        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+        args.local_rank,
+        device,
+        args.n_gpu,
+        bool(args.local_rank != -1),
+        args.fp16,
+    )
+
+    # Set seed
+    set_seed(args)
+
+    # Prepare GLUE task
+    args.task_name = args.task_name.lower()
+    if args.task_name not in processors:
+        raise ValueError("Task not found: %s" % (args.task_name))
+    processor = processors[args.task_name]()
+    args.output_mode = output_modes[args.task_name]
+    label_list = processor.get_labels()
+    num_labels = len(label_list)
+
+    # Load pretrained model and tokenizer
+    if args.local_rank not in [-1, 0]:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
+
+    args.model_type = args.model_type.lower()
+    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
+    config = config_class.from_pretrained(
+        args.config_name if args.config_name else args.model_name_or_path,
+        num_labels=num_labels,
+        finetuning_task=args.task_name,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
+    tokenizer = tokenizer_class.from_pretrained(
+        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
+        do_lower_case=args.do_lower_case,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
+    model = model_class.from_pretrained(
+        args.model_name_or_path,
+        from_tf=bool(".ckpt" in args.model_name_or_path),
+        config=config,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
+
+    if args.local_rank == 0:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
+
+    model.to(args.device)
+
+    logger.info("Training/evaluation parameters %s", args)
+
+    # Training
+    if args.do_train:
+        train_dataset, _ = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
+        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
+        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
+
+    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
+    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
+        # Create output directory if needed
+        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
+            os.makedirs(args.output_dir)
+
+        logger.info("Saving model checkpoint to %s", args.output_dir)
+        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
+        # They can then be reloaded using `from_pretrained()`
+        model_to_save = (
+            model.module if hasattr(model, "module") else model
+        )  # Take care of distributed/parallel training
+        model_to_save.save_pretrained(args.output_dir)
+        tokenizer.save_pretrained(args.output_dir)
+
+        # Good practice: save your training arguments together with the trained model
+        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
+
+        # Load a trained model and vocabulary that you have fine-tuned
+        model = model_class.from_pretrained(args.output_dir)
+        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
+        model.to(args.device)
+
+    # Evaluation
+    results = {}
+    if args.do_eval and args.local_rank in [-1, 0]:
+        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
+        checkpoints = [args.output_dir]
+        if args.eval_all_checkpoints:
+            checkpoints = list(
+                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
+            )
+            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
+        logger.info("Evaluate the following checkpoints: %s", checkpoints)
+        for checkpoint in checkpoints:
+            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
+            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""
+
+            model = model_class.from_pretrained(checkpoint)
+            model.to(args.device)
+            result = evaluate(args, model, tokenizer, prefix=prefix)
+            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
+            results.update(result)
+
+    return results
+
+
+if __name__ == "__main__":
+    main()
--- a/transformers/data/processors/utils.py
+++ b/transformers/data/processors/utils.py
@@ -14,11 +14,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import csv
-import sys
 import copy
+import csv
 import json

+
 class InputExample(object):
    """
    A single training/test example for simple sequence classification.
@@ -32,11 +32,13 @@ class InputExample(object):
        label: (Optional) string. The label of the example. This should be
        specified for train and dev examples, but not for test examples.
    """
-    def __init__(self, guid, text_a, text_b=None, label=None):
+
+    def __init__(self, guid, text_a, text_b=None, label=None, pairID=None):
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label
+        self.pairID = pairID

    def __repr__(self):
        return str(self.to_json_string())
@@ -64,11 +66,12 @@ class InputFeatures(object):
        label: Label corresponding to the input
    """

-    def __init__(self, input_ids, attention_mask, token_type_ids, label):
+    def __init__(self, input_ids, attention_mask, token_type_ids, label, pairID=None):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.token_type_ids = token_type_ids
        self.label = label
+        self.pairID = pairID

    def __repr__(self):
        return str(self.to_json_string())
@@ -114,7 +117,5 @@ class DataProcessor(object):
            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
            lines = []
            for line in reader:
-                if sys.version_info[0] == 2:
-                    line = list(unicode(cell, 'utf-8') for cell in line)
                lines.append(line)
            return lines
--- a/examples/mm-imdb/run_mmimdb.py
+++ b/examples/mm-imdb/run_mmimdb.py
@@ -0,0 +1,614 @@
+# coding=utf-8
+# Copyright (c) Facebook, Inc. and its affiliates.
+# Copyright (c) HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Finetuning the library models for multimodal multiclass prediction on MM-IMDB dataset."""
+
+
+import argparse
+import glob
+import json
+import logging
+import os
+import random
+
+import numpy as np
+import torch
+import torch.nn as nn
+from sklearn.metrics import f1_score
+from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
+from torch.utils.data.distributed import DistributedSampler
+from tqdm import tqdm, trange
+
+from transformers import (
+    WEIGHTS_NAME,
+    AdamW,
+    AlbertConfig,
+    AlbertModel,
+    AlbertTokenizer,
+    BertConfig,
+    BertModel,
+    BertTokenizer,
+    DistilBertConfig,
+    DistilBertModel,
+    DistilBertTokenizer,
+    MMBTConfig,
+    MMBTForClassification,
+    RobertaConfig,
+    RobertaModel,
+    RobertaTokenizer,
+    XLMConfig,
+    XLMModel,
+    XLMTokenizer,
+    XLNetConfig,
+    XLNetModel,
+    XLNetTokenizer,
+    get_linear_schedule_with_warmup,
+)
+from utils_mmimdb import ImageEncoder, JsonlDataset, collate_fn, get_image_transforms, get_mmimdb_labels
+
+
+try:
+    from torch.utils.tensorboard import SummaryWriter
+except ImportError:
+    from tensorboardX import SummaryWriter
+
+
+logger = logging.getLogger(__name__)
+
+ALL_MODELS = sum(
+    (
+        tuple(conf.pretrained_config_archive_map.keys())
+        for conf in (BertConfig, XLNetConfig, XLMConfig, RobertaConfig, DistilBertConfig)
+    ),
+    (),
+)
+
+MODEL_CLASSES = {
+    "bert": (BertConfig, BertModel, BertTokenizer),
+    "xlnet": (XLNetConfig, XLNetModel, XLNetTokenizer),
+    "xlm": (XLMConfig, XLMModel, XLMTokenizer),
+    "roberta": (RobertaConfig, RobertaModel, RobertaTokenizer),
+    "distilbert": (DistilBertConfig, DistilBertModel, DistilBertTokenizer),
+    "albert": (AlbertConfig, AlbertModel, AlbertTokenizer),
+}
+
+
+def set_seed(args):
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+    if args.n_gpu > 0:
+        torch.cuda.manual_seed_all(args.seed)
+
+
+def train(args, train_dataset, model, tokenizer, criterion):
+    """ Train the model """
+    if args.local_rank in [-1, 0]:
+        tb_writer = SummaryWriter()
+
+    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
+    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
+    train_dataloader = DataLoader(
+        train_dataset,
+        sampler=train_sampler,
+        batch_size=args.train_batch_size,
+        collate_fn=collate_fn,
+        num_workers=args.num_workers,
+    )
+
+    if args.max_steps > 0:
+        t_total = args.max_steps
+        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
+    else:
+        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
+
+    # Prepare optimizer and schedule (linear warmup and decay)
+    no_decay = ["bias", "LayerNorm.weight"]
+    optimizer_grouped_parameters = [
+        {
+            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+            "weight_decay": args.weight_decay,
+        },
+        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
+    ]
+
+    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
+    scheduler = get_linear_schedule_with_warmup(
+        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
+    )
+    if args.fp16:
+        try:
+            from apex import amp
+        except ImportError:
+            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
+        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
+
+    # multi-gpu training (should be after apex fp16 initialization)
+    if args.n_gpu > 1:
+        model = torch.nn.DataParallel(model)
+
+    # Distributed training (should be after apex fp16 initialization)
+    if args.local_rank != -1:
+        model = torch.nn.parallel.DistributedDataParallel(
+            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
+        )
+
+    # Train!
+    logger.info("***** Running training *****")
+    logger.info("  Num examples = %d", len(train_dataset))
+    logger.info("  Num Epochs = %d", args.num_train_epochs)
+    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
+    logger.info(
+        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
+        args.train_batch_size
+        * args.gradient_accumulation_steps
+        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
+    )
+    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
+    logger.info("  Total optimization steps = %d", t_total)
+
+    global_step = 0
+    tr_loss, logging_loss = 0.0, 0.0
+    best_f1, n_no_improve = 0, 0
+    model.zero_grad()
+    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
+    set_seed(args)  # Added here for reproductibility
+    for _ in train_iterator:
+        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
+        for step, batch in enumerate(epoch_iterator):
+            model.train()
+            batch = tuple(t.to(args.device) for t in batch)
+            labels = batch[5]
+            inputs = {
+                "input_ids": batch[0],
+                "input_modal": batch[2],
+                "attention_mask": batch[1],
+                "modal_start_tokens": batch[3],
+                "modal_end_tokens": batch[4],
+            }
+            outputs = model(**inputs)
+            logits = outputs[0]  # model outputs are always tuple in transformers (see doc)
+            loss = criterion(logits, labels)
+
+            if args.n_gpu > 1:
+                loss = loss.mean()  # mean() to average on multi-gpu parallel training
+            if args.gradient_accumulation_steps > 1:
+                loss = loss / args.gradient_accumulation_steps
+
+            if args.fp16:
+                with amp.scale_loss(loss, optimizer) as scaled_loss:
+                    scaled_loss.backward()
+            else:
+                loss.backward()
+
+            tr_loss += loss.item()
+            if (step + 1) % args.gradient_accumulation_steps == 0:
+                if args.fp16:
+                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
+                else:
+                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+
+                optimizer.step()
+                scheduler.step()  # Update learning rate schedule
+                model.zero_grad()
+                global_step += 1
+
+                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
+                    logs = {}
+                    if (
+                        args.local_rank == -1 and args.evaluate_during_training
+                    ):  # Only evaluate when single GPU otherwise metrics may not average well
+                        results = evaluate(args, model, tokenizer, criterion)
+                        for key, value in results.items():
+                            eval_key = "eval_{}".format(key)
+                            logs[eval_key] = value
+
+                    loss_scalar = (tr_loss - logging_loss) / args.logging_steps
+                    learning_rate_scalar = scheduler.get_lr()[0]
+                    logs["learning_rate"] = learning_rate_scalar
+                    logs["loss"] = loss_scalar
+                    logging_loss = tr_loss
+
+                    for key, value in logs.items():
+                        tb_writer.add_scalar(key, value, global_step)
+                    print(json.dumps({**logs, **{"step": global_step}}))
+
+                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
+                    # Save model checkpoint
+                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
+                    if not os.path.exists(output_dir):
+                        os.makedirs(output_dir)
+                    model_to_save = (
+                        model.module if hasattr(model, "module") else model
+                    )  # Take care of distributed/parallel training
+                    torch.save(model_to_save.state_dict(), os.path.join(output_dir, WEIGHTS_NAME))
+                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
+                    logger.info("Saving model checkpoint to %s", output_dir)
+
+            if args.max_steps > 0 and global_step > args.max_steps:
+                epoch_iterator.close()
+                break
+        if args.max_steps > 0 and global_step > args.max_steps:
+            train_iterator.close()
+            break
+
+        if args.local_rank == -1:
+            results = evaluate(args, model, tokenizer, criterion)
+            if results["micro_f1"] > best_f1:
+                best_f1 = results["micro_f1"]
+                n_no_improve = 0
+            else:
+                n_no_improve += 1
+
+            if n_no_improve > args.patience:
+                train_iterator.close()
+                break
+
+    if args.local_rank in [-1, 0]:
+        tb_writer.close()
+
+    return global_step, tr_loss / global_step
+
+
+def evaluate(args, model, tokenizer, criterion, prefix=""):
+    # Loop to handle MNLI double evaluation (matched, mis-matched)
+    eval_output_dir = args.output_dir
+    eval_dataset = load_examples(args, tokenizer, evaluate=True)
+
+    if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
+        os.makedirs(eval_output_dir)
+
+    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
+    # Note that DistributedSampler samples randomly
+    eval_sampler = SequentialSampler(eval_dataset)
+    eval_dataloader = DataLoader(
+        eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate_fn
+    )
+
+    # multi-gpu eval
+    if args.n_gpu > 1:
+        model = torch.nn.DataParallel(model)
+
+    # Eval!
+    logger.info("***** Running evaluation {} *****".format(prefix))
+    logger.info("  Num examples = %d", len(eval_dataset))
+    logger.info("  Batch size = %d", args.eval_batch_size)
+    eval_loss = 0.0
+    nb_eval_steps = 0
+    preds = None
+    out_label_ids = None
+    for batch in tqdm(eval_dataloader, desc="Evaluating"):
+        model.eval()
+        batch = tuple(t.to(args.device) for t in batch)
+
+        with torch.no_grad():
+            batch = tuple(t.to(args.device) for t in batch)
+            labels = batch[5]
+            inputs = {
+                "input_ids": batch[0],
+                "input_modal": batch[2],
+                "attention_mask": batch[1],
+                "modal_start_tokens": batch[3],
+                "modal_end_tokens": batch[4],
+            }
+            outputs = model(**inputs)
+            logits = outputs[0]  # model outputs are always tuple in transformers (see doc)
+            tmp_eval_loss = criterion(logits, labels)
+            eval_loss += tmp_eval_loss.mean().item()
+        nb_eval_steps += 1
+        if preds is None:
+            preds = torch.sigmoid(logits).detach().cpu().numpy() > 0.5
+            out_label_ids = labels.detach().cpu().numpy()
+        else:
+            preds = np.append(preds, torch.sigmoid(logits).detach().cpu().numpy() > 0.5, axis=0)
+            out_label_ids = np.append(out_label_ids, labels.detach().cpu().numpy(), axis=0)
+
+    eval_loss = eval_loss / nb_eval_steps
+    result = {
+        "loss": eval_loss,
+        "macro_f1": f1_score(out_label_ids, preds, average="macro"),
+        "micro_f1": f1_score(out_label_ids, preds, average="micro"),
+    }
+
+    output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
+    with open(output_eval_file, "w") as writer:
+        logger.info("***** Eval results {} *****".format(prefix))
+        for key in sorted(result.keys()):
+            logger.info("  %s = %s", key, str(result[key]))
+            writer.write("%s = %s\n" % (key, str(result[key])))
+
+    return result
+
+
+def load_examples(args, tokenizer, evaluate=False):
+    path = os.path.join(args.data_dir, "dev.jsonl" if evaluate else "train.jsonl")
+    transforms = get_image_transforms()
+    labels = get_mmimdb_labels()
+    dataset = JsonlDataset(path, tokenizer, transforms, labels, args.max_seq_length - args.num_image_embeds - 2)
+    return dataset
+
+
+def main():
+    parser = argparse.ArgumentParser()
+
+    # Required parameters
+    parser.add_argument(
+        "--data_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The input data dir. Should contain the .jsonl files for MMIMDB.",
+    )
+    parser.add_argument(
+        "--model_type",
+        default=None,
+        type=str,
+        required=True,
+        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
+    )
+    parser.add_argument(
+        "--model_name_or_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
+    )
+    parser.add_argument(
+        "--output_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+
+    # Other parameters
+    parser.add_argument(
+        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        default="",
+        type=str,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        default="",
+        type=str,
+        help="Where do you want to store the pre-trained models downloaded from s3",
+    )
+    parser.add_argument(
+        "--max_seq_length",
+        default=128,
+        type=int,
+        help="The maximum total input sequence length after tokenization. Sequences longer "
+        "than this will be truncated, sequences shorter will be padded.",
+    )
+    parser.add_argument(
+        "--num_image_embeds", default=1, type=int, help="Number of Image Embeddings from the Image Encoder"
+    )
+    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
+    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
+    parser.add_argument(
+        "--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step."
+    )
+    parser.add_argument(
+        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
+    )
+
+    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
+    parser.add_argument(
+        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
+    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.")
+    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument(
+        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
+    )
+    parser.add_argument("--patience", default=5, type=int, help="Patience for Early Stopping.")
+    parser.add_argument(
+        "--max_steps",
+        default=-1,
+        type=int,
+        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
+    )
+    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
+
+    parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.")
+    parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.")
+    parser.add_argument(
+        "--eval_all_checkpoints",
+        action="store_true",
+        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
+    )
+    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
+    parser.add_argument("--num_workers", type=int, default=8, help="number of worker threads for dataloading")
+    parser.add_argument(
+        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
+    )
+    parser.add_argument(
+        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
+    )
+    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
+
+    parser.add_argument(
+        "--fp16",
+        action="store_true",
+        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
+    )
+    parser.add_argument(
+        "--fp16_opt_level",
+        type=str,
+        default="O1",
+        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
+        "See details at https://nvidia.github.io/apex/amp.html",
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
+    parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
+    args = parser.parse_args()
+
+    if (
+        os.path.exists(args.output_dir)
+        and os.listdir(args.output_dir)
+        and args.do_train
+        and not args.overwrite_output_dir
+    ):
+        raise ValueError(
+            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
+                args.output_dir
+            )
+        )
+
+    # Setup distant debugging if needed
+    if args.server_ip and args.server_port:
+        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
+        import ptvsd
+
+        print("Waiting for debugger attach")
+        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
+        ptvsd.wait_for_attach()
+
+    # Setup CUDA, GPU & distributed training
+    if args.local_rank == -1 or args.no_cuda:
+        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+        args.n_gpu = torch.cuda.device_count()
+    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
+        torch.cuda.set_device(args.local_rank)
+        device = torch.device("cuda", args.local_rank)
+        torch.distributed.init_process_group(backend="nccl")
+        args.n_gpu = 1
+
+    args.device = device
+
+    # Setup logging
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
+    )
+    logger.warning(
+        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+        args.local_rank,
+        device,
+        args.n_gpu,
+        bool(args.local_rank != -1),
+        args.fp16,
+    )
+
+    # Set seed
+    set_seed(args)
+
+    # Load pretrained model and tokenizer
+    if args.local_rank not in [-1, 0]:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
+
+    # Setup model
+    labels = get_mmimdb_labels()
+    num_labels = len(labels)
+    args.model_type = args.model_type.lower()
+    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
+    transformer_config = config_class.from_pretrained(
+        args.config_name if args.config_name else args.model_name_or_path
+    )
+    tokenizer = tokenizer_class.from_pretrained(
+        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
+        do_lower_case=args.do_lower_case,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
+    transformer = model_class.from_pretrained(
+        args.model_name_or_path, config=transformer_config, cache_dir=args.cache_dir if args.cache_dir else None
+    )
+    img_encoder = ImageEncoder(args)
+    config = MMBTConfig(transformer_config, num_labels=num_labels)
+    model = MMBTForClassification(config, transformer, img_encoder)
+
+    if args.local_rank == 0:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
+
+    model.to(args.device)
+
+    logger.info("Training/evaluation parameters %s", args)
+
+    # Training
+    if args.do_train:
+        train_dataset = load_examples(args, tokenizer, evaluate=False)
+        label_frequences = train_dataset.get_label_frequencies()
+        label_frequences = [label_frequences[l] for l in labels]
+        label_weights = (
+            torch.tensor(label_frequences, device=args.device, dtype=torch.float) / len(train_dataset)
+        ) ** -1
+        criterion = nn.BCEWithLogitsLoss(pos_weight=label_weights)
+        global_step, tr_loss = train(args, train_dataset, model, tokenizer, criterion)
+        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
+
+    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
+    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
+        # Create output directory if needed
+        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
+            os.makedirs(args.output_dir)
+
+        logger.info("Saving model checkpoint to %s", args.output_dir)
+        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
+        # They can then be reloaded using `from_pretrained()`
+        model_to_save = (
+            model.module if hasattr(model, "module") else model
+        )  # Take care of distributed/parallel training
+        torch.save(model_to_save.state_dict(), os.path.join(args.output_dir, WEIGHTS_NAME))
+        tokenizer.save_pretrained(args.output_dir)
+
+        # Good practice: save your training arguments together with the trained model
+        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
+
+        # Load a trained model and vocabulary that you have fine-tuned
+        model = MMBTForClassification(config, transformer, img_encoder)
+        model.load_state_dict(torch.load(os.path.join(args.output_dir, WEIGHTS_NAME)))
+        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
+        model.to(args.device)
+
+    # Evaluation
+    results = {}
+    if args.do_eval and args.local_rank in [-1, 0]:
+        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
+        checkpoints = [args.output_dir]
+        if args.eval_all_checkpoints:
+            checkpoints = list(
+                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
+            )
+            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
+        logger.info("Evaluate the following checkpoints: %s", checkpoints)
+        for checkpoint in checkpoints:
+            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
+            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""
+            model = MMBTForClassification(config, transformer, img_encoder)
+            model.load_state_dict(torch.load(checkpoint))
+            model.to(args.device)
+            result = evaluate(args, model, tokenizer, criterion, prefix=prefix)
+            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
+            results.update(result)
+
+    return results
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/mm-imdb/utils_mmimdb.py
+++ b/examples/mm-imdb/utils_mmimdb.py
@@ -0,0 +1,143 @@
+# coding=utf-8
+# Copyright (c) Facebook, Inc. and its affiliates.
+# Copyright (c) HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+from collections import Counter
+
+import torch
+import torch.nn as nn
+import torchvision
+import torchvision.transforms as transforms
+from PIL import Image
+from torch.utils.data import Dataset
+
+
+POOLING_BREAKDOWN = {1: (1, 1), 2: (2, 1), 3: (3, 1), 4: (2, 2), 5: (5, 1), 6: (3, 2), 7: (7, 1), 8: (4, 2), 9: (3, 3)}
+
+
+class ImageEncoder(nn.Module):
+    def __init__(self, args):
+        super().__init__()
+        model = torchvision.models.resnet152(pretrained=True)
+        modules = list(model.children())[:-2]
+        self.model = nn.Sequential(*modules)
+        self.pool = nn.AdaptiveAvgPool2d(POOLING_BREAKDOWN[args.num_image_embeds])
+
+    def forward(self, x):
+        # Bx3x224x224 -> Bx2048x7x7 -> Bx2048xN -> BxNx2048
+        out = self.pool(self.model(x))
+        out = torch.flatten(out, start_dim=2)
+        out = out.transpose(1, 2).contiguous()
+        return out  # BxNx2048
+
+
+class JsonlDataset(Dataset):
+    def __init__(self, data_path, tokenizer, transforms, labels, max_seq_length):
+        self.data = [json.loads(l) for l in open(data_path)]
+        self.data_dir = os.path.dirname(data_path)
+        self.tokenizer = tokenizer
+        self.labels = labels
+        self.n_classes = len(labels)
+        self.max_seq_length = max_seq_length
+
+        self.transforms = transforms
+
+    def __len__(self):
+        return len(self.data)
+
+    def __getitem__(self, index):
+        sentence = torch.LongTensor(self.tokenizer.encode(self.data[index]["text"], add_special_tokens=True))
+        start_token, sentence, end_token = sentence[0], sentence[1:-1], sentence[-1]
+        sentence = sentence[: self.max_seq_length]
+
+        label = torch.zeros(self.n_classes)
+        label[[self.labels.index(tgt) for tgt in self.data[index]["label"]]] = 1
+
+        image = Image.open(os.path.join(self.data_dir, self.data[index]["img"])).convert("RGB")
+        image = self.transforms(image)
+
+        return {
+            "image_start_token": start_token,
+            "image_end_token": end_token,
+            "sentence": sentence,
+            "image": image,
+            "label": label,
+        }
+
+    def get_label_frequencies(self):
+        label_freqs = Counter()
+        for row in self.data:
+            label_freqs.update(row["label"])
+        return label_freqs
+
+
+def collate_fn(batch):
+    lens = [len(row["sentence"]) for row in batch]
+    bsz, max_seq_len = len(batch), max(lens)
+
+    mask_tensor = torch.zeros(bsz, max_seq_len, dtype=torch.long)
+    text_tensor = torch.zeros(bsz, max_seq_len, dtype=torch.long)
+
+    for i_batch, (input_row, length) in enumerate(zip(batch, lens)):
+        text_tensor[i_batch, :length] = input_row["sentence"]
+        mask_tensor[i_batch, :length] = 1
+
+    img_tensor = torch.stack([row["image"] for row in batch])
+    tgt_tensor = torch.stack([row["label"] for row in batch])
+    img_start_token = torch.stack([row["image_start_token"] for row in batch])
+    img_end_token = torch.stack([row["image_end_token"] for row in batch])
+
+    return text_tensor, mask_tensor, img_tensor, img_start_token, img_end_token, tgt_tensor
+
+
+def get_mmimdb_labels():
+    return [
+        "Crime",
+        "Drama",
+        "Thriller",
+        "Action",
+        "Comedy",
+        "Romance",
+        "Documentary",
+        "Short",
+        "Mystery",
+        "History",
+        "Family",
+        "Adventure",
+        "Fantasy",
+        "Sci-Fi",
+        "Western",
+        "Horror",
+        "Sport",
+        "War",
+        "Music",
+        "Musical",
+        "Animation",
+        "Biography",
+        "Film-Noir",
+    ]
+
+
+def get_image_transforms():
+    return transforms.Compose(
+        [
+            transforms.Resize(256),
+            transforms.CenterCrop(224),
+            transforms.ToTensor(),
+            transforms.Normalize(mean=[0.46777044, 0.44531429, 0.40661017], std=[0.12221994, 0.12145835, 0.14380469],),
+        ]
+    )
--- a/examples/pplm/README.md
+++ b/examples/pplm/README.md
@@ -0,0 +1,54 @@
+# Plug and Play Language Models: a Simple Approach to Controlled Text Generation
+
+Authors: [Sumanth Dathathri](https://dathath.github.io/), [Andrea Madotto](https://andreamad8.github.io/), Janice Lan, Jane Hung, Eric Frank, [Piero Molino](https://w4nderlu.st/), [Jason Yosinski](http://yosinski.com/), and [Rosanne Liu](http://www.rosanneliu.com/)
+
+This folder contains the original code used to run the Plug and Play Language Model (PPLM).
+
+Paper link: https://arxiv.org/abs/1912.02164
+
+Blog link: https://eng.uber.com/pplm
+
+Please check out the repo under uber-research for more information: https://github.com/uber-research/PPLM
+
+
+## Setup
+
+```bash
+git clone https://github.com/huggingface/transformers && cd transformers
+pip install .
+pip install nltk torchtext # additional requirements.
+cd examples/pplm
+```
+
+## PPLM-BoW 
+
+### Example command for bag-of-words control
+
+```bash
+python run_pplm.py -B military --cond_text "The potato" --length 50 --gamma 1.5 --num_iterations 3 --num_samples 10 --stepsize 0.03 --window_length 5 --kl_scale 0.01 --gm_scale 0.99 --colorama --sample
+```
+
+### Tuning hyperparameters for bag-of-words control
+
+1. Increase `--stepsize` to intensify topic control, and decrease its value to soften the control. `--stepsize 0` recovers the original uncontrolled GPT-2 model. 
+
+2. If the language being generated is repetitive (For e.g. "science science experiment experiment"), there are several options to consider: </br>
+	a) Reduce the `--stepsize` </br>
+	b) Increase `--kl_scale` (the KL-loss coefficient) or decrease `--gm_scale` (the gm-scaling term) </br>
+	c) Add `--grad-length xx` where xx is an (integer <= length, e.g. `--grad-length 30`).</br>
+
+
+## PPLM-Discrim
+
+### Example command for discriminator based sentiment control
+
+```bash
+python run_pplm.py -D sentiment --class_label 2 --cond_text "My dog died" --length 50 --gamma 1.0 --num_iterations 10 --num_samples 10 --stepsize 0.04 --kl_scale 0.01 --gm_scale 0.95 --sample
+```
+
+### Tuning hyperparameters for discriminator control
+
+1. Increase `--stepsize` to intensify topic control, and decrease its value to soften the control. `--stepsize 0` recovers the original uncontrolled GPT-2 model. 
+
+2. Use `--class_label 3` for negative, and `--class_label 2` for positive
+
--- a/examples/pplm/imgs/headfigure.png
+++ b/examples/pplm/imgs/headfigure.png
--- a/examples/pplm/imgs/wooly.png
+++ b/examples/pplm/imgs/wooly.png
--- a/examples/pplm/pplm_classification_head.py
+++ b/examples/pplm/pplm_classification_head.py
@@ -0,0 +1,19 @@
+import torch
+
+
+class ClassificationHead(torch.nn.Module):
+    """Classification Head for  transformer encoders"""
+
+    def __init__(self, class_size, embed_size):
+        super().__init__()
+        self.class_size = class_size
+        self.embed_size = embed_size
+        # self.mlp1 = torch.nn.Linear(embed_size, embed_size)
+        # self.mlp2 = (torch.nn.Linear(embed_size, class_size))
+        self.mlp = torch.nn.Linear(embed_size, class_size)
+
+    def forward(self, hidden_state):
+        # hidden_state = F.relu(self.mlp1(hidden_state))
+        # hidden_state = self.mlp2(hidden_state)
+        logits = self.mlp(hidden_state)
+        return logits
--- a/examples/pplm/run_pplm.py
+++ b/examples/pplm/run_pplm.py
@@ -0,0 +1,794 @@
+#! /usr/bin/env python3
+# coding=utf-8
+
+# Copyright (c) 2019 Uber Technologies, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Example command with bag of words:
+python examples/run_pplm.py -B space --cond_text "The president" --length 100 --gamma 1.5 --num_iterations 3 --num_samples 10 --stepsize 0.01 --window_length 5 --kl_scale 0.01 --gm_scale 0.95
+
+Example command with discriminator:
+python examples/run_pplm.py -D sentiment --class_label 3 --cond_text "The lake" --length 10 --gamma 1.0 --num_iterations 30 --num_samples 10 --stepsize 0.01 --kl_scale 0.01 --gm_scale 0.95
+"""
+
+import argparse
+import json
+from operator import add
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch.autograd import Variable
+from tqdm import trange
+
+from pplm_classification_head import ClassificationHead
+from transformers import GPT2Tokenizer
+from transformers.file_utils import cached_path
+from transformers.modeling_gpt2 import GPT2LMHeadModel
+
+
+PPLM_BOW = 1
+PPLM_DISCRIM = 2
+PPLM_BOW_DISCRIM = 3
+SMALL_CONST = 1e-15
+BIG_CONST = 1e10
+
+BAG_OF_WORDS_ARCHIVE_MAP = {
+    "legal": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/legal.txt",
+    "military": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/military.txt",
+    "politics": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/politics.txt",
+    "religion": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/religion.txt",
+    "science": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/science.txt",
+    "space": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/space.txt",
+    "technology": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/technology.txt",
+}
+
+DISCRIMINATOR_MODELS_PARAMS = {
+    "clickbait": {
+        "url": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/discriminators/clickbait_classifier_head.pt",
+        "class_size": 2,
+        "embed_size": 1024,
+        "class_vocab": {"non_clickbait": 0, "clickbait": 1},
+        "default_class": 1,
+        "pretrained_model": "gpt2-medium",
+    },
+    "sentiment": {
+        "url": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/discriminators/SST_classifier_head.pt",
+        "class_size": 5,
+        "embed_size": 1024,
+        "class_vocab": {"very_positive": 2, "very_negative": 3},
+        "default_class": 3,
+        "pretrained_model": "gpt2-medium",
+    },
+}
+
+
+def to_var(x, requires_grad=False, volatile=False, device="cuda"):
+    if torch.cuda.is_available() and device == "cuda":
+        x = x.cuda()
+    elif device != "cuda":
+        x = x.to(device)
+    return Variable(x, requires_grad=requires_grad, volatile=volatile)
+
+
+def top_k_filter(logits, k, probs=False):
+    """
+    Masks everything but the k top entries as -infinity (1e10).
+    Used to mask logits such that e^-infinity -> 0 won't contribute to the
+    sum of the denominator.
+    """
+    if k == 0:
+        return logits
+    else:
+        values = torch.topk(logits, k)[0]
+        batch_mins = values[:, -1].view(-1, 1).expand_as(logits)
+        if probs:
+            return torch.where(logits < batch_mins, torch.ones_like(logits) * 0.0, logits)
+        return torch.where(logits < batch_mins, torch.ones_like(logits) * -BIG_CONST, logits)
+
+
+def perturb_past(
+    past,
+    model,
+    last,
+    unpert_past=None,
+    unpert_logits=None,
+    accumulated_hidden=None,
+    grad_norms=None,
+    stepsize=0.01,
+    one_hot_bows_vectors=None,
+    classifier=None,
+    class_label=None,
+    loss_type=0,
+    num_iterations=3,
+    horizon_length=1,
+    window_length=0,
+    decay=False,
+    gamma=1.5,
+    kl_scale=0.01,
+    device="cuda",
+):
+    # Generate inital perturbed past
+    grad_accumulator = [(np.zeros(p.shape).astype("float32")) for p in past]
+
+    if accumulated_hidden is None:
+        accumulated_hidden = 0
+
+    if decay:
+        decay_mask = torch.arange(0.0, 1.0 + SMALL_CONST, 1.0 / (window_length))[1:]
+    else:
+        decay_mask = 1.0
+
+    # TODO fix this comment (SUMANTH)
+    # Generate a mask is gradient perturbated is based on a past window
+    _, _, _, curr_length, _ = past[0].shape
+
+    if curr_length > window_length and window_length > 0:
+        ones_key_val_shape = tuple(past[0].shape[:-2]) + tuple([window_length]) + tuple(past[0].shape[-1:])
+
+        zeros_key_val_shape = (
+            tuple(past[0].shape[:-2]) + tuple([curr_length - window_length]) + tuple(past[0].shape[-1:])
+        )
+
+        ones_mask = torch.ones(ones_key_val_shape)
+        ones_mask = decay_mask * ones_mask.permute(0, 1, 2, 4, 3)
+        ones_mask = ones_mask.permute(0, 1, 2, 4, 3)
+
+        window_mask = torch.cat((ones_mask, torch.zeros(zeros_key_val_shape)), dim=-2).to(device)
+    else:
+        window_mask = torch.ones_like(past[0]).to(device)
+
+    # accumulate perturbations for num_iterations
+    loss_per_iter = []
+    new_accumulated_hidden = None
+    for i in range(num_iterations):
+        print("Iteration ", i + 1)
+        curr_perturbation = [
+            to_var(torch.from_numpy(p_), requires_grad=True, device=device) for p_ in grad_accumulator
+        ]
+
+        # Compute hidden using perturbed past
+        perturbed_past = list(map(add, past, curr_perturbation))
+        _, _, _, curr_length, _ = curr_perturbation[0].shape
+        all_logits, _, all_hidden = model(last, past=perturbed_past)
+        hidden = all_hidden[-1]
+        new_accumulated_hidden = accumulated_hidden + torch.sum(hidden, dim=1).detach()
+        # TODO: Check the layer-norm consistency of this with trained discriminator (Sumanth)
+        logits = all_logits[:, -1, :]
+        probs = F.softmax(logits, dim=-1)
+
+        loss = 0.0
+        loss_list = []
+        if loss_type == PPLM_BOW or loss_type == PPLM_BOW_DISCRIM:
+            for one_hot_bow in one_hot_bows_vectors:
+                bow_logits = torch.mm(probs, torch.t(one_hot_bow))
+                bow_loss = -torch.log(torch.sum(bow_logits))
+                loss += bow_loss
+                loss_list.append(bow_loss)
+            print(" pplm_bow_loss:", loss.data.cpu().numpy())
+
+        if loss_type == 2 or loss_type == 3:
+            ce_loss = torch.nn.CrossEntropyLoss()
+            # TODO why we need to do this assignment and not just using unpert_past? (Sumanth)
+            curr_unpert_past = unpert_past
+            curr_probs = torch.unsqueeze(probs, dim=1)
+            wte = model.resize_token_embeddings()
+            for _ in range(horizon_length):
+                inputs_embeds = torch.matmul(curr_probs, wte.weight.data)
+                _, curr_unpert_past, curr_all_hidden = model(past=curr_unpert_past, inputs_embeds=inputs_embeds)
+                curr_hidden = curr_all_hidden[-1]
+                new_accumulated_hidden = new_accumulated_hidden + torch.sum(curr_hidden, dim=1)
+
+            prediction = classifier(new_accumulated_hidden / (curr_length + 1 + horizon_length))
+
+            label = torch.tensor(prediction.shape[0] * [class_label], device=device, dtype=torch.long)
+            discrim_loss = ce_loss(prediction, label)
+            print(" pplm_discrim_loss:", discrim_loss.data.cpu().numpy())
+            loss += discrim_loss
+            loss_list.append(discrim_loss)
+
+        kl_loss = 0.0
+        if kl_scale > 0.0:
+            unpert_probs = F.softmax(unpert_logits[:, -1, :], dim=-1)
+            unpert_probs = unpert_probs + SMALL_CONST * (unpert_probs <= SMALL_CONST).float().to(device).detach()
+            correction = SMALL_CONST * (probs <= SMALL_CONST).float().to(device).detach()
+            corrected_probs = probs + correction.detach()
+            kl_loss = kl_scale * ((corrected_probs * (corrected_probs / unpert_probs).log()).sum())
+            print(" kl_loss", kl_loss.data.cpu().numpy())
+            loss += kl_loss
+
+        loss_per_iter.append(loss.data.cpu().numpy())
+        print(" pplm_loss", (loss - kl_loss).data.cpu().numpy())
+
+        # compute gradients
+        loss.backward()
+
+        # calculate gradient norms
+        if grad_norms is not None and loss_type == PPLM_BOW:
+            grad_norms = [
+                torch.max(grad_norms[index], torch.norm(p_.grad * window_mask))
+                for index, p_ in enumerate(curr_perturbation)
+            ]
+        else:
+            grad_norms = [
+                (torch.norm(p_.grad * window_mask) + SMALL_CONST) for index, p_ in enumerate(curr_perturbation)
+            ]
+
+        # normalize gradients
+        grad = [
+            -stepsize * (p_.grad * window_mask / grad_norms[index] ** gamma).data.cpu().numpy()
+            for index, p_ in enumerate(curr_perturbation)
+        ]
+
+        # accumulate gradient
+        grad_accumulator = list(map(add, grad, grad_accumulator))
+
+        # reset gradients, just to make sure
+        for p_ in curr_perturbation:
+            p_.grad.data.zero_()
+
+        # removing past from the graph
+        new_past = []
+        for p_ in past:
+            new_past.append(p_.detach())
+        past = new_past
+
+    # apply the accumulated perturbations to the past
+    grad_accumulator = [to_var(torch.from_numpy(p_), requires_grad=True, device=device) for p_ in grad_accumulator]
+    pert_past = list(map(add, past, grad_accumulator))
+
+    return pert_past, new_accumulated_hidden, grad_norms, loss_per_iter
+
+
+def get_classifier(
+    name: Optional[str], class_label: Union[str, int], device: str
+) -> Tuple[Optional[ClassificationHead], Optional[int]]:
+    if name is None:
+        return None, None
+
+    params = DISCRIMINATOR_MODELS_PARAMS[name]
+    classifier = ClassificationHead(class_size=params["class_size"], embed_size=params["embed_size"]).to(device)
+    if "url" in params:
+        resolved_archive_file = cached_path(params["url"])
+    elif "path" in params:
+        resolved_archive_file = params["path"]
+    else:
+        raise ValueError("Either url or path have to be specified " "in the discriminator model parameters")
+    classifier.load_state_dict(torch.load(resolved_archive_file, map_location=device))
+    classifier.eval()
+
+    if isinstance(class_label, str):
+        if class_label in params["class_vocab"]:
+            label_id = params["class_vocab"][class_label]
+        else:
+            label_id = params["default_class"]
+            print("class_label {} not in class_vocab".format(class_label))
+            print("available values are: {}".format(params["class_vocab"]))
+            print("using default class {}".format(label_id))
+
+    elif isinstance(class_label, int):
+        if class_label in set(params["class_vocab"].values()):
+            label_id = class_label
+        else:
+            label_id = params["default_class"]
+            print("class_label {} not in class_vocab".format(class_label))
+            print("available values are: {}".format(params["class_vocab"]))
+            print("using default class {}".format(label_id))
+
+    else:
+        label_id = params["default_class"]
+
+    return classifier, label_id
+
+
+def get_bag_of_words_indices(bag_of_words_ids_or_paths: List[str], tokenizer) -> List[List[List[int]]]:
+    bow_indices = []
+    for id_or_path in bag_of_words_ids_or_paths:
+        if id_or_path in BAG_OF_WORDS_ARCHIVE_MAP:
+            filepath = cached_path(BAG_OF_WORDS_ARCHIVE_MAP[id_or_path])
+        else:
+            filepath = id_or_path
+        with open(filepath, "r") as f:
+            words = f.read().strip().split("\n")
+        bow_indices.append([tokenizer.encode(word.strip(), add_prefix_space=True) for word in words])
+    return bow_indices
+
+
+def build_bows_one_hot_vectors(bow_indices, tokenizer, device="cuda"):
+    if bow_indices is None:
+        return None
+
+    one_hot_bows_vectors = []
+    for single_bow in bow_indices:
+        single_bow = list(filter(lambda x: len(x) <= 1, single_bow))
+        single_bow = torch.tensor(single_bow).to(device)
+        num_words = single_bow.shape[0]
+        one_hot_bow = torch.zeros(num_words, tokenizer.vocab_size).to(device)
+        one_hot_bow.scatter_(1, single_bow, 1)
+        one_hot_bows_vectors.append(one_hot_bow)
+    return one_hot_bows_vectors
+
+
+def full_text_generation(
+    model,
+    tokenizer,
+    context=None,
+    num_samples=1,
+    device="cuda",
+    bag_of_words=None,
+    discrim=None,
+    class_label=None,
+    length=100,
+    stepsize=0.02,
+    temperature=1.0,
+    top_k=10,
+    sample=False,
+    num_iterations=3,
+    grad_length=10000,
+    horizon_length=1,
+    window_length=0,
+    decay=False,
+    gamma=1.5,
+    gm_scale=0.9,
+    kl_scale=0.01,
+    repetition_penalty=1.0,
+    **kwargs
+):
+    classifier, class_id = get_classifier(discrim, class_label, device)
+
+    bow_indices = []
+    if bag_of_words:
+        bow_indices = get_bag_of_words_indices(bag_of_words.split(";"), tokenizer)
+
+    if bag_of_words and classifier:
+        print("Both PPLM-BoW and PPLM-Discrim are on. This is not optimized.")
+        loss_type = PPLM_BOW_DISCRIM
+
+    elif bag_of_words:
+        loss_type = PPLM_BOW
+        print("Using PPLM-BoW")
+
+    elif classifier is not None:
+        loss_type = PPLM_DISCRIM
+        print("Using PPLM-Discrim")
+
+    else:
+        raise Exception("Specify either a bag of words or a discriminator")
+
+    unpert_gen_tok_text, _, _ = generate_text_pplm(
+        model=model,
+        tokenizer=tokenizer,
+        context=context,
+        device=device,
+        length=length,
+        sample=sample,
+        perturb=False,
+        repetition_penalty=repetition_penalty,
+    )
+    if device == "cuda":
+        torch.cuda.empty_cache()
+
+    pert_gen_tok_texts = []
+    discrim_losses = []
+    losses_in_time = []
+
+    for i in range(num_samples):
+        pert_gen_tok_text, discrim_loss, loss_in_time = generate_text_pplm(
+            model=model,
+            tokenizer=tokenizer,
+            context=context,
+            device=device,
+            perturb=True,
+            bow_indices=bow_indices,
+            classifier=classifier,
+            class_label=class_id,
+            loss_type=loss_type,
+            length=length,
+            stepsize=stepsize,
+            temperature=temperature,
+            top_k=top_k,
+            sample=sample,
+            num_iterations=num_iterations,
+            grad_length=grad_length,
+            horizon_length=horizon_length,
+            window_length=window_length,
+            decay=decay,
+            gamma=gamma,
+            gm_scale=gm_scale,
+            kl_scale=kl_scale,
+            repetition_penalty=repetition_penalty,
+        )
+        pert_gen_tok_texts.append(pert_gen_tok_text)
+        if classifier is not None:
+            discrim_losses.append(discrim_loss.data.cpu().numpy())
+        losses_in_time.append(loss_in_time)
+
+    if device == "cuda":
+        torch.cuda.empty_cache()
+
+    return unpert_gen_tok_text, pert_gen_tok_texts, discrim_losses, losses_in_time
+
+
+def generate_text_pplm(
+    model,
+    tokenizer,
+    context=None,
+    past=None,
+    device="cuda",
+    perturb=True,
+    bow_indices=None,
+    classifier=None,
+    class_label=None,
+    loss_type=0,
+    length=100,
+    stepsize=0.02,
+    temperature=1.0,
+    top_k=10,
+    sample=False,
+    num_iterations=3,
+    grad_length=10000,
+    horizon_length=1,
+    window_length=0,
+    decay=False,
+    gamma=1.5,
+    gm_scale=0.9,
+    kl_scale=0.01,
+    repetition_penalty=1.0,
+):
+    output_so_far = None
+    if context:
+        context_t = torch.tensor(context, device=device, dtype=torch.long)
+        while len(context_t.shape) < 2:
+            context_t = context_t.unsqueeze(0)
+        output_so_far = context_t
+
+    # collect one hot vectors for bags of words
+    one_hot_bows_vectors = build_bows_one_hot_vectors(bow_indices, tokenizer, device)
+
+    grad_norms = None
+    last = None
+    unpert_discrim_loss = 0
+    loss_in_time = []
+    for i in trange(length, ascii=True):
+
+        # Get past/probs for current output, except for last word
+        # Note that GPT takes 2 inputs: past + current_token
+
+        # run model forward to obtain unperturbed
+        if past is None and output_so_far is not None:
+            last = output_so_far[:, -1:]
+            if output_so_far.shape[1] > 1:
+                _, past, _ = model(output_so_far[:, :-1])
+
+        unpert_logits, unpert_past, unpert_all_hidden = model(output_so_far)
+        unpert_last_hidden = unpert_all_hidden[-1]
+
+        # check if we are abowe grad max length
+        if i >= grad_length:
+            current_stepsize = stepsize * 0
+        else:
+            current_stepsize = stepsize
+
+        # modify the past if necessary
+        if not perturb or num_iterations == 0:
+            pert_past = past
+
+        else:
+            accumulated_hidden = unpert_last_hidden[:, :-1, :]
+            accumulated_hidden = torch.sum(accumulated_hidden, dim=1)
+
+            if past is not None:
+                pert_past, _, grad_norms, loss_this_iter = perturb_past(
+                    past,
+                    model,
+                    last,
+                    unpert_past=unpert_past,
+                    unpert_logits=unpert_logits,
+                    accumulated_hidden=accumulated_hidden,
+                    grad_norms=grad_norms,
+                    stepsize=current_stepsize,
+                    one_hot_bows_vectors=one_hot_bows_vectors,
+                    classifier=classifier,
+                    class_label=class_label,
+                    loss_type=loss_type,
+                    num_iterations=num_iterations,
+                    horizon_length=horizon_length,
+                    window_length=window_length,
+                    decay=decay,
+                    gamma=gamma,
+                    kl_scale=kl_scale,
+                    device=device,
+                )
+                loss_in_time.append(loss_this_iter)
+            else:
+                pert_past = past
+
+        pert_logits, past, pert_all_hidden = model(last, past=pert_past)
+        pert_logits = pert_logits[:, -1, :] / temperature  # + SMALL_CONST
+
+        for token_idx in set(output_so_far[0].tolist()):
+            if pert_logits[0, token_idx] < 0:
+                pert_logits[0, token_idx] *= repetition_penalty
+            else:
+                pert_logits[0, token_idx] /= repetition_penalty
+
+        pert_probs = F.softmax(pert_logits, dim=-1)
+
+        if classifier is not None:
+            ce_loss = torch.nn.CrossEntropyLoss()
+            prediction = classifier(torch.mean(unpert_last_hidden, dim=1))
+            label = torch.tensor([class_label], device=device, dtype=torch.long)
+            unpert_discrim_loss = ce_loss(prediction, label)
+            print("unperturbed discrim loss", unpert_discrim_loss.data.cpu().numpy())
+        else:
+            unpert_discrim_loss = 0
+
+        # Fuse the modified model and original model
+        if perturb:
+
+            unpert_probs = F.softmax(unpert_logits[:, -1, :], dim=-1)
+
+            pert_probs = (pert_probs ** gm_scale) * (unpert_probs ** (1 - gm_scale))  # + SMALL_CONST
+            pert_probs = top_k_filter(pert_probs, k=top_k, probs=True)  # + SMALL_CONST
+
+            # rescale
+            if torch.sum(pert_probs) <= 1:
+                pert_probs = pert_probs / torch.sum(pert_probs)
+
+        else:
+            pert_logits = top_k_filter(pert_logits, k=top_k)  # + SMALL_CONST
+            pert_probs = F.softmax(pert_logits, dim=-1)
+
+        # sample or greedy
+        if sample:
+            last = torch.multinomial(pert_probs, num_samples=1)
+
+        else:
+            _, last = torch.topk(pert_probs, k=1, dim=-1)
+
+        # update context/output_so_far appending the new token
+        output_so_far = last if output_so_far is None else torch.cat((output_so_far, last), dim=1)
+
+        print(tokenizer.decode(output_so_far.tolist()[0]))
+
+    return output_so_far, unpert_discrim_loss, loss_in_time
+
+
+def set_generic_model_params(discrim_weights, discrim_meta):
+    if discrim_weights is None:
+        raise ValueError("When using a generic discriminator, " "discrim_weights need to be specified")
+    if discrim_meta is None:
+        raise ValueError("When using a generic discriminator, " "discrim_meta need to be specified")
+
+    with open(discrim_meta, "r") as discrim_meta_file:
+        meta = json.load(discrim_meta_file)
+    meta["path"] = discrim_weights
+    DISCRIMINATOR_MODELS_PARAMS["generic"] = meta
+
+
+def run_pplm_example(
+    pretrained_model="gpt2-medium",
+    cond_text="",
+    uncond=False,
+    num_samples=1,
+    bag_of_words=None,
+    discrim=None,
+    discrim_weights=None,
+    discrim_meta=None,
+    class_label=-1,
+    length=100,
+    stepsize=0.02,
+    temperature=1.0,
+    top_k=10,
+    sample=False,
+    num_iterations=3,
+    grad_length=10000,
+    horizon_length=1,
+    window_length=0,
+    decay=False,
+    gamma=1.5,
+    gm_scale=0.9,
+    kl_scale=0.01,
+    seed=0,
+    no_cuda=False,
+    colorama=False,
+    repetition_penalty=1.0,
+):
+    # set Random seed
+    torch.manual_seed(seed)
+    np.random.seed(seed)
+
+    # set the device
+    device = "cuda" if torch.cuda.is_available() and not no_cuda else "cpu"
+
+    if discrim == "generic":
+        set_generic_model_params(discrim_weights, discrim_meta)
+
+    if discrim is not None:
+        pretrained_model = DISCRIMINATOR_MODELS_PARAMS[discrim]["pretrained_model"]
+        print("discrim = {}, pretrained_model set " "to discriminator's = {}".format(discrim, pretrained_model))
+
+    # load pretrained model
+    model = GPT2LMHeadModel.from_pretrained(pretrained_model, output_hidden_states=True)
+    model.to(device)
+    model.eval()
+
+    # load tokenizer
+    tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model)
+
+    # Freeze GPT-2 weights
+    for param in model.parameters():
+        param.requires_grad = False
+
+    # figure out conditioning text
+    if uncond:
+        tokenized_cond_text = tokenizer.encode([tokenizer.bos_token])
+    else:
+        raw_text = cond_text
+        while not raw_text:
+            print("Did you forget to add `--cond_text`? ")
+            raw_text = input("Model prompt >>> ")
+        tokenized_cond_text = tokenizer.encode(tokenizer.bos_token + raw_text)
+
+    print("= Prefix of sentence =")
+    print(tokenizer.decode(tokenized_cond_text))
+    print()
+
+    # generate unperturbed and perturbed texts
+
+    # full_text_generation returns:
+    # unpert_gen_tok_text, pert_gen_tok_texts, discrim_losses, losses_in_time
+    unpert_gen_tok_text, pert_gen_tok_texts, _, _ = full_text_generation(
+        model=model,
+        tokenizer=tokenizer,
+        context=tokenized_cond_text,
+        device=device,
+        num_samples=num_samples,
+        bag_of_words=bag_of_words,
+        discrim=discrim,
+        class_label=class_label,
+        length=length,
+        stepsize=stepsize,
+        temperature=temperature,
+        top_k=top_k,
+        sample=sample,
+        num_iterations=num_iterations,
+        grad_length=grad_length,
+        horizon_length=horizon_length,
+        window_length=window_length,
+        decay=decay,
+        gamma=gamma,
+        gm_scale=gm_scale,
+        kl_scale=kl_scale,
+        repetition_penalty=repetition_penalty,
+    )
+
+    # untokenize unperturbed text
+    unpert_gen_text = tokenizer.decode(unpert_gen_tok_text.tolist()[0])
+
+    print("=" * 80)
+    print("= Unperturbed generated text =")
+    print(unpert_gen_text)
+    print()
+
+    generated_texts = []
+
+    bow_word_ids = set()
+    if bag_of_words and colorama:
+        bow_indices = get_bag_of_words_indices(bag_of_words.split(";"), tokenizer)
+        for single_bow_list in bow_indices:
+            # filtering all words in the list composed of more than 1 token
+            filtered = list(filter(lambda x: len(x) <= 1, single_bow_list))
+            # w[0] because we are sure w has only 1 item because previous fitler
+            bow_word_ids.update(w[0] for w in filtered)
+
+    # iterate through the perturbed texts
+    for i, pert_gen_tok_text in enumerate(pert_gen_tok_texts):
+        try:
+            # untokenize unperturbed text
+            if colorama:
+                import colorama
+
+                pert_gen_text = ""
+                for word_id in pert_gen_tok_text.tolist()[0]:
+                    if word_id in bow_word_ids:
+                        pert_gen_text += "{}{}{}".format(
+                            colorama.Fore.RED, tokenizer.decode([word_id]), colorama.Style.RESET_ALL
+                        )
+                    else:
+                        pert_gen_text += tokenizer.decode([word_id])
+            else:
+                pert_gen_text = tokenizer.decode(pert_gen_tok_text.tolist()[0])
+
+            print("= Perturbed generated text {} =".format(i + 1))
+            print(pert_gen_text)
+            print()
+        except Exception as exc:
+            print("Ignoring error while generating perturbed text:", exc)
+
+        # keep the prefix, perturbed seq, original seq for each index
+        generated_texts.append((tokenized_cond_text, pert_gen_tok_text, unpert_gen_tok_text))
+
+    return
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--pretrained_model",
+        "-M",
+        type=str,
+        default="gpt2-medium",
+        help="pretrained model name or path to local checkpoint",
+    )
+    parser.add_argument("--cond_text", type=str, default="The lake", help="Prefix texts to condition on")
+    parser.add_argument("--uncond", action="store_true", help="Generate from end-of-text as prefix")
+    parser.add_argument(
+        "--num_samples", type=int, default=1, help="Number of samples to generate from the modified latents",
+    )
+    parser.add_argument(
+        "--bag_of_words",
+        "-B",
+        type=str,
+        default=None,
+        help="Bags of words used for PPLM-BoW. "
+        "Either a BOW id (see list in code) or a filepath. "
+        "Multiple BoWs separated by ;",
+    )
+    parser.add_argument(
+        "--discrim",
+        "-D",
+        type=str,
+        default=None,
+        choices=("clickbait", "sentiment", "toxicity", "generic"),
+        help="Discriminator to use",
+    )
+    parser.add_argument("--discrim_weights", type=str, default=None, help="Weights for the generic discriminator")
+    parser.add_argument(
+        "--discrim_meta", type=str, default=None, help="Meta information for the generic discriminator"
+    )
+    parser.add_argument(
+        "--class_label", type=int, default=-1, help="Class label used for the discriminator",
+    )
+    parser.add_argument("--length", type=int, default=100)
+    parser.add_argument("--stepsize", type=float, default=0.02)
+    parser.add_argument("--temperature", type=float, default=1.0)
+    parser.add_argument("--top_k", type=int, default=10)
+    parser.add_argument("--sample", action="store_true", help="Generate from end-of-text as prefix")
+    parser.add_argument("--num_iterations", type=int, default=3)
+    parser.add_argument("--grad_length", type=int, default=10000)
+    parser.add_argument(
+        "--window_length",
+        type=int,
+        default=0,
+        help="Length of past which is being optimized; " "0 corresponds to infinite window length",
+    )
+    parser.add_argument(
+        "--horizon_length", type=int, default=1, help="Length of future to optimize over",
+    )
+    parser.add_argument("--decay", action="store_true", help="whether to decay or not")
+    parser.add_argument("--gamma", type=float, default=1.5)
+    parser.add_argument("--gm_scale", type=float, default=0.9)
+    parser.add_argument("--kl_scale", type=float, default=0.01)
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument("--no_cuda", action="store_true", help="no cuda")
+    parser.add_argument("--colorama", action="store_true", help="colors keywords")
+    parser.add_argument(
+        "--repetition_penalty", type=float, default=1.0, help="Penalize repetition. More than 1.0 -> less repetition",
+    )
+
+    args = parser.parse_args()
+    run_pplm_example(**vars(args))
--- a/examples/pplm/run_pplm_discrim_train.py
+++ b/examples/pplm/run_pplm_discrim_train.py
@@ -0,0 +1,517 @@
+#! /usr/bin/env python3
+# coding=utf-8
+
+# Copyright (c) 2019 Uber Technologies, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import csv
+import json
+import math
+import time
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.optim as optim
+import torch.utils.data as data
+from nltk.tokenize.treebank import TreebankWordDetokenizer
+from torchtext import data as torchtext_data
+from torchtext import datasets
+from tqdm import tqdm, trange
+
+from pplm_classification_head import ClassificationHead
+from transformers import GPT2LMHeadModel, GPT2Tokenizer
+
+
+torch.manual_seed(0)
+np.random.seed(0)
+EPSILON = 1e-10
+example_sentence = "This is incredible! I love it, this is the best chicken I have ever had."
+max_length_seq = 100
+
+
+class Discriminator(torch.nn.Module):
+    """Transformer encoder followed by a Classification Head"""
+
+    def __init__(self, class_size, pretrained_model="gpt2-medium", cached_mode=False, device="cpu"):
+        super().__init__()
+        self.tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model)
+        self.encoder = GPT2LMHeadModel.from_pretrained(pretrained_model)
+        self.embed_size = self.encoder.transformer.config.hidden_size
+        self.classifier_head = ClassificationHead(class_size=class_size, embed_size=self.embed_size)
+        self.cached_mode = cached_mode
+        self.device = device
+
+    def get_classifier(self):
+        return self.classifier_head
+
+    def train_custom(self):
+        for param in self.encoder.parameters():
+            param.requires_grad = False
+        self.classifier_head.train()
+
+    def avg_representation(self, x):
+        mask = x.ne(0).unsqueeze(2).repeat(1, 1, self.embed_size).float().to(self.device).detach()
+        hidden, _ = self.encoder.transformer(x)
+        masked_hidden = hidden * mask
+        avg_hidden = torch.sum(masked_hidden, dim=1) / (torch.sum(mask, dim=1).detach() + EPSILON)
+        return avg_hidden
+
+    def forward(self, x):
+        if self.cached_mode:
+            avg_hidden = x.to(self.device)
+        else:
+            avg_hidden = self.avg_representation(x.to(self.device))
+
+        logits = self.classifier_head(avg_hidden)
+        probs = F.log_softmax(logits, dim=-1)
+
+        return probs
+
+
+class Dataset(data.Dataset):
+    def __init__(self, X, y):
+        """Reads source and target sequences from txt files."""
+        self.X = X
+        self.y = y
+
+    def __len__(self):
+        return len(self.X)
+
+    def __getitem__(self, index):
+        """Returns one data pair (source and target)."""
+        data = {}
+        data["X"] = self.X[index]
+        data["y"] = self.y[index]
+        return data
+
+
+def collate_fn(data):
+    def pad_sequences(sequences):
+        lengths = [len(seq) for seq in sequences]
+
+        padded_sequences = torch.zeros(len(sequences), max(lengths)).long()  # padding value = 0
+
+        for i, seq in enumerate(sequences):
+            end = lengths[i]
+            padded_sequences[i, :end] = seq[:end]
+
+        return padded_sequences, lengths
+
+    item_info = {}
+    for key in data[0].keys():
+        item_info[key] = [d[key] for d in data]
+
+    x_batch, _ = pad_sequences(item_info["X"])
+    y_batch = torch.tensor(item_info["y"], dtype=torch.long)
+
+    return x_batch, y_batch
+
+
+def cached_collate_fn(data):
+    item_info = {}
+    for key in data[0].keys():
+        item_info[key] = [d[key] for d in data]
+
+    x_batch = torch.cat(item_info["X"], 0)
+    y_batch = torch.tensor(item_info["y"], dtype=torch.long)
+
+    return x_batch, y_batch
+
+
+def train_epoch(data_loader, discriminator, optimizer, epoch=0, log_interval=10, device="cpu"):
+    samples_so_far = 0
+    discriminator.train_custom()
+    for batch_idx, (input_t, target_t) in enumerate(data_loader):
+        input_t, target_t = input_t.to(device), target_t.to(device)
+
+        optimizer.zero_grad()
+
+        output_t = discriminator(input_t)
+        loss = F.nll_loss(output_t, target_t)
+        loss.backward(retain_graph=True)
+        optimizer.step()
+
+        samples_so_far += len(input_t)
+
+        if batch_idx % log_interval == 0:
+            print(
+                "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
+                    epoch + 1,
+                    samples_so_far,
+                    len(data_loader.dataset),
+                    100 * samples_so_far / len(data_loader.dataset),
+                    loss.item(),
+                )
+            )
+
+
+def evaluate_performance(data_loader, discriminator, device="cpu"):
+    discriminator.eval()
+    test_loss = 0
+    correct = 0
+    with torch.no_grad():
+        for input_t, target_t in data_loader:
+            input_t, target_t = input_t.to(device), target_t.to(device)
+            output_t = discriminator(input_t)
+            # sum up batch loss
+            test_loss += F.nll_loss(output_t, target_t, reduction="sum").item()
+            # get the index of the max log-probability
+            pred_t = output_t.argmax(dim=1, keepdim=True)
+            correct += pred_t.eq(target_t.view_as(pred_t)).sum().item()
+
+    test_loss /= len(data_loader.dataset)
+
+    print(
+        "Performance on test set: "
+        "Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)".format(
+            test_loss, correct, len(data_loader.dataset), 100.0 * correct / len(data_loader.dataset)
+        )
+    )
+
+
+def predict(input_sentence, model, classes, cached=False, device="cpu"):
+    input_t = model.tokenizer.encode(input_sentence)
+    input_t = torch.tensor([input_t], dtype=torch.long, device=device)
+    if cached:
+        input_t = model.avg_representation(input_t)
+
+    log_probs = model(input_t).data.cpu().numpy().flatten().tolist()
+    print("Input sentence:", input_sentence)
+    print(
+        "Predictions:",
+        ", ".join("{}: {:.4f}".format(c, math.exp(log_prob)) for c, log_prob in zip(classes, log_probs)),
+    )
+
+
+def get_cached_data_loader(dataset, batch_size, discriminator, shuffle=False, device="cpu"):
+    data_loader = torch.utils.data.DataLoader(dataset=dataset, batch_size=batch_size, collate_fn=collate_fn)
+
+    xs = []
+    ys = []
+    for batch_idx, (x, y) in enumerate(tqdm(data_loader, ascii=True)):
+        with torch.no_grad():
+            x = x.to(device)
+            avg_rep = discriminator.avg_representation(x).cpu().detach()
+            avg_rep_list = torch.unbind(avg_rep.unsqueeze(1))
+            xs += avg_rep_list
+            ys += y.cpu().numpy().tolist()
+
+    data_loader = torch.utils.data.DataLoader(
+        dataset=Dataset(xs, ys), batch_size=batch_size, shuffle=shuffle, collate_fn=cached_collate_fn
+    )
+
+    return data_loader
+
+
+def train_discriminator(
+    dataset,
+    dataset_fp=None,
+    pretrained_model="gpt2-medium",
+    epochs=10,
+    batch_size=64,
+    log_interval=10,
+    save_model=False,
+    cached=False,
+    no_cuda=False,
+):
+    device = "cuda" if torch.cuda.is_available() and not no_cuda else "cpu"
+
+    print("Preprocessing {} dataset...".format(dataset))
+    start = time.time()
+
+    if dataset == "SST":
+        idx2class = ["positive", "negative", "very positive", "very negative", "neutral"]
+        class2idx = {c: i for i, c in enumerate(idx2class)}
+
+        discriminator = Discriminator(
+            class_size=len(idx2class), pretrained_model=pretrained_model, cached_mode=cached, device=device
+        ).to(device)
+
+        text = torchtext_data.Field()
+        label = torchtext_data.Field(sequential=False)
+        train_data, val_data, test_data = datasets.SST.splits(text, label, fine_grained=True, train_subtrees=True,)
+
+        x = []
+        y = []
+        for i in trange(len(train_data), ascii=True):
+            seq = TreebankWordDetokenizer().detokenize(vars(train_data[i])["text"])
+            seq = discriminator.tokenizer.encode(seq)
+            seq = torch.tensor([50256] + seq, device=device, dtype=torch.long)
+            x.append(seq)
+            y.append(class2idx[vars(train_data[i])["label"]])
+        train_dataset = Dataset(x, y)
+
+        test_x = []
+        test_y = []
+        for i in trange(len(test_data), ascii=True):
+            seq = TreebankWordDetokenizer().detokenize(vars(test_data[i])["text"])
+            seq = discriminator.tokenizer.encode(seq)
+            seq = torch.tensor([50256] + seq, device=device, dtype=torch.long)
+            test_x.append(seq)
+            test_y.append(class2idx[vars(test_data[i])["label"]])
+        test_dataset = Dataset(test_x, test_y)
+
+        discriminator_meta = {
+            "class_size": len(idx2class),
+            "embed_size": discriminator.embed_size,
+            "pretrained_model": pretrained_model,
+            "class_vocab": class2idx,
+            "default_class": 2,
+        }
+
+    elif dataset == "clickbait":
+        idx2class = ["non_clickbait", "clickbait"]
+        class2idx = {c: i for i, c in enumerate(idx2class)}
+
+        discriminator = Discriminator(
+            class_size=len(idx2class), pretrained_model=pretrained_model, cached_mode=cached, device=device
+        ).to(device)
+
+        with open("datasets/clickbait/clickbait_train_prefix.txt") as f:
+            data = []
+            for i, line in enumerate(f):
+                try:
+                    data.append(eval(line))
+                except Exception:
+                    print("Error evaluating line {}: {}".format(i, line))
+                    continue
+        x = []
+        y = []
+        with open("datasets/clickbait/clickbait_train_prefix.txt") as f:
+            for i, line in enumerate(tqdm(f, ascii=True)):
+                try:
+                    d = eval(line)
+                    seq = discriminator.tokenizer.encode(d["text"])
+
+                    if len(seq) < max_length_seq:
+                        seq = torch.tensor([50256] + seq, device=device, dtype=torch.long)
+                    else:
+                        print("Line {} is longer than maximum length {}".format(i, max_length_seq))
+                        continue
+                    x.append(seq)
+                    y.append(d["label"])
+                except Exception:
+                    print("Error evaluating / tokenizing" " line {}, skipping it".format(i))
+                    pass
+
+        full_dataset = Dataset(x, y)
+        train_size = int(0.9 * len(full_dataset))
+        test_size = len(full_dataset) - train_size
+        train_dataset, test_dataset = torch.utils.data.random_split(full_dataset, [train_size, test_size])
+
+        discriminator_meta = {
+            "class_size": len(idx2class),
+            "embed_size": discriminator.embed_size,
+            "pretrained_model": pretrained_model,
+            "class_vocab": class2idx,
+            "default_class": 1,
+        }
+
+    elif dataset == "toxic":
+        idx2class = ["non_toxic", "toxic"]
+        class2idx = {c: i for i, c in enumerate(idx2class)}
+
+        discriminator = Discriminator(
+            class_size=len(idx2class), pretrained_model=pretrained_model, cached_mode=cached, device=device
+        ).to(device)
+
+        x = []
+        y = []
+        with open("datasets/toxic/toxic_train.txt") as f:
+            for i, line in enumerate(tqdm(f, ascii=True)):
+                try:
+                    d = eval(line)
+                    seq = discriminator.tokenizer.encode(d["text"])
+
+                    if len(seq) < max_length_seq:
+                        seq = torch.tensor([50256] + seq, device=device, dtype=torch.long)
+                    else:
+                        print("Line {} is longer than maximum length {}".format(i, max_length_seq))
+                        continue
+                    x.append(seq)
+                    y.append(int(np.sum(d["label"]) > 0))
+                except Exception:
+                    print("Error evaluating / tokenizing" " line {}, skipping it".format(i))
+                    pass
+
+        full_dataset = Dataset(x, y)
+        train_size = int(0.9 * len(full_dataset))
+        test_size = len(full_dataset) - train_size
+        train_dataset, test_dataset = torch.utils.data.random_split(full_dataset, [train_size, test_size])
+
+        discriminator_meta = {
+            "class_size": len(idx2class),
+            "embed_size": discriminator.embed_size,
+            "pretrained_model": pretrained_model,
+            "class_vocab": class2idx,
+            "default_class": 0,
+        }
+
+    else:  # if dataset == "generic":
+        # This assumes the input dataset is a TSV with the following structure:
+        # class \t text
+
+        if dataset_fp is None:
+            raise ValueError("When generic dataset is selected, " "dataset_fp needs to be specified aswell.")
+
+        classes = set()
+        with open(dataset_fp) as f:
+            csv_reader = csv.reader(f, delimiter="\t")
+            for row in tqdm(csv_reader, ascii=True):
+                if row:
+                    classes.add(row[0])
+
+        idx2class = sorted(classes)
+        class2idx = {c: i for i, c in enumerate(idx2class)}
+
+        discriminator = Discriminator(
+            class_size=len(idx2class), pretrained_model=pretrained_model, cached_mode=cached, device=device
+        ).to(device)
+
+        x = []
+        y = []
+        with open(dataset_fp) as f:
+            csv_reader = csv.reader(f, delimiter="\t")
+            for i, row in enumerate(tqdm(csv_reader, ascii=True)):
+                if row:
+                    label = row[0]
+                    text = row[1]
+
+                    try:
+                        seq = discriminator.tokenizer.encode(text)
+                        if len(seq) < max_length_seq:
+                            seq = torch.tensor([50256] + seq, device=device, dtype=torch.long)
+
+                        else:
+                            print("Line {} is longer than maximum length {}".format(i, max_length_seq))
+                            continue
+
+                        x.append(seq)
+                        y.append(class2idx[label])
+
+                    except Exception:
+                        print("Error tokenizing line {}, skipping it".format(i))
+                        pass
+
+        full_dataset = Dataset(x, y)
+        train_size = int(0.9 * len(full_dataset))
+        test_size = len(full_dataset) - train_size
+        train_dataset, test_dataset = torch.utils.data.random_split(full_dataset, [train_size, test_size])
+
+        discriminator_meta = {
+            "class_size": len(idx2class),
+            "embed_size": discriminator.embed_size,
+            "pretrained_model": pretrained_model,
+            "class_vocab": class2idx,
+            "default_class": 0,
+        }
+
+    end = time.time()
+    print("Preprocessed {} data points".format(len(train_dataset) + len(test_dataset)))
+    print("Data preprocessing took: {:.3f}s".format(end - start))
+
+    if cached:
+        print("Building representation cache...")
+
+        start = time.time()
+
+        train_loader = get_cached_data_loader(train_dataset, batch_size, discriminator, shuffle=True, device=device)
+
+        test_loader = get_cached_data_loader(test_dataset, batch_size, discriminator, device=device)
+
+        end = time.time()
+        print("Building representation cache took: {:.3f}s".format(end - start))
+
+    else:
+        train_loader = torch.utils.data.DataLoader(
+            dataset=train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn
+        )
+        test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, collate_fn=collate_fn)
+
+    if save_model:
+        with open("{}_classifier_head_meta.json".format(dataset), "w") as meta_file:
+            json.dump(discriminator_meta, meta_file)
+
+    optimizer = optim.Adam(discriminator.parameters(), lr=0.0001)
+
+    for epoch in range(epochs):
+        start = time.time()
+        print("\nEpoch", epoch + 1)
+
+        train_epoch(
+            discriminator=discriminator,
+            data_loader=train_loader,
+            optimizer=optimizer,
+            epoch=epoch,
+            log_interval=log_interval,
+            device=device,
+        )
+        evaluate_performance(data_loader=test_loader, discriminator=discriminator, device=device)
+
+        end = time.time()
+        print("Epoch took: {:.3f}s".format(end - start))
+
+        print("\nExample prediction")
+        predict(example_sentence, discriminator, idx2class, cached=cached, device=device)
+
+        if save_model:
+            # torch.save(discriminator.state_dict(),
+            #           "{}_discriminator_{}.pt".format(
+            #               args.dataset, epoch + 1
+            #               ))
+            torch.save(
+                discriminator.get_classifier().state_dict(),
+                "{}_classifier_head_epoch_{}.pt".format(dataset, epoch + 1),
+            )
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Train a discriminator on top of GPT-2 representations")
+    parser.add_argument(
+        "--dataset",
+        type=str,
+        default="SST",
+        choices=("SST", "clickbait", "toxic", "generic"),
+        help="dataset to train the discriminator on."
+        "In case of generic, the dataset is expected"
+        "to be a TSBV file with structure: class \\t text",
+    )
+    parser.add_argument(
+        "--dataset_fp",
+        type=str,
+        default="",
+        help="File path of the dataset to use. " "Needed only in case of generic datadset",
+    )
+    parser.add_argument(
+        "--pretrained_model", type=str, default="gpt2-medium", help="Pretrained model to use as encoder"
+    )
+    parser.add_argument("--epochs", type=int, default=10, metavar="N", help="Number of training epochs")
+    parser.add_argument(
+        "--batch_size", type=int, default=64, metavar="N", help="input batch size for training (default: 64)"
+    )
+    parser.add_argument(
+        "--log_interval",
+        type=int,
+        default=10,
+        metavar="N",
+        help="how many batches to wait before logging training status",
+    )
+    parser.add_argument("--save_model", action="store_true", help="whether to save the model")
+    parser.add_argument("--cached", action="store_true", help="whether to cache the input representations")
+    parser.add_argument("--no_cuda", action="store_true", help="use to turn off cuda")
+    args = parser.parse_args()
+
+    train_discriminator(**(vars(args)))
--- a/examples/requirements.txt
+++ b/examples/requirements.txt
@@ -1,2 +1,4 @@
 tensorboardX
-scikit-learn
+tensorboard
+scikit-learn
+seqeval
--- a/examples/run_bertology.py
+++ b/examples/run_bertology.py
@@ -19,28 +19,22 @@
    Some parts of this script are adapted from the code of Michel et al. (http://arxiv.org/abs/1905.10650)
    which is available at https://github.com/pmichel31415/are-16-heads-really-better-than-1
 """
-import os
 import argparse
 import logging
-from datetime import timedelta, datetime
-from tqdm import tqdm
+import os
+from datetime import datetime

 import numpy as np
-
 import torch
-from torch.utils.data import DataLoader, SequentialSampler, TensorDataset, Subset
+from torch.utils.data import DataLoader, SequentialSampler, Subset
 from torch.utils.data.distributed import DistributedSampler
-from torch.nn import CrossEntropyLoss, MSELoss
+from tqdm import tqdm

-from transformers import (WEIGHTS_NAME,
-                                  BertConfig, BertForSequenceClassification, BertTokenizer,
-                                  XLMConfig, XLMForSequenceClassification, XLMTokenizer,
-                                  XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer)
+from run_glue import ALL_MODELS, MODEL_CLASSES, load_and_cache_examples, set_seed
+from transformers import glue_compute_metrics as compute_metrics
+from transformers import glue_output_modes as output_modes
+from transformers import glue_processors as processors

-from run_glue import set_seed, load_and_cache_examples, ALL_MODELS, MODEL_CLASSES
-
-from utils_glue import (compute_metrics, convert_examples_to_features,
-                        output_modes, processors)

 logger = logging.getLogger(__name__)

@@ -62,7 +56,9 @@ def print_2d_tensor(tensor):
            logger.info(f"layer {row + 1}:\t" + "\t".join(f"{x:d}" for x in tensor[row].cpu().data))


-def compute_heads_importance(args, model, eval_dataloader, compute_entropy=True, compute_importance=True, head_mask=None):
+def compute_heads_importance(
+    args, model, eval_dataloader, compute_entropy=True, compute_importance=True, head_mask=None
+):
    """ This method shows how to compute:
        - head attention entropy
        - head importance scores according to http://arxiv.org/abs/1905.10650
@@ -84,8 +80,14 @@ def compute_heads_importance(args, model, eval_dataloader, compute_entropy=True,
        input_ids, input_mask, segment_ids, label_ids = batch

        # Do a forward pass (not with torch.no_grad() since we need gradients for importance score - see below)
-        outputs = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids, head_mask=head_mask)
-        loss, logits, all_attentions = outputs[0], outputs[1], outputs[-1]  # Loss and logits are the first, attention the last
+        outputs = model(
+            input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids, head_mask=head_mask
+        )
+        loss, logits, all_attentions = (
+            outputs[0],
+            outputs[1],
+            outputs[-1],
+        )  # Loss and logits are the first, attention the last
        loss.backward()  # Backpropagate to populate the gradients in the head mask

        if compute_entropy:
@@ -112,15 +114,15 @@ def compute_heads_importance(args, model, eval_dataloader, compute_entropy=True,
    # Layerwise importance normalization
    if not args.dont_normalize_importance_by_layer:
        exponent = 2
-        norm_by_layer = torch.pow(torch.pow(head_importance, exponent).sum(-1), 1/exponent)
+        norm_by_layer = torch.pow(torch.pow(head_importance, exponent).sum(-1), 1 / exponent)
        head_importance /= norm_by_layer.unsqueeze(-1) + 1e-20

    if not args.dont_normalize_global_importance:
        head_importance = (head_importance - head_importance.min()) / (head_importance.max() - head_importance.min())

    # Print/save matrices
-    np.save(os.path.join(args.output_dir, 'attn_entropy.npy'), attn_entropy.detach().cpu().numpy())
-    np.save(os.path.join(args.output_dir, 'head_importance.npy'), head_importance.detach().cpu().numpy())
+    np.save(os.path.join(args.output_dir, "attn_entropy.npy"), attn_entropy.detach().cpu().numpy())
+    np.save(os.path.join(args.output_dir, "head_importance.npy"), head_importance.detach().cpu().numpy())

    logger.info("Attention entropies")
    print_2d_tensor(attn_entropy)
@@ -128,7 +130,9 @@ def compute_heads_importance(args, model, eval_dataloader, compute_entropy=True,
    print_2d_tensor(head_importance)
    logger.info("Head ranked by importance scores")
    head_ranks = torch.zeros(head_importance.numel(), dtype=torch.long, device=args.device)
-    head_ranks[head_importance.view(-1).sort(descending=True)[1]] = torch.arange(head_importance.numel(), device=args.device)
+    head_ranks[head_importance.view(-1).sort(descending=True)[1]] = torch.arange(
+        head_importance.numel(), device=args.device
+    )
    head_ranks = head_ranks.view_as(head_importance)
    print_2d_tensor(head_ranks)

@@ -149,9 +153,9 @@ def mask_heads(args, model, eval_dataloader):

    current_score = original_score
    while current_score >= original_score * args.masking_threshold:
-        head_mask = new_head_mask.clone() # save current head mask
+        head_mask = new_head_mask.clone()  # save current head mask
        # heads from least important to most - keep only not-masked heads
-        head_importance[head_mask == 0.0] = float('Inf')
+        head_importance[head_mask == 0.0] = float("Inf")
        current_heads_to_mask = head_importance.view(-1).sort()[1]

        if len(current_heads_to_mask) <= num_to_mask:
@@ -166,14 +170,21 @@ def mask_heads(args, model, eval_dataloader):
        print_2d_tensor(new_head_mask)

        # Compute metric and head importance again
-        _, head_importance, preds, labels = compute_heads_importance(args, model, eval_dataloader, compute_entropy=False, head_mask=new_head_mask)
+        _, head_importance, preds, labels = compute_heads_importance(
+            args, model, eval_dataloader, compute_entropy=False, head_mask=new_head_mask
+        )
        preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
        current_score = compute_metrics(args.task_name, preds, labels)[args.metric_name]
-        logger.info("Masking: current score: %f, remaning heads %d (%.1f percents)", current_score, new_head_mask.sum(), new_head_mask.sum()/new_head_mask.numel() * 100)
+        logger.info(
+            "Masking: current score: %f, remaning heads %d (%.1f percents)",
+            current_score,
+            new_head_mask.sum(),
+            new_head_mask.sum() / new_head_mask.numel() * 100,
+        )

    logger.info("Final head mask")
    print_2d_tensor(head_mask)
-    np.save(os.path.join(args.output_dir, 'head_mask.npy'), head_mask.detach().cpu().numpy())
+    np.save(os.path.join(args.output_dir, "head_mask.npy"), head_mask.detach().cpu().numpy())

    return head_mask

@@ -185,8 +196,9 @@ def prune_heads(args, model, eval_dataloader, head_mask):
    # Try pruning and test time speedup
    # Pruning is like masking but we actually remove the masked weights
    before_time = datetime.now()
-    _, _, preds, labels = compute_heads_importance(args, model, eval_dataloader,
-                                                   compute_entropy=False, compute_importance=False, head_mask=head_mask)
+    _, _, preds, labels = compute_heads_importance(
+        args, model, eval_dataloader, compute_entropy=False, compute_importance=False, head_mask=head_mask
+    )
    preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
    score_masking = compute_metrics(args.task_name, preds, labels)[args.metric_name]
    original_time = datetime.now() - before_time
@@ -198,71 +210,127 @@ def prune_heads(args, model, eval_dataloader, head_mask):
    pruned_num_params = sum(p.numel() for p in model.parameters())

    before_time = datetime.now()
-    _, _, preds, labels = compute_heads_importance(args, model, eval_dataloader,
-                                                    compute_entropy=False, compute_importance=False, head_mask=None)
+    _, _, preds, labels = compute_heads_importance(
+        args, model, eval_dataloader, compute_entropy=False, compute_importance=False, head_mask=None
+    )
    preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
    score_pruning = compute_metrics(args.task_name, preds, labels)[args.metric_name]
    new_time = datetime.now() - before_time

-    logger.info("Pruning: original num of params: %.2e, after pruning %.2e (%.1f percents)", original_num_params, pruned_num_params, pruned_num_params/original_num_params * 100)
+    logger.info(
+        "Pruning: original num of params: %.2e, after pruning %.2e (%.1f percents)",
+        original_num_params,
+        pruned_num_params,
+        pruned_num_params / original_num_params * 100,
+    )
    logger.info("Pruning: score with masking: %f score with pruning: %f", score_masking, score_pruning)
-    logger.info("Pruning: speed ratio (new timing / original timing): %f percents", original_time/new_time * 100)
+    logger.info("Pruning: speed ratio (new timing / original timing): %f percents", original_time / new_time * 100)


 def main():
    parser = argparse.ArgumentParser()
-    ## Required parameters
-    parser.add_argument("--data_dir", default=None, type=str, required=True,
-                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
-    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
-                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(
-                            ALL_MODELS))
-    parser.add_argument("--task_name", default=None, type=str, required=True,
-                        help="The name of the task to train selected in the list: " + ", ".join(processors.keys()))
-    parser.add_argument("--output_dir", default=None, type=str, required=True,
-                        help="The output directory where the model predictions and checkpoints will be written.")
+    # Required parameters
+    parser.add_argument(
+        "--data_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The input data dir. Should contain the .tsv files (or other data files) for the task.",
+    )
+    parser.add_argument(
+        "--model_name_or_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
+    )
+    parser.add_argument(
+        "--task_name",
+        default=None,
+        type=str,
+        required=True,
+        help="The name of the task to train selected in the list: " + ", ".join(processors.keys()),
+    )
+    parser.add_argument(
+        "--output_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )

-    ## Other parameters
-    parser.add_argument("--config_name", default="", type=str,
-                        help="Pretrained config name or path if not the same as model_name_or_path")
-    parser.add_argument("--tokenizer_name", default="", type=str,
-                        help="Pretrained tokenizer name or path if not the same as model_name_or_path")
-    parser.add_argument("--cache_dir", default="", type=str,
-                        help="Where do you want to store the pre-trained models downloaded from s3")
-    parser.add_argument("--data_subset", type=int, default=-1,
-                        help="If > 0: limit the data to a subset of data_subset instances.")
-    parser.add_argument("--overwrite_output_dir", action='store_true',
-                        help="Whether to overwrite data in output directory")
+    # Other parameters
+    parser.add_argument(
+        "--config_name",
+        default="",
+        type=str,
+        help="Pretrained config name or path if not the same as model_name_or_path",
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        default="",
+        type=str,
+        help="Pretrained tokenizer name or path if not the same as model_name_or_path",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        default="",
+        type=str,
+        help="Where do you want to store the pre-trained models downloaded from s3",
+    )
+    parser.add_argument(
+        "--data_subset", type=int, default=-1, help="If > 0: limit the data to a subset of data_subset instances."
+    )
+    parser.add_argument(
+        "--overwrite_output_dir", action="store_true", help="Whether to overwrite data in output directory"
+    )
+    parser.add_argument(
+        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
+    )

-    parser.add_argument("--dont_normalize_importance_by_layer", action='store_true',
-                        help="Don't normalize importance score by layers")
-    parser.add_argument("--dont_normalize_global_importance", action='store_true',
-                        help="Don't normalize all importance scores between 0 and 1")
+    parser.add_argument(
+        "--dont_normalize_importance_by_layer", action="store_true", help="Don't normalize importance score by layers"
+    )
+    parser.add_argument(
+        "--dont_normalize_global_importance",
+        action="store_true",
+        help="Don't normalize all importance scores between 0 and 1",
+    )

-    parser.add_argument("--try_masking", action='store_true',
-                        help="Whether to try to mask head until a threshold of accuracy.")
-    parser.add_argument("--masking_threshold", default=0.9, type=float,
-                        help="masking threshold in term of metrics (stop masking when metric < threshold * original metric value).")
-    parser.add_argument("--masking_amount", default=0.1, type=float,
-                        help="Amount to heads to masking at each masking step.")
-    parser.add_argument("--metric_name", default="acc", type=str,
-                        help="Metric to use for head masking.")
+    parser.add_argument(
+        "--try_masking", action="store_true", help="Whether to try to mask head until a threshold of accuracy."
+    )
+    parser.add_argument(
+        "--masking_threshold",
+        default=0.9,
+        type=float,
+        help="masking threshold in term of metrics (stop masking when metric < threshold * original metric value).",
+    )
+    parser.add_argument(
+        "--masking_amount", default=0.1, type=float, help="Amount to heads to masking at each masking step."
+    )
+    parser.add_argument("--metric_name", default="acc", type=str, help="Metric to use for head masking.")

-    parser.add_argument("--max_seq_length", default=128, type=int,
-                        help="The maximum total input sequence length after WordPiece tokenization. \n"
-                             "Sequences longer than this will be truncated, sequences shorter padded.")
+    parser.add_argument(
+        "--max_seq_length",
+        default=128,
+        type=int,
+        help="The maximum total input sequence length after WordPiece tokenization. \n"
+        "Sequences longer than this will be truncated, sequences shorter padded.",
+    )
    parser.add_argument("--batch_size", default=1, type=int, help="Batch size.")

    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
-    parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available")
-    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
-    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
+    parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available")
+    parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
+    parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
+
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()
@@ -275,10 +343,10 @@ def main():
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        args.n_gpu = 1
-        torch.distributed.init_process_group(backend='nccl')  # Initializes the distributed backend
+        torch.distributed.init_process_group(backend="nccl")  # Initializes the distributed backend

    # Setup logging
-    logging.basicConfig(level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
+    logging.basicConfig(level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.info("device: {} n_gpu: {}, distributed: {}".format(args.device, args.n_gpu, bool(args.local_rank != -1)))

    # Set seeds
@@ -303,11 +371,23 @@ def main():
            args.model_type = key  # take the first match in model types
            break
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
-    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
-                                          num_labels=num_labels, finetuning_task=args.task_name,
-                                          output_attentions=True)
-    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path)
-    model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config)
+    config = config_class.from_pretrained(
+        args.config_name if args.config_name else args.model_name_or_path,
+        num_labels=num_labels,
+        finetuning_task=args.task_name,
+        output_attentions=True,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
+    tokenizer = tokenizer_class.from_pretrained(
+        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
+    model = model_class.from_pretrained(
+        args.model_name_or_path,
+        from_tf=bool(".ckpt" in args.model_name_or_path),
+        config=config,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )

    if args.local_rank == 0:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
@@ -315,14 +395,14 @@ def main():
    # Distributed and parallel training
    model.to(args.device)
    if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
-                                                          output_device=args.local_rank,
-                                                          find_unused_parameters=True)
+        model = torch.nn.parallel.DistributedDataParallel(
+            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
+        )
    elif args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Print/save training arguments
-    torch.save(args, os.path.join(args.output_dir, 'run_args.bin'))
+    torch.save(args, os.path.join(args.output_dir, "run_args.bin"))
    logger.info("Training/evaluation parameters %s", args)

    # Prepare dataset for the GLUE task
@@ -332,11 +412,9 @@ def main():
    eval_sampler = SequentialSampler(eval_data) if args.local_rank == -1 else DistributedSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size)

-
    # Compute head entropy and importance score
    compute_heads_importance(args, model, eval_dataloader)

-
    # Try head masking (set heads to zero until the score goes under a threshole)
    # and head pruning (remove masked heads and see the effect on the network)
    if args.try_masking and args.masking_threshold > 0.0 and args.masking_threshold < 1.0:
@@ -344,5 +422,5 @@ def main():
        prune_heads(args, model, eval_dataloader, head_mask)


-if __name__ == '__main__':
+if __name__ == "__main__":
    main()
--- a/examples/run_generation.py
+++ b/examples/run_generation.py
@@ -16,42 +16,44 @@
 # limitations under the License.
 """ Conditional text generation with the auto-regressive models of the library (GPT/GPT-2/CTRL/Transformer-XL/XLNet)
 """
-from __future__ import absolute_import, division, print_function, unicode_literals
+

 import argparse
 import logging
-from tqdm import trange

-import torch
-import torch.nn.functional as F
 import numpy as np
+import torch

-from transformers import GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig, XLMConfig, CTRLConfig
-
-from transformers import GPT2LMHeadModel, GPT2Tokenizer
-from transformers import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer
-from transformers import XLNetLMHeadModel, XLNetTokenizer
-from transformers import TransfoXLLMHeadModel, TransfoXLTokenizer
-from transformers import CTRLLMHeadModel, CTRLTokenizer
-from transformers import XLMWithLMHeadModel, XLMTokenizer
+from transformers import (
+    CTRLLMHeadModel,
+    CTRLTokenizer,
+    GPT2LMHeadModel,
+    GPT2Tokenizer,
+    OpenAIGPTLMHeadModel,
+    OpenAIGPTTokenizer,
+    TransfoXLLMHeadModel,
+    TransfoXLTokenizer,
+    XLMTokenizer,
+    XLMWithLMHeadModel,
+    XLNetLMHeadModel,
+    XLNetTokenizer,
+)


-logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
-                    datefmt = '%m/%d/%Y %H:%M:%S',
-                    level = logging.INFO)
+logging.basicConfig(
+    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO,
+)
 logger = logging.getLogger(__name__)

 MAX_LENGTH = int(10000)  # Hardcoded max length to avoid infinite loop

-ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig, XLMConfig, CTRLConfig)), ())
-
 MODEL_CLASSES = {
-    'gpt2': (GPT2LMHeadModel, GPT2Tokenizer),
-    'ctrl': (CTRLLMHeadModel, CTRLTokenizer),
-    'openai-gpt': (OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
-    'xlnet': (XLNetLMHeadModel, XLNetTokenizer),
-    'transfo-xl': (TransfoXLLMHeadModel, TransfoXLTokenizer),
-    'xlm': (XLMWithLMHeadModel, XLMTokenizer),
+    "gpt2": (GPT2LMHeadModel, GPT2Tokenizer),
+    "ctrl": (CTRLLMHeadModel, CTRLTokenizer),
+    "openai-gpt": (OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
+    "xlnet": (XLNetLMHeadModel, XLNetTokenizer),
+    "transfo-xl": (TransfoXLLMHeadModel, TransfoXLTokenizer),
+    "xlm": (XLMWithLMHeadModel, XLMTokenizer),
 }

 # Padding text to help Transformer-XL and XLNet with short prompts as proposed by Aman Rusia
@@ -76,161 +78,161 @@ def set_seed(args):
        torch.cuda.manual_seed_all(args.seed)


-def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
-    """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
-        Args:
-            logits: logits distribution shape (vocabulary size)
-            top_k > 0: keep only top k tokens with highest probability (top-k filtering).
-            top_p > 0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
-                Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
-        From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
-    """
-    assert logits.dim() == 1  # batch size 1 for now - could be updated for more but the code would be less clear
-    top_k = min(top_k, logits.size(-1))  # Safety check
-    if top_k > 0:
-        # Remove all tokens with a probability less than the last token of the top-k
-        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
-        logits[indices_to_remove] = filter_value
-
-    if top_p > 0.0:
-        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
-        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
-
-        # Remove tokens with cumulative probability above the threshold
-        sorted_indices_to_remove = cumulative_probs > top_p
-        # Shift the indices to the right to keep also the first token above the threshold
-        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
-        sorted_indices_to_remove[..., 0] = 0
-
-        indices_to_remove = sorted_indices[sorted_indices_to_remove]
-        logits[indices_to_remove] = filter_value
-    return logits
+#
+# Functions to prepare models' input
+#


-def sample_sequence(model, length, context, num_samples=1, temperature=1, top_k=0, top_p=0.0, repetition_penalty=1.0, is_xlnet=False, xlm_lang=None, device='cpu'):
-    context = torch.tensor(context, dtype=torch.long, device=device)
-    context = context.unsqueeze(0).repeat(num_samples, 1)
-    generated = context
-    with torch.no_grad():
-        for _ in trange(length):
+def prepare_ctrl_input(args, _, tokenizer, prompt_text):
+    if args.temperature > 0.7:
+        logger.info("CTRL typically works better with lower temperatures (and lower top_k).")

-            inputs = {'input_ids': generated}
-            if is_xlnet: 
-                # XLNet is a direct (predict same token, not next token) and bi-directional model by default
-                # => need one additional dummy token in the input (will be masked), attention mask and target mapping (see model docstring)
-                input_ids = torch.cat((generated, torch.zeros((1, 1), dtype=torch.long, device=device)), dim=1)
-                perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float, device=device)
-                perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
-                target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float, device=device)
-                target_mapping[0, 0, -1] = 1.0  # predict last token
-                inputs = {'input_ids': input_ids, 'perm_mask': perm_mask, 'target_mapping': target_mapping}
+    encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False)
+    if not any(encoded_prompt[0] == x for x in tokenizer.control_codes.values()):
+        logger.info("WARNING! You are not starting your generation from a control code so you won't get good results")
+    return prompt_text

-            if xlm_lang is not None:
-                inputs["langs"] = torch.tensor([xlm_lang] * inputs["input_ids"].shape[1], device=device).view(1, -1)

-            outputs = model(**inputs)  # Note: we could also use 'past' with GPT-2/Transfo-XL/XLNet (cached hidden-states)
-            next_token_logits = outputs[0][0, -1, :] / (temperature if temperature > 0 else 1.)
+def prepare_xlm_input(args, model, tokenizer, prompt_text):
+    # kwargs = {"language": None, "mask_token_id": None}

-            # reptition penalty from CTRL (https://arxiv.org/abs/1909.05858)
-            for _ in set(generated):
-                next_token_logits[_] /= repetition_penalty
-                
-            filtered_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p)
-            if temperature == 0: #greedy sampling:
-                next_token = torch.argmax(filtered_logits).unsqueeze(0)
-            else:
-                next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=1)
-            generated = torch.cat((generated, next_token.unsqueeze(0)), dim=1)
-    return generated
+    # Set the language
+    use_lang_emb = hasattr(model.config, "use_lang_emb") and model.config.use_lang_emb
+    if hasattr(model.config, "lang2id") and use_lang_emb:
+        available_languages = model.config.lang2id.keys()
+        if args.xlm_language in available_languages:
+            language = args.xlm_language
+        else:
+            language = None
+            while language not in available_languages:
+                language = input("Using XLM. Select language in " + str(list(available_languages)) + " >>> ")
+        # kwargs["language"] = tokenizer.lang2id[language]
+
+    # TODO fix mask_token_id setup when configurations will be synchronized between models and tokenizers
+    # XLM masked-language modeling (MLM) models need masked token
+    # is_xlm_mlm = "mlm" in args.model_name_or_path
+    # if is_xlm_mlm:
+    #     kwargs["mask_token_id"] = tokenizer.mask_token_id
+
+    return prompt_text
+
+
+def prepare_xlnet_input(args, _, tokenizer, prompt_text):
+    prompt_text = (args.padding_text if args.padding_text else PADDING_TEXT) + prompt_text
+    return prompt_text, {}
+
+
+def prepare_transfoxl_input(args, _, tokenizer, prompt_text):
+    prompt_text = (args.padding_text if args.padding_text else PADDING_TEXT) + prompt_text
+    return prompt_text, {}
+
+
+PREPROCESSING_FUNCTIONS = {
+    "ctrl": prepare_ctrl_input,
+    "xlm": prepare_xlm_input,
+    "xlnet": prepare_xlnet_input,
+    "transfo-xl": prepare_transfoxl_input,
+}
+
+
+def adjust_length_to_model(length, max_sequence_length):
+    if length < 0 and max_sequence_length > 0:
+        length = max_sequence_length
+    elif 0 < max_sequence_length < length:
+        length = max_sequence_length  # No generation bigger than model size
+    elif length < 0:
+        length = MAX_LENGTH  # avoid infinite loop
+    return length


 def main():
    parser = argparse.ArgumentParser()
-    parser.add_argument("--model_type", default=None, type=str, required=True,
-                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
-    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
-                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
+    parser.add_argument(
+        "--model_type",
+        default=None,
+        type=str,
+        required=True,
+        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
+    )
+    parser.add_argument(
+        "--model_name_or_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
+    )
+
    parser.add_argument("--prompt", type=str, default="")
-    parser.add_argument("--padding_text", type=str, default="")
-    parser.add_argument("--xlm_lang", type=str, default="", help="Optional language when used with the XLM model.")
    parser.add_argument("--length", type=int, default=20)
-    parser.add_argument("--temperature", type=float, default=1.0,
-                        help="temperature of 0 implies greedy sampling")
-    parser.add_argument("--repetition_penalty", type=float, default=1.0,
-                        help="primarily useful for CTRL model; in that case, use 1.2")
-    parser.add_argument("--top_k", type=int, default=0)
-    parser.add_argument("--top_p", type=float, default=0.9)
-    parser.add_argument("--no_cuda", action='store_true',
-                        help="Avoid using CUDA when available")
-    parser.add_argument('--seed', type=int, default=42,
-                        help="random seed for initialization")
-    parser.add_argument('--stop_token', type=str, default=None,
-                        help="Token at which text generation is stopped")
+    parser.add_argument("--stop_token", type=str, default=None, help="Token at which text generation is stopped")
+
+    parser.add_argument(
+        "--temperature",
+        type=float,
+        default=1.0,
+        help="temperature of 1.0 has no effect, lower tend toward greedy sampling",
+    )
+    parser.add_argument(
+        "--repetition_penalty", type=float, default=1.0, help="primarily useful for CTRL model; in that case, use 1.2"
+    )
+    parser.add_argument("--k", type=int, default=0)
+    parser.add_argument("--p", type=float, default=0.9)
+
+    parser.add_argument("--padding_text", type=str, default="", help="Padding text for Transfo-XL and XLNet.")
+    parser.add_argument("--xlm_language", type=str, default="", help="Optional language when used with the XLM model.")
+
+    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
+    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
    args = parser.parse_args()
-    if args.model_type in ["ctrl"]:
-        if args.temperature > 0.7 : 
-            print('CTRL typically works better with lower temperatures (and lower top_k).')
-        
+
    args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count()

    set_seed(args)

-    args.model_type = args.model_type.lower()
-    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
+    # Initialize the model and tokenizer
+    try:
+        args.model_type = args.model_type.lower()
+        model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
+    except KeyError:
+        raise KeyError("the model {} you specified is not supported. You are welcome to add it and open a PR :)")
+
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
    model = model_class.from_pretrained(args.model_name_or_path)
    model.to(args.device)
-    model.eval()

-    if args.length < 0 and model.config.max_position_embeddings > 0:
-        args.length = model.config.max_position_embeddings
-    elif 0 < model.config.max_position_embeddings < args.length:
-        args.length = model.config.max_position_embeddings  # No generation bigger than model size 
-    elif args.length < 0:
-        args.length = MAX_LENGTH  # avoid infinite loop
+    args.length = adjust_length_to_model(args.length, max_sequence_length=model.config.max_position_embeddings)
+    logger.info(args)

-    print(args)
-    while True:
-        xlm_lang = None
-        # XLM Language usage detailed in the issues #1414
-        if args.model_type in ["xlm"] and hasattr(tokenizer, 'lang2id') and hasattr(model.config, 'use_lang_emb') \
-                and model.config.use_lang_emb:
-            if args.xlm_lang:
-                language = args.xlm_lang
-            else:
-                language = None
-                while language not in tokenizer.lang2id.keys():
-                    language = input("Using XLM. Select language in " + str(list(tokenizer.lang2id.keys())) + " >>> ")
-            xlm_lang = tokenizer.lang2id[language]
+    prompt_text = args.prompt if args.prompt else input("Model prompt >>> ")

-        raw_text = args.prompt if args.prompt else input("Model prompt >>> ")
-        if args.model_type in ["transfo-xl", "xlnet"]:
-            # Models with memory likes to have a long prompt for short inputs.
-            raw_text = (args.padding_text if args.padding_text else PADDING_TEXT) + raw_text
-        context_tokens = tokenizer.encode(raw_text)
-        out = sample_sequence(
-            model=model,
-            context=context_tokens,
-            length=args.length,
-            temperature=args.temperature,
-            top_k=args.top_k,
-            top_p=args.top_p,
-            repetition_penalty=args.repetition_penalty,
-            is_xlnet=bool(args.model_type == "xlnet"),
-            xlm_lang=xlm_lang,
-            device=args.device,
-        )
-        out = out[0, len(context_tokens):].tolist()
+    # Different models need different input formatting and/or extra arguments
+    requires_preprocessing = args.model_type in PREPROCESSING_FUNCTIONS.keys()
+    if requires_preprocessing:
+        prepare_input = PREPROCESSING_FUNCTIONS.get(args.model_type)
+        prompt_text = prepare_input(args, model, tokenizer, prompt_text)
+    encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False, return_tensors="pt")
+    encoded_prompt = encoded_prompt.to(args.device)

-        text = tokenizer.decode(out, clean_up_tokenization_spaces=True, skip_special_tokens=True)
-        text = text[: text.find(args.stop_token) if args.stop_token else None]
+    output_sequences = model.generate(
+        input_ids=encoded_prompt,
+        max_length=args.length,
+        temperature=args.temperature,
+        top_k=args.k,
+        top_p=args.p,
+        repetition_penalty=args.repetition_penalty,
+        do_sample=True,
+    )
+
+    # Batch size == 1. to add more examples please use num_return_sequences > 1
+    generated_sequence = output_sequences[0].tolist()
+    text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True)
+    text = text[: text.find(args.stop_token) if args.stop_token else None]
+
+    print(text)

-        print(text)
-        if args.prompt:
-            break
    return text


-if __name__ == '__main__':
+if __name__ == "__main__":
    main()
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -13,55 +13,91 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" Finetuning the library models for sequence classification on GLUE (Bert, XLM, XLNet, RoBERTa)."""
+""" Finetuning the library models for sequence classification on GLUE (Bert, XLM, XLNet, RoBERTa, Albert, XLM-RoBERTa)."""

-from __future__ import absolute_import, division, print_function

 import argparse
 import glob
+import json
 import logging
 import os
 import random

 import numpy as np
 import torch
-from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
-                              TensorDataset)
+from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
 from torch.utils.data.distributed import DistributedSampler
-from tensorboardX import SummaryWriter
 from tqdm import tqdm, trange

-from transformers import (WEIGHTS_NAME, BertConfig,
-                                  BertForSequenceClassification, BertTokenizer,
-                                  RobertaConfig,
-                                  RobertaForSequenceClassification,
-                                  RobertaTokenizer,
-                                  XLMConfig, XLMForSequenceClassification,
-                                  XLMTokenizer, XLNetConfig,
-                                  XLNetForSequenceClassification,
-                                  XLNetTokenizer,
-                                  DistilBertConfig,
-                                  DistilBertForSequenceClassification,
-                                  DistilBertTokenizer)
-
-from transformers import AdamW, WarmupLinearSchedule
-
+from transformers import (
+    WEIGHTS_NAME,
+    AdamW,
+    AlbertConfig,
+    AlbertForSequenceClassification,
+    AlbertTokenizer,
+    BertConfig,
+    BertForSequenceClassification,
+    BertTokenizer,
+    DistilBertConfig,
+    DistilBertForSequenceClassification,
+    DistilBertTokenizer,
+    FlaubertConfig,
+    FlaubertForSequenceClassification,
+    FlaubertTokenizer,
+    RobertaConfig,
+    RobertaForSequenceClassification,
+    RobertaTokenizer,
+    XLMConfig,
+    XLMForSequenceClassification,
+    XLMRobertaConfig,
+    XLMRobertaForSequenceClassification,
+    XLMRobertaTokenizer,
+    XLMTokenizer,
+    XLNetConfig,
+    XLNetForSequenceClassification,
+    XLNetTokenizer,
+    get_linear_schedule_with_warmup,
+)
 from transformers import glue_compute_metrics as compute_metrics
+from transformers import glue_convert_examples_to_features as convert_examples_to_features
 from transformers import glue_output_modes as output_modes
 from transformers import glue_processors as processors
-from transformers import glue_convert_examples_to_features as convert_examples_to_features
+
+
+try:
+    from torch.utils.tensorboard import SummaryWriter
+except ImportError:
+    from tensorboardX import SummaryWriter
+

 logger = logging.getLogger(__name__)

-ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, XLMConfig, 
-                                                                                RobertaConfig, DistilBertConfig)), ())
+ALL_MODELS = sum(
+    (
+        tuple(conf.pretrained_config_archive_map.keys())
+        for conf in (
+            BertConfig,
+            XLNetConfig,
+            XLMConfig,
+            RobertaConfig,
+            DistilBertConfig,
+            AlbertConfig,
+            XLMRobertaConfig,
+            FlaubertConfig,
+        )
+    ),
+    (),
+)

 MODEL_CLASSES = {
-    'bert': (BertConfig, BertForSequenceClassification, BertTokenizer),
-    'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
-    'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
-    'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
-    'distilbert': (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer)
+    "bert": (BertConfig, BertForSequenceClassification, BertTokenizer),
+    "xlnet": (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
+    "xlm": (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
+    "roberta": (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
+    "distilbert": (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer),
+    "albert": (AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer),
+    "xlmroberta": (XLMRobertaConfig, XLMRobertaForSequenceClassification, XLMRobertaTokenizer),
+    "flaubert": (FlaubertConfig, FlaubertForSequenceClassification, FlaubertTokenizer),
 }


@@ -89,13 +125,28 @@ def train(args, train_dataset, model, tokenizer):
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
-    no_decay = ['bias', 'LayerNorm.weight']
+    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
-        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
-        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-        ]
+        {
+            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+            "weight_decay": args.weight_decay,
+        },
+        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
+    ]
+
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
+    scheduler = get_linear_schedule_with_warmup(
+        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
+    )
+
+    # Check if saved optimizer or scheduler states exist
+    if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
+        os.path.join(args.model_name_or_path, "scheduler.pt")
+    ):
+        # Load in optimizer and scheduler states
+        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
+        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))
+
    if args.fp16:
        try:
            from apex import amp
@@ -109,78 +160,125 @@ def train(args, train_dataset, model, tokenizer):

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
-                                                          output_device=args.local_rank,
-                                                          find_unused_parameters=True)
+        model = torch.nn.parallel.DistributedDataParallel(
+            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True,
+        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
-    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
-                   args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
+    logger.info(
+        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
+        args.train_batch_size
+        * args.gradient_accumulation_steps
+        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
+    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
+    epochs_trained = 0
+    steps_trained_in_current_epoch = 0
+    # Check if continuing training from a checkpoint
+    if os.path.exists(args.model_name_or_path):
+        # set global_step to gobal_step of last saved checkpoint from model path
+        global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
+        epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
+        steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
+
+        logger.info("  Continuing training from checkpoint, will skip to saved global_step")
+        logger.info("  Continuing training from epoch %d", epochs_trained)
+        logger.info("  Continuing training from global step %d", global_step)
+        logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
+
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
-    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
-    set_seed(args)  # Added here for reproductibility (even between python 2 and 3)
+    train_iterator = trange(
+        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0],
+    )
+    set_seed(args)  # Added here for reproductibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
+
+            # Skip past any already trained steps if resuming training
+            if steps_trained_in_current_epoch > 0:
+                steps_trained_in_current_epoch -= 1
+                continue
+
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
-            inputs = {'input_ids':      batch[0],
-                      'attention_mask': batch[1],
-                      'labels':         batch[3]}
-            if args.model_type != 'distilbert':
-                inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None  # XLM, DistilBERT and RoBERTa don't use segment_ids
+            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
+            if args.model_type != "distilbert":
+                inputs["token_type_ids"] = (
+                    batch[2] if args.model_type in ["bert", "xlnet", "albert"] else None
+                )  # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
-                loss = loss.mean() # mean() to average on multi-gpu parallel training
+                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
-                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
            else:
                loss.backward()
-                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
+                if args.fp16:
+                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
+                else:
+                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
-                    # Log metrics
-                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
+                    logs = {}
+                    if (
+                        args.local_rank == -1 and args.evaluate_during_training
+                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
-                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
-                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
-                    tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
+                            eval_key = "eval_{}".format(key)
+                            logs[eval_key] = value
+
+                    loss_scalar = (tr_loss - logging_loss) / args.logging_steps
+                    learning_rate_scalar = scheduler.get_lr()[0]
+                    logs["learning_rate"] = learning_rate_scalar
+                    logs["loss"] = loss_scalar
                    logging_loss = tr_loss

+                    for key, value in logs.items():
+                        tb_writer.add_scalar(key, value, global_step)
+                    print(json.dumps({**logs, **{"step": global_step}}))
+
                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
-                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
+                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
-                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+                    model_to_save = (
+                        model.module if hasattr(model, "module") else model
+                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
-                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
+                    tokenizer.save_pretrained(output_dir)
+
+                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

+                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
+                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
+                    logger.info("Saving optimizer and scheduler states to %s", output_dir)
+
            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
@@ -197,7 +295,7 @@ def train(args, train_dataset, model, tokenizer):
 def evaluate(args, model, tokenizer, prefix=""):
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
-    eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,)
+    eval_outputs_dirs = (args.output_dir, args.output_dir + "-MM") if args.task_name == "mnli" else (args.output_dir,)

    results = {}
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
@@ -208,9 +306,13 @@ def evaluate(args, model, tokenizer, prefix=""):

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
        # Note that DistributedSampler samples randomly
-        eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
+        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

+        # multi-gpu eval
+        if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
+            model = torch.nn.DataParallel(model)
+
        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info("  Num examples = %d", len(eval_dataset))
@@ -224,11 +326,11 @@ def evaluate(args, model, tokenizer, prefix=""):
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
-                inputs = {'input_ids':      batch[0],
-                          'attention_mask': batch[1],
-                          'labels':         batch[3]}
-                if args.model_type != 'distilbert':
-                    inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None  # XLM, DistilBERT and RoBERTa don't use segment_ids
+                inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
+                if args.model_type != "distilbert":
+                    inputs["token_type_ids"] = (
+                        batch[2] if args.model_type in ["bert", "xlnet", "albert"] else None
+                    )  # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]

@@ -236,10 +338,10 @@ def evaluate(args, model, tokenizer, prefix=""):
            nb_eval_steps += 1
            if preds is None:
                preds = logits.detach().cpu().numpy()
-                out_label_ids = inputs['labels'].detach().cpu().numpy()
+                out_label_ids = inputs["labels"].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
-                out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)
+                out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)

        eval_loss = eval_loss / nb_eval_steps
        if args.output_mode == "classification":
@@ -266,29 +368,36 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    processor = processors[task]()
    output_mode = output_modes[task]
    # Load data features from cache or dataset file
-    cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}'.format(
-        'dev' if evaluate else 'train',
-        list(filter(None, args.model_name_or_path.split('/'))).pop(),
-        str(args.max_seq_length),
-        str(task)))
+    cached_features_file = os.path.join(
+        args.data_dir,
+        "cached_{}_{}_{}_{}".format(
+            "dev" if evaluate else "train",
+            list(filter(None, args.model_name_or_path.split("/"))).pop(),
+            str(args.max_seq_length),
+            str(task),
+        ),
+    )
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
-        if task in ['mnli', 'mnli-mm'] and args.model_type in ['roberta']:
+        if task in ["mnli", "mnli-mm"] and args.model_type in ["roberta", "xlmroberta"]:
            # HACK(label indices are swapped in RoBERTa pretrained model)
-            label_list[1], label_list[2] = label_list[2], label_list[1] 
-        examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
-        features = convert_examples_to_features(examples,
-                                                tokenizer,
-                                                label_list=label_list,
-                                                max_length=args.max_seq_length,
-                                                output_mode=output_mode,
-                                                pad_on_left=bool(args.model_type in ['xlnet']),                 # pad on the left for xlnet
-                                                pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
-                                                pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
+            label_list[1], label_list[2] = label_list[2], label_list[1]
+        examples = (
+            processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
+        )
+        features = convert_examples_to_features(
+            examples,
+            tokenizer,
+            label_list=label_list,
+            max_length=args.max_seq_length,
+            output_mode=output_mode,
+            pad_on_left=bool(args.model_type in ["xlnet"]),  # pad on the left for xlnet
+            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
+            pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
@@ -313,91 +422,152 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
 def main():
    parser = argparse.ArgumentParser()

-    ## Required parameters
-    parser.add_argument("--data_dir", default=None, type=str, required=True,
-                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
-    parser.add_argument("--model_type", default=None, type=str, required=True,
-                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
-    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
-                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
-    parser.add_argument("--task_name", default=None, type=str, required=True,
-                        help="The name of the task to train selected in the list: " + ", ".join(processors.keys()))
-    parser.add_argument("--output_dir", default=None, type=str, required=True,
-                        help="The output directory where the model predictions and checkpoints will be written.")
+    # Required parameters
+    parser.add_argument(
+        "--data_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The input data dir. Should contain the .tsv files (or other data files) for the task.",
+    )
+    parser.add_argument(
+        "--model_type",
+        default=None,
+        type=str,
+        required=True,
+        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
+    )
+    parser.add_argument(
+        "--model_name_or_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
+    )
+    parser.add_argument(
+        "--task_name",
+        default=None,
+        type=str,
+        required=True,
+        help="The name of the task to train selected in the list: " + ", ".join(processors.keys()),
+    )
+    parser.add_argument(
+        "--output_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )

-    ## Other parameters
-    parser.add_argument("--config_name", default="", type=str,
-                        help="Pretrained config name or path if not the same as model_name")
-    parser.add_argument("--tokenizer_name", default="", type=str,
-                        help="Pretrained tokenizer name or path if not the same as model_name")
-    parser.add_argument("--cache_dir", default="", type=str,
-                        help="Where do you want to store the pre-trained models downloaded from s3")
-    parser.add_argument("--max_seq_length", default=128, type=int,
-                        help="The maximum total input sequence length after tokenization. Sequences longer "
-                             "than this will be truncated, sequences shorter will be padded.")
-    parser.add_argument("--do_train", action='store_true',
-                        help="Whether to run training.")
-    parser.add_argument("--do_eval", action='store_true',
-                        help="Whether to run eval on the dev set.")
-    parser.add_argument("--evaluate_during_training", action='store_true',
-                        help="Rul evaluation during training at each logging step.")
-    parser.add_argument("--do_lower_case", action='store_true',
-                        help="Set this flag if you are using an uncased model.")
+    # Other parameters
+    parser.add_argument(
+        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        default="",
+        type=str,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        default="",
+        type=str,
+        help="Where do you want to store the pre-trained models downloaded from s3",
+    )
+    parser.add_argument(
+        "--max_seq_length",
+        default=128,
+        type=int,
+        help="The maximum total input sequence length after tokenization. Sequences longer "
+        "than this will be truncated, sequences shorter will be padded.",
+    )
+    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
+    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
+    parser.add_argument(
+        "--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step.",
+    )
+    parser.add_argument(
+        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model.",
+    )

-    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
-                        help="Batch size per GPU/CPU for training.")
-    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
-                        help="Batch size per GPU/CPU for evaluation.")
-    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
-                        help="Number of updates steps to accumulate before performing a backward/update pass.")
-    parser.add_argument("--learning_rate", default=5e-5, type=float,
-                        help="The initial learning rate for Adam.")
-    parser.add_argument("--weight_decay", default=0.0, type=float,
-                        help="Weight deay if we apply some.")
-    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
-                        help="Epsilon for Adam optimizer.")
-    parser.add_argument("--max_grad_norm", default=1.0, type=float,
-                        help="Max gradient norm.")
-    parser.add_argument("--num_train_epochs", default=3.0, type=float,
-                        help="Total number of training epochs to perform.")
-    parser.add_argument("--max_steps", default=-1, type=int,
-                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
-    parser.add_argument("--warmup_steps", default=0, type=int,
-                        help="Linear warmup over warmup_steps.")
+    parser.add_argument(
+        "--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.",
+    )
+    parser.add_argument(
+        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation.",
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
+    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
+    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument(
+        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.",
+    )
+    parser.add_argument(
+        "--max_steps",
+        default=-1,
+        type=int,
+        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
+    )
+    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")

-    parser.add_argument('--logging_steps', type=int, default=50,
-                        help="Log every X updates steps.")
-    parser.add_argument('--save_steps', type=int, default=50,
-                        help="Save checkpoint every X updates steps.")
-    parser.add_argument("--eval_all_checkpoints", action='store_true',
-                        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
-    parser.add_argument("--no_cuda", action='store_true',
-                        help="Avoid using CUDA when available")
-    parser.add_argument('--overwrite_output_dir', action='store_true',
-                        help="Overwrite the content of the output directory")
-    parser.add_argument('--overwrite_cache', action='store_true',
-                        help="Overwrite the cached training and evaluation sets")
-    parser.add_argument('--seed', type=int, default=42,
-                        help="random seed for initialization")
+    parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.")
+    parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.")
+    parser.add_argument(
+        "--eval_all_checkpoints",
+        action="store_true",
+        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
+    )
+    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
+    parser.add_argument(
+        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory",
+    )
+    parser.add_argument(
+        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets",
+    )
+    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")

-    parser.add_argument('--fp16', action='store_true',
-                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
-    parser.add_argument('--fp16_opt_level', type=str, default='O1',
-                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
-                             "See details at https://nvidia.github.io/apex/amp.html")
-    parser.add_argument("--local_rank", type=int, default=-1,
-                        help="For distributed training: local_rank")
-    parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.")
-    parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
+    parser.add_argument(
+        "--fp16",
+        action="store_true",
+        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
+    )
+    parser.add_argument(
+        "--fp16_opt_level",
+        type=str,
+        default="O1",
+        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
+        "See details at https://nvidia.github.io/apex/amp.html",
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
+    parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
    args = parser.parse_args()

-    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
-        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
+    if (
+        os.path.exists(args.output_dir)
+        and os.listdir(args.output_dir)
+        and args.do_train
+        and not args.overwrite_output_dir
+    ):
+        raise ValueError(
+            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
+                args.output_dir
+            )
+        )

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
+
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()
@@ -409,16 +579,24 @@ def main():
    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
-        torch.distributed.init_process_group(backend='nccl')
+        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
-    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
-                        datefmt = '%m/%d/%Y %H:%M:%S',
-                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
-    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
-                    args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
+    )
+    logger.warning(
+        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+        args.local_rank,
+        device,
+        args.n_gpu,
+        bool(args.local_rank != -1),
+        args.fp16,
+    )

    # Set seed
    set_seed(args)
@@ -438,9 +616,23 @@ def main():

    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
-    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, num_labels=num_labels, finetuning_task=args.task_name)
-    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case)
-    model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config)
+    config = config_class.from_pretrained(
+        args.config_name if args.config_name else args.model_name_or_path,
+        num_labels=num_labels,
+        finetuning_task=args.task_name,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
+    tokenizer = tokenizer_class.from_pretrained(
+        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
+        do_lower_case=args.do_lower_case,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
+    model = model_class.from_pretrained(
+        args.model_name_or_path,
+        from_tf=bool(".ckpt" in args.model_name_or_path),
+        config=config,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )

    if args.local_rank == 0:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
@@ -449,14 +641,12 @@ def main():

    logger.info("Training/evaluation parameters %s", args)

-
    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

-
    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Create output directory if needed
@@ -466,36 +656,39 @@ def main():
        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
-        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+        model_to_save = (
+            model.module if hasattr(model, "module") else model
+        )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
-        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
+        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
        model = model_class.from_pretrained(args.output_dir)
-        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
+        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
        model.to(args.device)

-
    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
-            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
+            checkpoints = list(
+                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
+            )
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
-            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
-            prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""
-            
+            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
+            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""
+
            model = model_class.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, tokenizer, prefix=prefix)
-            result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
+            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
            results.update(result)

    return results
--- a/examples/run_language_modeling.py
+++ b/examples/run_language_modeling.py
@@ -0,0 +1,799 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Fine-tuning the library models for language modeling on a text file (GPT, GPT-2, BERT, RoBERTa).
+GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa are fine-tuned
+using a masked language modeling (MLM) loss.
+"""
+
+
+import argparse
+import glob
+import logging
+import os
+import pickle
+import random
+import re
+import shutil
+from typing import Dict, List, Tuple
+
+import numpy as np
+import torch
+from torch.nn.utils.rnn import pad_sequence
+from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
+from torch.utils.data.distributed import DistributedSampler
+from tqdm import tqdm, trange
+
+from transformers import (
+    WEIGHTS_NAME,
+    AdamW,
+    BertConfig,
+    BertForMaskedLM,
+    BertTokenizer,
+    CamembertConfig,
+    CamembertForMaskedLM,
+    CamembertTokenizer,
+    DistilBertConfig,
+    DistilBertForMaskedLM,
+    DistilBertTokenizer,
+    GPT2Config,
+    GPT2LMHeadModel,
+    GPT2Tokenizer,
+    OpenAIGPTConfig,
+    OpenAIGPTLMHeadModel,
+    OpenAIGPTTokenizer,
+    PreTrainedModel,
+    PreTrainedTokenizer,
+    RobertaConfig,
+    RobertaForMaskedLM,
+    RobertaTokenizer,
+    get_linear_schedule_with_warmup,
+)
+
+
+try:
+    from torch.utils.tensorboard import SummaryWriter
+except ImportError:
+    from tensorboardX import SummaryWriter
+
+
+logger = logging.getLogger(__name__)
+
+
+MODEL_CLASSES = {
+    "gpt2": (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer),
+    "openai-gpt": (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
+    "bert": (BertConfig, BertForMaskedLM, BertTokenizer),
+    "roberta": (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer),
+    "distilbert": (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer),
+    "camembert": (CamembertConfig, CamembertForMaskedLM, CamembertTokenizer),
+}
+
+
+class TextDataset(Dataset):
+    def __init__(self, tokenizer: PreTrainedTokenizer, args, file_path: str, block_size=512):
+        assert os.path.isfile(file_path)
+
+        block_size = block_size - (tokenizer.max_len - tokenizer.max_len_single_sentence)
+
+        directory, filename = os.path.split(file_path)
+        cached_features_file = os.path.join(
+            directory, args.model_type + "_cached_lm_" + str(block_size) + "_" + filename
+        )
+
+        if os.path.exists(cached_features_file) and not args.overwrite_cache:
+            logger.info("Loading features from cached file %s", cached_features_file)
+            with open(cached_features_file, "rb") as handle:
+                self.examples = pickle.load(handle)
+        else:
+            logger.info("Creating features from dataset file at %s", directory)
+
+            self.examples = []
+            with open(file_path, encoding="utf-8") as f:
+                text = f.read()
+
+            tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
+
+            for i in range(0, len(tokenized_text) - block_size + 1, block_size):  # Truncate in block of block_size
+                self.examples.append(tokenizer.build_inputs_with_special_tokens(tokenized_text[i : i + block_size]))
+            # Note that we are loosing the last truncated example here for the sake of simplicity (no padding)
+            # If your dataset is small, first you should loook for a bigger one :-) and second you
+            # can change this behavior by adding (model specific) padding.
+
+            logger.info("Saving features into cached file %s", cached_features_file)
+            with open(cached_features_file, "wb") as handle:
+                pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)
+
+    def __len__(self):
+        return len(self.examples)
+
+    def __getitem__(self, item):
+        return torch.tensor(self.examples[item], dtype=torch.long)
+
+
+class LineByLineTextDataset(Dataset):
+    def __init__(self, tokenizer: PreTrainedTokenizer, args, file_path: str, block_size=512):
+        assert os.path.isfile(file_path)
+        # Here, we do not cache the features, operating under the assumption
+        # that we will soon use fast multithreaded tokenizers from the
+        # `tokenizers` repo everywhere =)
+        logger.info("Creating features from dataset file at %s", file_path)
+
+        with open(file_path, encoding="utf-8") as f:
+            lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]
+
+        self.examples = tokenizer.batch_encode_plus(lines, add_special_tokens=True, max_length=block_size)["input_ids"]
+
+    def __len__(self):
+        return len(self.examples)
+
+    def __getitem__(self, i):
+        return torch.tensor(self.examples[i], dtype=torch.long)
+
+
+def load_and_cache_examples(args, tokenizer, evaluate=False):
+    file_path = args.eval_data_file if evaluate else args.train_data_file
+    if args.line_by_line:
+        return LineByLineTextDataset(tokenizer, args, file_path=file_path, block_size=args.block_size)
+    else:
+        return TextDataset(tokenizer, args, file_path=file_path, block_size=args.block_size)
+
+
+def set_seed(args):
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+    if args.n_gpu > 0:
+        torch.cuda.manual_seed_all(args.seed)
+
+
+def _sorted_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -> List[str]:
+    ordering_and_checkpoint_path = []
+
+    glob_checkpoints = glob.glob(os.path.join(args.output_dir, "{}-*".format(checkpoint_prefix)))
+
+    for path in glob_checkpoints:
+        if use_mtime:
+            ordering_and_checkpoint_path.append((os.path.getmtime(path), path))
+        else:
+            regex_match = re.match(".*{}-([0-9]+)".format(checkpoint_prefix), path)
+            if regex_match and regex_match.groups():
+                ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))
+
+    checkpoints_sorted = sorted(ordering_and_checkpoint_path)
+    checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]
+    return checkpoints_sorted
+
+
+def _rotate_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -> None:
+    if not args.save_total_limit:
+        return
+    if args.save_total_limit <= 0:
+        return
+
+    # Check if we should delete older checkpoint(s)
+    checkpoints_sorted = _sorted_checkpoints(args, checkpoint_prefix, use_mtime)
+    if len(checkpoints_sorted) <= args.save_total_limit:
+        return
+
+    number_of_checkpoints_to_delete = max(0, len(checkpoints_sorted) - args.save_total_limit)
+    checkpoints_to_be_deleted = checkpoints_sorted[:number_of_checkpoints_to_delete]
+    for checkpoint in checkpoints_to_be_deleted:
+        logger.info("Deleting older checkpoint [{}] due to args.save_total_limit".format(checkpoint))
+        shutil.rmtree(checkpoint)
+
+
+def mask_tokens(inputs: torch.Tensor, tokenizer: PreTrainedTokenizer, args) -> Tuple[torch.Tensor, torch.Tensor]:
+    """ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. """
+
+    if tokenizer.mask_token is None:
+        raise ValueError(
+            "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer."
+        )
+
+    labels = inputs.clone()
+    # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
+    probability_matrix = torch.full(labels.shape, args.mlm_probability)
+    special_tokens_mask = [
+        tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
+    ]
+    probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
+    if tokenizer._pad_token is not None:
+        padding_mask = labels.eq(tokenizer.pad_token_id)
+        probability_matrix.masked_fill_(padding_mask, value=0.0)
+    masked_indices = torch.bernoulli(probability_matrix).bool()
+    labels[~masked_indices] = -100  # We only compute loss on masked tokens
+
+    # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
+    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
+    inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)
+
+    # 10% of the time, we replace masked input tokens with random word
+    indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
+    random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
+    inputs[indices_random] = random_words[indices_random]
+
+    # The rest of the time (10% of the time) we keep the masked input tokens unchanged
+    return inputs, labels
+
+
+def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer) -> Tuple[int, float]:
+    """ Train the model """
+    if args.local_rank in [-1, 0]:
+        tb_writer = SummaryWriter()
+
+    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
+
+    def collate(examples: List[torch.Tensor]):
+        if tokenizer._pad_token is None:
+            return pad_sequence(examples, batch_first=True)
+        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)
+
+    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
+    train_dataloader = DataLoader(
+        train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate
+    )
+
+    if args.max_steps > 0:
+        t_total = args.max_steps
+        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
+    else:
+        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
+
+    # Prepare optimizer and schedule (linear warmup and decay)
+    no_decay = ["bias", "LayerNorm.weight"]
+    optimizer_grouped_parameters = [
+        {
+            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+            "weight_decay": args.weight_decay,
+        },
+        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
+    ]
+    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
+    scheduler = get_linear_schedule_with_warmup(
+        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
+    )
+
+    # Check if saved optimizer or scheduler states exist
+    if (
+        args.model_name_or_path
+        and os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt"))
+        and os.path.isfile(os.path.join(args.model_name_or_path, "scheduler.pt"))
+    ):
+        # Load in optimizer and scheduler states
+        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
+        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))
+
+    if args.fp16:
+        try:
+            from apex import amp
+        except ImportError:
+            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
+        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
+
+    # multi-gpu training (should be after apex fp16 initialization)
+    if args.n_gpu > 1:
+        model = torch.nn.DataParallel(model)
+
+    # Distributed training (should be after apex fp16 initialization)
+    if args.local_rank != -1:
+        model = torch.nn.parallel.DistributedDataParallel(
+            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
+        )
+
+    # Train!
+    logger.info("***** Running training *****")
+    logger.info("  Num examples = %d", len(train_dataset))
+    logger.info("  Num Epochs = %d", args.num_train_epochs)
+    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
+    logger.info(
+        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
+        args.train_batch_size
+        * args.gradient_accumulation_steps
+        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
+    )
+    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
+    logger.info("  Total optimization steps = %d", t_total)
+
+    global_step = 0
+    epochs_trained = 0
+    steps_trained_in_current_epoch = 0
+    # Check if continuing training from a checkpoint
+    if args.model_name_or_path and os.path.exists(args.model_name_or_path):
+        try:
+            # set global_step to gobal_step of last saved checkpoint from model path
+            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
+            global_step = int(checkpoint_suffix)
+            epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
+            steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
+
+            logger.info("  Continuing training from checkpoint, will skip to saved global_step")
+            logger.info("  Continuing training from epoch %d", epochs_trained)
+            logger.info("  Continuing training from global step %d", global_step)
+            logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
+        except ValueError:
+            logger.info("  Starting fine-tuning.")
+
+    tr_loss, logging_loss = 0.0, 0.0
+
+    model_to_resize = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
+    model_to_resize.resize_token_embeddings(len(tokenizer))
+
+    model.zero_grad()
+    train_iterator = trange(
+        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
+    )
+    set_seed(args)  # Added here for reproducibility
+    for _ in train_iterator:
+        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
+        for step, batch in enumerate(epoch_iterator):
+
+            # Skip past any already trained steps if resuming training
+            if steps_trained_in_current_epoch > 0:
+                steps_trained_in_current_epoch -= 1
+                continue
+
+            inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch)
+            inputs = inputs.to(args.device)
+            labels = labels.to(args.device)
+            model.train()
+            outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels)
+            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
+
+            if args.n_gpu > 1:
+                loss = loss.mean()  # mean() to average on multi-gpu parallel training
+            if args.gradient_accumulation_steps > 1:
+                loss = loss / args.gradient_accumulation_steps
+
+            if args.fp16:
+                with amp.scale_loss(loss, optimizer) as scaled_loss:
+                    scaled_loss.backward()
+            else:
+                loss.backward()
+
+            tr_loss += loss.item()
+            if (step + 1) % args.gradient_accumulation_steps == 0:
+                if args.fp16:
+                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
+                else:
+                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+                optimizer.step()
+                scheduler.step()  # Update learning rate schedule
+                model.zero_grad()
+                global_step += 1
+
+                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
+                    # Log metrics
+                    if (
+                        args.local_rank == -1 and args.evaluate_during_training
+                    ):  # Only evaluate when single GPU otherwise metrics may not average well
+                        results = evaluate(args, model, tokenizer)
+                        for key, value in results.items():
+                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
+                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
+                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
+                    logging_loss = tr_loss
+
+                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
+                    checkpoint_prefix = "checkpoint"
+                    # Save model checkpoint
+                    output_dir = os.path.join(args.output_dir, "{}-{}".format(checkpoint_prefix, global_step))
+                    os.makedirs(output_dir, exist_ok=True)
+                    model_to_save = (
+                        model.module if hasattr(model, "module") else model
+                    )  # Take care of distributed/parallel training
+                    model_to_save.save_pretrained(output_dir)
+                    tokenizer.save_pretrained(output_dir)
+
+                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
+                    logger.info("Saving model checkpoint to %s", output_dir)
+
+                    _rotate_checkpoints(args, checkpoint_prefix)
+
+                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
+                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
+                    logger.info("Saving optimizer and scheduler states to %s", output_dir)
+
+            if args.max_steps > 0 and global_step > args.max_steps:
+                epoch_iterator.close()
+                break
+        if args.max_steps > 0 and global_step > args.max_steps:
+            train_iterator.close()
+            break
+
+    if args.local_rank in [-1, 0]:
+        tb_writer.close()
+
+    return global_step, tr_loss / global_step
+
+
+def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prefix="") -> Dict:
+    # Loop to handle MNLI double evaluation (matched, mis-matched)
+    eval_output_dir = args.output_dir
+
+    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)
+
+    if args.local_rank in [-1, 0]:
+        os.makedirs(eval_output_dir, exist_ok=True)
+
+    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
+    # Note that DistributedSampler samples randomly
+
+    def collate(examples: List[torch.Tensor]):
+        if tokenizer._pad_token is None:
+            return pad_sequence(examples, batch_first=True)
+        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)
+
+    eval_sampler = SequentialSampler(eval_dataset)
+    eval_dataloader = DataLoader(
+        eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate
+    )
+
+    # multi-gpu evaluate
+    if args.n_gpu > 1:
+        model = torch.nn.DataParallel(model)
+
+    # Eval!
+    logger.info("***** Running evaluation {} *****".format(prefix))
+    logger.info("  Num examples = %d", len(eval_dataset))
+    logger.info("  Batch size = %d", args.eval_batch_size)
+    eval_loss = 0.0
+    nb_eval_steps = 0
+    model.eval()
+
+    for batch in tqdm(eval_dataloader, desc="Evaluating"):
+        inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch)
+        inputs = inputs.to(args.device)
+        labels = labels.to(args.device)
+
+        with torch.no_grad():
+            outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels)
+            lm_loss = outputs[0]
+            eval_loss += lm_loss.mean().item()
+        nb_eval_steps += 1
+
+    eval_loss = eval_loss / nb_eval_steps
+    perplexity = torch.exp(torch.tensor(eval_loss))
+
+    result = {"perplexity": perplexity}
+
+    output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
+    with open(output_eval_file, "w") as writer:
+        logger.info("***** Eval results {} *****".format(prefix))
+        for key in sorted(result.keys()):
+            logger.info("  %s = %s", key, str(result[key]))
+            writer.write("%s = %s\n" % (key, str(result[key])))
+
+    return result
+
+
+def main():
+    parser = argparse.ArgumentParser()
+
+    # Required parameters
+    parser.add_argument(
+        "--train_data_file", default=None, type=str, required=True, help="The input training data file (a text file)."
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        required=True,
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument(
+        "--model_type", type=str, required=True, help="The model architecture to be trained or fine-tuned.",
+    )
+
+    # Other parameters
+    parser.add_argument(
+        "--eval_data_file",
+        default=None,
+        type=str,
+        help="An optional input evaluation data file to evaluate the perplexity on (a text file).",
+    )
+    parser.add_argument(
+        "--line_by_line",
+        action="store_true",
+        help="Whether distinct lines of text in the dataset are to be handled as distinct sequences.",
+    )
+    parser.add_argument(
+        "--should_continue", action="store_true", help="Whether to continue from latest checkpoint in output_dir"
+    )
+    parser.add_argument(
+        "--model_name_or_path",
+        default=None,
+        type=str,
+        help="The model checkpoint for weights initialization. Leave None if you want to train a model from scratch.",
+    )
+
+    parser.add_argument(
+        "--mlm", action="store_true", help="Train with masked-language modeling loss instead of language modeling."
+    )
+    parser.add_argument(
+        "--mlm_probability", type=float, default=0.15, help="Ratio of tokens to mask for masked language modeling loss"
+    )
+
+    parser.add_argument(
+        "--config_name",
+        default=None,
+        type=str,
+        help="Optional pretrained config name or path if not the same as model_name_or_path. If both are None, initialize a new config.",
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        default=None,
+        type=str,
+        help="Optional pretrained tokenizer name or path if not the same as model_name_or_path. If both are None, initialize a new tokenizer.",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        default=None,
+        type=str,
+        help="Optional directory to store the pre-trained models downloaded from s3 (instead of the default one)",
+    )
+    parser.add_argument(
+        "--block_size",
+        default=-1,
+        type=int,
+        help="Optional input sequence length after tokenization."
+        "The training dataset will be truncated in block of this size for training."
+        "Default to the model max input length for single sentence inputs (take into account special tokens).",
+    )
+    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
+    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
+    parser.add_argument(
+        "--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step."
+    )
+
+    parser.add_argument("--per_gpu_train_batch_size", default=4, type=int, help="Batch size per GPU/CPU for training.")
+    parser.add_argument(
+        "--per_gpu_eval_batch_size", default=4, type=int, help="Batch size per GPU/CPU for evaluation."
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
+    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
+    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument(
+        "--num_train_epochs", default=1.0, type=float, help="Total number of training epochs to perform."
+    )
+    parser.add_argument(
+        "--max_steps",
+        default=-1,
+        type=int,
+        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
+    )
+    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
+
+    parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.")
+    parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.")
+    parser.add_argument(
+        "--save_total_limit",
+        type=int,
+        default=None,
+        help="Limit the total amount of checkpoints, delete the older checkpoints in the output_dir, does not delete by default",
+    )
+    parser.add_argument(
+        "--eval_all_checkpoints",
+        action="store_true",
+        help="Evaluate all checkpoints starting with the same prefix as model_name_or_path ending and ending with step number",
+    )
+    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
+    parser.add_argument(
+        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
+    )
+    parser.add_argument(
+        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
+    )
+    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
+
+    parser.add_argument(
+        "--fp16",
+        action="store_true",
+        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
+    )
+    parser.add_argument(
+        "--fp16_opt_level",
+        type=str,
+        default="O1",
+        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
+        "See details at https://nvidia.github.io/apex/amp.html",
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
+    parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
+    args = parser.parse_args()
+
+    if args.model_type in ["bert", "roberta", "distilbert", "camembert"] and not args.mlm:
+        raise ValueError(
+            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the --mlm "
+            "flag (masked language modeling)."
+        )
+    if args.eval_data_file is None and args.do_eval:
+        raise ValueError(
+            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
+            "or remove the --do_eval argument."
+        )
+    if args.should_continue:
+        sorted_checkpoints = _sorted_checkpoints(args)
+        if len(sorted_checkpoints) == 0:
+            raise ValueError("Used --should_continue but no checkpoint was found in --output_dir.")
+        else:
+            args.model_name_or_path = sorted_checkpoints[-1]
+
+    if (
+        os.path.exists(args.output_dir)
+        and os.listdir(args.output_dir)
+        and args.do_train
+        and not args.overwrite_output_dir
+    ):
+        raise ValueError(
+            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
+                args.output_dir
+            )
+        )
+
+    # Setup distant debugging if needed
+    if args.server_ip and args.server_port:
+        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
+        import ptvsd
+
+        print("Waiting for debugger attach")
+        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
+        ptvsd.wait_for_attach()
+
+    # Setup CUDA, GPU & distributed training
+    if args.local_rank == -1 or args.no_cuda:
+        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+        args.n_gpu = torch.cuda.device_count()
+    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
+        torch.cuda.set_device(args.local_rank)
+        device = torch.device("cuda", args.local_rank)
+        torch.distributed.init_process_group(backend="nccl")
+        args.n_gpu = 1
+    args.device = device
+
+    # Setup logging
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
+    )
+    logger.warning(
+        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+        args.local_rank,
+        device,
+        args.n_gpu,
+        bool(args.local_rank != -1),
+        args.fp16,
+    )
+
+    # Set seed
+    set_seed(args)
+
+    # Load pretrained model and tokenizer
+    if args.local_rank not in [-1, 0]:
+        torch.distributed.barrier()  # Barrier to make sure only the first process in distributed training download model & vocab
+
+    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
+
+    if args.config_name:
+        config = config_class.from_pretrained(args.config_name, cache_dir=args.cache_dir)
+    elif args.model_name_or_path:
+        config = config_class.from_pretrained(args.model_name_or_path, cache_dir=args.cache_dir)
+    else:
+        config = config_class()
+
+    if args.tokenizer_name:
+        tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name, cache_dir=args.cache_dir)
+    elif args.model_name_or_path:
+        tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path, cache_dir=args.cache_dir)
+    else:
+        raise ValueError(
+            "You are instantiating a new {} tokenizer. This is not supported, but you can do it from another script, save it,"
+            "and load it from here, using --tokenizer_name".format(tokenizer_class.__name__)
+        )
+
+    if args.block_size <= 0:
+        args.block_size = tokenizer.max_len
+        # Our input block size will be the max possible for the model
+    else:
+        args.block_size = min(args.block_size, tokenizer.max_len)
+
+    if args.model_name_or_path:
+        model = model_class.from_pretrained(
+            args.model_name_or_path,
+            from_tf=bool(".ckpt" in args.model_name_or_path),
+            config=config,
+            cache_dir=args.cache_dir,
+        )
+    else:
+        logger.info("Training new model from scratch")
+        model = model_class(config=config)
+
+    model.to(args.device)
+
+    if args.local_rank == 0:
+        torch.distributed.barrier()  # End of barrier to make sure only the first process in distributed training download model & vocab
+
+    logger.info("Training/evaluation parameters %s", args)
+
+    # Training
+    if args.do_train:
+        if args.local_rank not in [-1, 0]:
+            torch.distributed.barrier()  # Barrier to make sure only the first process in distributed training process the dataset, and the others will use the cache
+
+        train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False)
+
+        if args.local_rank == 0:
+            torch.distributed.barrier()
+
+        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
+        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
+
+    # Saving best-practices: if you use save_pretrained for the model and tokenizer, you can reload them using from_pretrained()
+    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
+        # Create output directory if needed
+        if args.local_rank in [-1, 0]:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        logger.info("Saving model checkpoint to %s", args.output_dir)
+        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
+        # They can then be reloaded using `from_pretrained()`
+        model_to_save = (
+            model.module if hasattr(model, "module") else model
+        )  # Take care of distributed/parallel training
+        model_to_save.save_pretrained(args.output_dir)
+        tokenizer.save_pretrained(args.output_dir)
+
+        # Good practice: save your training arguments together with the trained model
+        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
+
+        # Load a trained model and vocabulary that you have fine-tuned
+        model = model_class.from_pretrained(args.output_dir)
+        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
+        model.to(args.device)
+
+    # Evaluation
+    results = {}
+    if args.do_eval and args.local_rank in [-1, 0]:
+        checkpoints = [args.output_dir]
+        if args.eval_all_checkpoints:
+            checkpoints = list(
+                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
+            )
+            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
+        logger.info("Evaluate the following checkpoints: %s", checkpoints)
+        for checkpoint in checkpoints:
+            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
+            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""
+
+            model = model_class.from_pretrained(checkpoint)
+            model.to(args.device)
+            result = evaluate(args, model, tokenizer, prefix=prefix)
+            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
+            results.update(result)
+
+    return results
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/run_lm_finetuning.py
+++ b/examples/run_lm_finetuning.py
@@ -1,538 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Fine-tuning the library models for language modeling on a text file (GPT, GPT-2, BERT, RoBERTa).
-GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa are fine-tuned
-using a masked language modeling (MLM) loss.
-"""
-
-from __future__ import absolute_import, division, print_function
-
-import argparse
-import glob
-import logging
-import os
-import pickle
-import random
-import re
-import shutil
-
-import numpy as np
-import torch
-from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler
-from torch.utils.data.distributed import DistributedSampler
-from tensorboardX import SummaryWriter
-from tqdm import tqdm, trange
-
-from transformers import (WEIGHTS_NAME, AdamW, WarmupLinearSchedule,
-                                  BertConfig, BertForMaskedLM, BertTokenizer,
-                                  GPT2Config, GPT2LMHeadModel, GPT2Tokenizer,
-                                  OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer,
-                                  RobertaConfig, RobertaForMaskedLM, RobertaTokenizer,
-                                  DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer)
-
-
-logger = logging.getLogger(__name__)
-
-
-MODEL_CLASSES = {
-    'gpt2': (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer),
-    'openai-gpt': (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
-    'bert': (BertConfig, BertForMaskedLM, BertTokenizer),
-    'roberta': (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer),
-    'distilbert': (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer)
-}
-
-
-class TextDataset(Dataset):
-    def __init__(self, tokenizer, file_path='train', block_size=512):
-        assert os.path.isfile(file_path)
-        directory, filename = os.path.split(file_path)
-        cached_features_file = os.path.join(directory, 'cached_lm_' + block_size + '_' + filename)
-
-        if os.path.exists(cached_features_file):
-            logger.info("Loading features from cached file %s", cached_features_file)
-            with open(cached_features_file, 'rb') as handle:
-                self.examples = pickle.load(handle)
-        else:
-            logger.info("Creating features from dataset file at %s", directory)
-
-            self.examples = []
-            with open(file_path, encoding="utf-8") as f:
-                text = f.read()
-
-            tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
-
-            for i in range(0, len(tokenized_text)-block_size+1, block_size): # Truncate in block of block_size
-                self.examples.append(tokenizer.build_inputs_with_special_tokens(tokenized_text[i:i+block_size]))
-            # Note that we are loosing the last truncated example here for the sake of simplicity (no padding)
-            # If your dataset is small, first you should loook for a bigger one :-) and second you
-            # can change this behavior by adding (model specific) padding.
-
-            logger.info("Saving features into cached file %s", cached_features_file)
-            with open(cached_features_file, 'wb') as handle:
-                pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)
-
-    def __len__(self):
-        return len(self.examples)
-
-    def __getitem__(self, item):
-        return torch.tensor(self.examples[item])
-
-
-def load_and_cache_examples(args, tokenizer, evaluate=False):
-    dataset = TextDataset(tokenizer, file_path=args.eval_data_file if evaluate else args.train_data_file, block_size=args.block_size)
-    return dataset
-
-
-def set_seed(args):
-    random.seed(args.seed)
-    np.random.seed(args.seed)
-    torch.manual_seed(args.seed)
-    if args.n_gpu > 0:
-        torch.cuda.manual_seed_all(args.seed)
-
-
-def _rotate_checkpoints(args, checkpoint_prefix, use_mtime=False):
-    if not args.save_total_limit:
-        return
-    if args.save_total_limit <= 0:
-        return
-
-    # Check if we should delete older checkpoint(s)
-    glob_checkpoints = glob.glob(os.path.join(args.output_dir, '{}-*'.format(checkpoint_prefix)))
-    if len(glob_checkpoints) <= args.save_total_limit:
-        return
-
-    ordering_and_checkpoint_path = []
-    for path in glob_checkpoints:
-        if use_mtime:
-            ordering_and_checkpoint_path.append((os.path.getmtime(path), path))
-        else:
-            regex_match = re.match('.*{}-([0-9]+)'.format(checkpoint_prefix), path)
-            if regex_match and regex_match.groups():
-                ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))
-
-    checkpoints_sorted = sorted(ordering_and_checkpoint_path)
-    checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]
-    number_of_checkpoints_to_delete = max(0, len(checkpoints_sorted) - args.save_total_limit)
-    checkpoints_to_be_deleted = checkpoints_sorted[:number_of_checkpoints_to_delete]
-    for checkpoint in checkpoints_to_be_deleted:
-        logger.info("Deleting older checkpoint [{}] due to args.save_total_limit".format(checkpoint))
-        shutil.rmtree(checkpoint)
-
-
-def mask_tokens(inputs, tokenizer, args):
-    """ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. """
-    labels = inputs.clone()
-    # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
-    probability_matrix = torch.full(labels.shape, args.mlm_probability)
-    special_tokens_mask = [tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()]
-    probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
-    masked_indices = torch.bernoulli(probability_matrix).bool()
-    labels[~masked_indices] = -1  # We only compute loss on masked tokens
-
-    # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
-    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
-    inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)
-
-    # 10% of the time, we replace masked input tokens with random word
-    indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
-    random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
-    inputs[indices_random] = random_words[indices_random]
-
-    # The rest of the time (10% of the time) we keep the masked input tokens unchanged
-    return inputs, labels
-
-
-def train(args, train_dataset, model, tokenizer):
-    """ Train the model """
-    if args.local_rank in [-1, 0]:
-        tb_writer = SummaryWriter()
-
-    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
-    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
-    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
-
-    if args.max_steps > 0:
-        t_total = args.max_steps
-        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
-    else:
-        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
-
-    # Prepare optimizer and schedule (linear warmup and decay)
-    no_decay = ['bias', 'LayerNorm.weight']
-    optimizer_grouped_parameters = [
-        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
-        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-        ]
-    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
-    if args.fp16:
-        try:
-            from apex import amp
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
-
-    # multi-gpu training (should be after apex fp16 initialization)
-    if args.n_gpu > 1:
-        model = torch.nn.DataParallel(model)
-
-    # Distributed training (should be after apex fp16 initialization)
-    if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
-                                                          output_device=args.local_rank,
-                                                          find_unused_parameters=True)
-
-    # Train!
-    logger.info("***** Running training *****")
-    logger.info("  Num examples = %d", len(train_dataset))
-    logger.info("  Num Epochs = %d", args.num_train_epochs)
-    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
-    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
-                   args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
-    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
-    logger.info("  Total optimization steps = %d", t_total)
-
-    global_step = 0
-    tr_loss, logging_loss = 0.0, 0.0
-    model.zero_grad()
-    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
-    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
-    for _ in train_iterator:
-        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
-        for step, batch in enumerate(epoch_iterator):
-            inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch)
-            inputs = inputs.to(args.device)
-            labels = labels.to(args.device)
-            model.train()
-            outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels)
-            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
-
-            if args.n_gpu > 1:
-                loss = loss.mean()  # mean() to average on multi-gpu parallel training
-            if args.gradient_accumulation_steps > 1:
-                loss = loss / args.gradient_accumulation_steps
-
-            if args.fp16:
-                with amp.scale_loss(loss, optimizer) as scaled_loss:
-                    scaled_loss.backward()
-            else:
-                loss.backward()
-
-            tr_loss += loss.item()
-            if (step + 1) % args.gradient_accumulation_steps == 0:
-                if args.fp16:
-                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
-                else:
-                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
-                optimizer.step()
-                scheduler.step()  # Update learning rate schedule
-                model.zero_grad()
-                global_step += 1
-
-                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
-                    # Log metrics
-                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
-                        results = evaluate(args, model, tokenizer)
-                        for key, value in results.items():
-                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
-                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
-                    tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
-                    logging_loss = tr_loss
-
-                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
-                    checkpoint_prefix = 'checkpoint'
-                    # Save model checkpoint
-                    output_dir = os.path.join(args.output_dir, '{}-{}'.format(checkpoint_prefix, global_step))
-                    if not os.path.exists(output_dir):
-                        os.makedirs(output_dir)
-                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
-                    model_to_save.save_pretrained(output_dir)
-                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
-                    logger.info("Saving model checkpoint to %s", output_dir)
-
-                    _rotate_checkpoints(args, checkpoint_prefix)
-
-            if args.max_steps > 0 and global_step > args.max_steps:
-                epoch_iterator.close()
-                break
-        if args.max_steps > 0 and global_step > args.max_steps:
-            train_iterator.close()
-            break
-
-    if args.local_rank in [-1, 0]:
-        tb_writer.close()
-
-    return global_step, tr_loss / global_step
-
-
-def evaluate(args, model, tokenizer, prefix=""):
-    # Loop to handle MNLI double evaluation (matched, mis-matched)
-    eval_output_dir = args.output_dir
-
-    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)
-
-    if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
-        os.makedirs(eval_output_dir)
-
-    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
-    # Note that DistributedSampler samples randomly
-    eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
-    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
-
-    # Eval!
-    logger.info("***** Running evaluation {} *****".format(prefix))
-    logger.info("  Num examples = %d", len(eval_dataset))
-    logger.info("  Batch size = %d", args.eval_batch_size)
-    eval_loss = 0.0
-    nb_eval_steps = 0
-    model.eval()
-
-    for batch in tqdm(eval_dataloader, desc="Evaluating"):
-        batch = batch.to(args.device)
-
-        with torch.no_grad():
-            outputs = model(batch, masked_lm_labels=batch) if args.mlm else model(batch, labels=batch)
-            lm_loss = outputs[0]
-            eval_loss += lm_loss.mean().item()
-        nb_eval_steps += 1
-
-    eval_loss = eval_loss / nb_eval_steps
-    perplexity = torch.exp(torch.tensor(eval_loss))
-
-    result = {
-        "perplexity": perplexity
-    }
-
-    output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
-    with open(output_eval_file, "w") as writer:
-        logger.info("***** Eval results {} *****".format(prefix))
-        for key in sorted(result.keys()):
-            logger.info("  %s = %s", key, str(result[key]))
-            writer.write("%s = %s\n" % (key, str(result[key])))
-
-    return result
-
-
-def main():
-    parser = argparse.ArgumentParser()
-
-    ## Required parameters
-    parser.add_argument("--train_data_file", default=None, type=str, required=True,
-                        help="The input training data file (a text file).")
-    parser.add_argument("--output_dir", default=None, type=str, required=True,
-                        help="The output directory where the model predictions and checkpoints will be written.")
-
-    ## Other parameters
-    parser.add_argument("--eval_data_file", default=None, type=str,
-                        help="An optional input evaluation data file to evaluate the perplexity on (a text file).")
-
-    parser.add_argument("--model_type", default="bert", type=str,
-                        help="The model architecture to be fine-tuned.")
-    parser.add_argument("--model_name_or_path", default="bert-base-cased", type=str,
-                        help="The model checkpoint for weights initialization.")
-
-    parser.add_argument("--mlm", action='store_true',
-                        help="Train with masked-language modeling loss instead of language modeling.")
-    parser.add_argument("--mlm_probability", type=float, default=0.15,
-                        help="Ratio of tokens to mask for masked language modeling loss")
-
-    parser.add_argument("--config_name", default="", type=str,
-                        help="Optional pretrained config name or path if not the same as model_name_or_path")
-    parser.add_argument("--tokenizer_name", default="", type=str,
-                        help="Optional pretrained tokenizer name or path if not the same as model_name_or_path")
-    parser.add_argument("--cache_dir", default="", type=str,
-                        help="Optional directory to store the pre-trained models downloaded from s3 (instread of the default one)")
-    parser.add_argument("--block_size", default=-1, type=int,
-                        help="Optional input sequence length after tokenization."
-                             "The training dataset will be truncated in block of this size for training."
-                             "Default to the model max input length for single sentence inputs (take into account special tokens).")
-    parser.add_argument("--do_train", action='store_true',
-                        help="Whether to run training.")
-    parser.add_argument("--do_eval", action='store_true',
-                        help="Whether to run eval on the dev set.")
-    parser.add_argument("--evaluate_during_training", action='store_true',
-                        help="Run evaluation during training at each logging step.")
-    parser.add_argument("--do_lower_case", action='store_true',
-                        help="Set this flag if you are using an uncased model.")
-
-    parser.add_argument("--per_gpu_train_batch_size", default=4, type=int,
-                        help="Batch size per GPU/CPU for training.")
-    parser.add_argument("--per_gpu_eval_batch_size", default=4, type=int,
-                        help="Batch size per GPU/CPU for evaluation.")
-    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
-                        help="Number of updates steps to accumulate before performing a backward/update pass.")
-    parser.add_argument("--learning_rate", default=5e-5, type=float,
-                        help="The initial learning rate for Adam.")
-    parser.add_argument("--weight_decay", default=0.0, type=float,
-                        help="Weight deay if we apply some.")
-    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
-                        help="Epsilon for Adam optimizer.")
-    parser.add_argument("--max_grad_norm", default=1.0, type=float,
-                        help="Max gradient norm.")
-    parser.add_argument("--num_train_epochs", default=1.0, type=float,
-                        help="Total number of training epochs to perform.")
-    parser.add_argument("--max_steps", default=-1, type=int,
-                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
-    parser.add_argument("--warmup_steps", default=0, type=int,
-                        help="Linear warmup over warmup_steps.")
-
-    parser.add_argument('--logging_steps', type=int, default=50,
-                        help="Log every X updates steps.")
-    parser.add_argument('--save_steps', type=int, default=50,
-                        help="Save checkpoint every X updates steps.")
-    parser.add_argument('--save_total_limit', type=int, default=None,
-                        help='Limit the total amount of checkpoints, delete the older checkpoints in the output_dir, does not delete by default')
-    parser.add_argument("--eval_all_checkpoints", action='store_true',
-                        help="Evaluate all checkpoints starting with the same prefix as model_name_or_path ending and ending with step number")
-    parser.add_argument("--no_cuda", action='store_true',
-                        help="Avoid using CUDA when available")
-    parser.add_argument('--overwrite_output_dir', action='store_true',
-                        help="Overwrite the content of the output directory")
-    parser.add_argument('--overwrite_cache', action='store_true',
-                        help="Overwrite the cached training and evaluation sets")
-    parser.add_argument('--seed', type=int, default=42,
-                        help="random seed for initialization")
-
-    parser.add_argument('--fp16', action='store_true',
-                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
-    parser.add_argument('--fp16_opt_level', type=str, default='O1',
-                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
-                             "See details at https://nvidia.github.io/apex/amp.html")
-    parser.add_argument("--local_rank", type=int, default=-1,
-                        help="For distributed training: local_rank")
-    parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.")
-    parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
-    args = parser.parse_args()
-
-    if args.model_type in ["bert", "roberta", "distilbert"] and not args.mlm:
-        raise ValueError("BERT and RoBERTa do not have LM heads but masked LM heads. They must be run using the --mlm "
-                         "flag (masked language modeling).")
-    if args.eval_data_file is None and args.do_eval:
-        raise ValueError("Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
-                         "or remove the --do_eval argument.")
-
-    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
-        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
-
-    # Setup distant debugging if needed
-    if args.server_ip and args.server_port:
-        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
-        import ptvsd
-        print("Waiting for debugger attach")
-        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
-        ptvsd.wait_for_attach()
-
-    # Setup CUDA, GPU & distributed training
-    if args.local_rank == -1 or args.no_cuda:
-        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        args.n_gpu = torch.cuda.device_count()
-    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
-        torch.cuda.set_device(args.local_rank)
-        device = torch.device("cuda", args.local_rank)
-        torch.distributed.init_process_group(backend='nccl')
-        args.n_gpu = 1
-    args.device = device
-
-    # Setup logging
-    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
-                        datefmt = '%m/%d/%Y %H:%M:%S',
-                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
-    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
-                    args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
-
-    # Set seed
-    set_seed(args)
-
-    # Load pretrained model and tokenizer
-    if args.local_rank not in [-1, 0]:
-        torch.distributed.barrier()  # Barrier to make sure only the first process in distributed training download model & vocab
-
-    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
-    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
-    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case)
-    if args.block_size <= 0:
-        args.block_size = tokenizer.max_len_single_sentence  # Our input block size will be the max possible for the model
-    args.block_size = min(args.block_size, tokenizer.max_len_single_sentence)
-    model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config)
-    model.to(args.device)
-
-    if args.local_rank == 0:
-        torch.distributed.barrier()  # End of barrier to make sure only the first process in distributed training download model & vocab
-
-    logger.info("Training/evaluation parameters %s", args)
-
-    # Training
-    if args.do_train:
-        if args.local_rank not in [-1, 0]:
-            torch.distributed.barrier()  # Barrier to make sure only the first process in distributed training process the dataset, and the others will use the cache
-
-        train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False)
-
-        if args.local_rank == 0:
-            torch.distributed.barrier()
-
-        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
-        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
-
-
-    # Saving best-practices: if you use save_pretrained for the model and tokenizer, you can reload them using from_pretrained()
-    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
-        # Create output directory if needed
-        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
-            os.makedirs(args.output_dir)
-
-        logger.info("Saving model checkpoint to %s", args.output_dir)
-        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
-        # They can then be reloaded using `from_pretrained()`
-        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
-        model_to_save.save_pretrained(args.output_dir)
-        tokenizer.save_pretrained(args.output_dir)
-
-        # Good practice: save your training arguments together with the trained model
-        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
-
-        # Load a trained model and vocabulary that you have fine-tuned
-        model = model_class.from_pretrained(args.output_dir)
-        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
-        model.to(args.device)
-
-
-    # Evaluation
-    results = {}
-    if args.do_eval and args.local_rank in [-1, 0]:
-        checkpoints = [args.output_dir]
-        if args.eval_all_checkpoints:
-            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
-            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
-        logger.info("Evaluate the following checkpoints: %s", checkpoints)
-        for checkpoint in checkpoints:
-            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
-            prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""
-            
-            model = model_class.from_pretrained(checkpoint)
-            model.to(args.device)
-            result = evaluate(args, model, tokenizer, prefix=prefix)
-            result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
-            results.update(result)
-
-    return results
-
-
-if __name__ == "__main__":
-    main()
--- a/examples/run_multiple_choice.py
+++ b/examples/run_multiple_choice.py
@@ -15,7 +15,6 @@
 # limitations under the License.
 """ Finetuning the library models for multiple choice (Bert, Roberta, XLNet)."""

-from __future__ import absolute_import, division, print_function

 import argparse
 import glob
@@ -23,43 +22,50 @@ import logging
 import os
 import random

-
 import numpy as np
 import torch
-from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
-                              TensorDataset)
+from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
 from torch.utils.data.distributed import DistributedSampler
-from tensorboardX import SummaryWriter
 from tqdm import tqdm, trange

-from transformers import (WEIGHTS_NAME, BertConfig,
-                                  BertForMultipleChoice, BertTokenizer,
-                                  XLNetConfig, XLNetForMultipleChoice,
-                                  XLNetTokenizer, RobertaConfig,
-                                  RobertaForMultipleChoice, RobertaTokenizer)
+from transformers import (
+    WEIGHTS_NAME,
+    AdamW,
+    BertConfig,
+    BertForMultipleChoice,
+    BertTokenizer,
+    RobertaConfig,
+    RobertaForMultipleChoice,
+    RobertaTokenizer,
+    XLNetConfig,
+    XLNetForMultipleChoice,
+    XLNetTokenizer,
+    get_linear_schedule_with_warmup,
+)
+from utils_multiple_choice import convert_examples_to_features, processors

-from transformers import AdamW, WarmupLinearSchedule

-from utils_multiple_choice import (convert_examples_to_features, processors)
+try:
+    from torch.utils.tensorboard import SummaryWriter
+except ImportError:
+    from tensorboardX import SummaryWriter
+

 logger = logging.getLogger(__name__)

-ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, RobertaConfig)), ())
+ALL_MODELS = sum(
+    (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, RobertaConfig)), ()
+)

 MODEL_CLASSES = {
-    'bert': (BertConfig, BertForMultipleChoice, BertTokenizer),
-    'xlnet': (XLNetConfig, XLNetForMultipleChoice, XLNetTokenizer),
-    'roberta': (RobertaConfig, RobertaForMultipleChoice, RobertaTokenizer)
+    "bert": (BertConfig, BertForMultipleChoice, BertTokenizer),
+    "xlnet": (XLNetConfig, XLNetForMultipleChoice, XLNetTokenizer),
+    "roberta": (RobertaConfig, RobertaForMultipleChoice, RobertaTokenizer),
 }

+
 def select_field(features, field):
-    return [
-        [
-            choice[field]
-            for choice in feature.choices_features
-        ]
-        for feature in features
-    ]
+    return [[choice[field] for choice in feature.choices_features] for feature in features]


 def simple_accuracy(preds, labels):
@@ -90,13 +96,18 @@ def train(args, train_dataset, model, tokenizer):
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
-    no_decay = ['bias', 'LayerNorm.weight']
+    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
-        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
-        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-        ]
+        {
+            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+            "weight_decay": args.weight_decay,
+        },
+        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
+    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
+    scheduler = get_linear_schedule_with_warmup(
+        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
+    )
    if args.fp16:
        try:
            from apex import amp
@@ -110,41 +121,49 @@ def train(args, train_dataset, model, tokenizer):

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
-                                                          output_device=args.local_rank,
-                                                          find_unused_parameters=True)
+        model = torch.nn.parallel.DistributedDataParallel(
+            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
+        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
-    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
-                   args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
+    logger.info(
+        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
+        args.train_batch_size
+        * args.gradient_accumulation_steps
+        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
+    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
-    best_dev_acc, best_dev_loss = 0.0, 99999999999.0
+    best_dev_acc = 0.0
    best_steps = 0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
-    set_seed(args)  # Added here for reproductibility (even between python 2 and 3)
+    set_seed(args)  # Added here for reproductibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
-            inputs = {'input_ids':      batch[0],
-                      'attention_mask': batch[1],
-                      'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM don't use segment_ids
-                      'labels':         batch[3]}
+            inputs = {
+                "input_ids": batch[0],
+                "attention_mask": batch[1],
+                "token_type_ids": batch[2]
+                if args.model_type in ["bert", "xlnet"]
+                else None,  # XLM don't use segment_ids
+                "labels": batch[3],
+            }
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
-                loss = loss.mean() # mean() to average on multi-gpu parallel training
+                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

@@ -166,33 +185,45 @@ def train(args, train_dataset, model, tokenizer):

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
-                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
+                    if (
+                        args.local_rank == -1 and args.evaluate_during_training
+                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
-                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
+                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                        if results["eval_acc"] > best_dev_acc:
                            best_dev_acc = results["eval_acc"]
-                            best_dev_loss = results["eval_loss"]
                            best_steps = global_step
                            if args.do_test:
                                results_test = evaluate(args, model, tokenizer, test=True)
                                for key, value in results_test.items():
-                                    tb_writer.add_scalar('test_{}'.format(key), value, global_step)
-                                logger.info("test acc: %s, loss: %s, global steps: %s", str(results_test['eval_acc']), str(results_test['eval_loss']), str(global_step))
-                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
-                    tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
-                    logger.info("Average loss: %s at global step: %s", str((tr_loss - logging_loss)/args.logging_steps), str(global_step))
+                                    tb_writer.add_scalar("test_{}".format(key), value, global_step)
+                                logger.info(
+                                    "test acc: %s, loss: %s, global steps: %s",
+                                    str(results_test["eval_acc"]),
+                                    str(results_test["eval_loss"]),
+                                    str(global_step),
+                                )
+                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
+                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
+                    logger.info(
+                        "Average loss: %s at global step: %s",
+                        str((tr_loss - logging_loss) / args.logging_steps),
+                        str(global_step),
+                    )
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
-                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
+                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
-                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+                    model_to_save = (
+                        model.module if hasattr(model, "module") else model
+                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_vocabulary(output_dir)
-                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
+                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
@@ -221,9 +252,13 @@ def evaluate(args, model, tokenizer, prefix="", test=False):

        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
        # Note that DistributedSampler samples randomly
-        eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
+        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

+        # multi-gpu evaluate
+        if args.n_gpu > 1:
+            model = torch.nn.DataParallel(model)
+
        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info("  Num examples = %d", len(eval_dataset))
@@ -237,10 +272,14 @@ def evaluate(args, model, tokenizer, prefix="", test=False):
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
-                inputs = {'input_ids':      batch[0],
-                          'attention_mask': batch[1],
-                          'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM don't use segment_ids
-                          'labels':         batch[3]}
+                inputs = {
+                    "input_ids": batch[0],
+                    "attention_mask": batch[1],
+                    "token_type_ids": batch[2]
+                    if args.model_type in ["bert", "xlnet"]
+                    else None,  # XLM don't use segment_ids
+                    "labels": batch[3],
+                }
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]

@@ -248,10 +287,10 @@ def evaluate(args, model, tokenizer, prefix="", test=False):
            nb_eval_steps += 1
            if preds is None:
                preds = logits.detach().cpu().numpy()
-                out_label_ids = inputs['labels'].detach().cpu().numpy()
+                out_label_ids = inputs["labels"].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
-                out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)
+                out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)

        eval_loss = eval_loss / nb_eval_steps
        preds = np.argmax(preds, axis=1)
@@ -264,8 +303,14 @@ def evaluate(args, model, tokenizer, prefix="", test=False):
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results {} *****".format(str(prefix) + " is test:" + str(test)))
            writer.write("model           =%s\n" % str(args.model_name_or_path))
-            writer.write("total batch size=%d\n" % (args.per_gpu_train_batch_size * args.gradient_accumulation_steps *
-                         (torch.distributed.get_world_size() if args.local_rank != -1 else 1)))
+            writer.write(
+                "total batch size=%d\n"
+                % (
+                    args.per_gpu_train_batch_size
+                    * args.gradient_accumulation_steps
+                    * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)
+                )
+            )
            writer.write("train num epochs=%d\n" % args.num_train_epochs)
            writer.write("fp16            =%s\n" % args.fp16)
            writer.write("max seq length  =%d\n" % args.max_seq_length)
@@ -282,17 +327,21 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False, test=False):
    processor = processors[task]()
    # Load data features from cache or dataset file
    if evaluate:
-        cached_mode = 'dev'
+        cached_mode = "dev"
    elif test:
-        cached_mode = 'test'
+        cached_mode = "test"
    else:
-        cached_mode = 'train'
-    assert (evaluate == True and test == True) == False
-    cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}'.format(
-        cached_mode,
-        list(filter(None, args.model_name_or_path.split('/'))).pop(),
-        str(args.max_seq_length),
-        str(task)))
+        cached_mode = "train"
+    assert not (evaluate and test)
+    cached_features_file = os.path.join(
+        args.data_dir,
+        "cached_{}_{}_{}_{}".format(
+            cached_mode,
+            list(filter(None, args.model_name_or_path.split("/"))).pop(),
+            str(args.max_seq_length),
+            str(task),
+        ),
+    )
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
@@ -311,8 +360,8 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False, test=False):
            label_list,
            args.max_seq_length,
            tokenizer,
-            pad_on_left=bool(args.model_type in ['xlnet']),                 # pad on the left for xlnet
-            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0
+            pad_on_left=bool(args.model_type in ["xlnet"]),  # pad on the left for xlnet
+            pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
@@ -322,9 +371,9 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False, test=False):
        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

    # Convert to Tensors and build dataset
-    all_input_ids = torch.tensor(select_field(features, 'input_ids'), dtype=torch.long)
-    all_input_mask = torch.tensor(select_field(features, 'input_mask'), dtype=torch.long)
-    all_segment_ids = torch.tensor(select_field(features, 'segment_ids'), dtype=torch.long)
+    all_input_ids = torch.tensor(select_field(features, "input_ids"), dtype=torch.long)
+    all_input_mask = torch.tensor(select_field(features, "input_mask"), dtype=torch.long)
+    all_segment_ids = torch.tensor(select_field(features, "segment_ids"), dtype=torch.long)
    all_label_ids = torch.tensor([f.label for f in features], dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
@@ -334,92 +383,151 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False, test=False):
 def main():
    parser = argparse.ArgumentParser()

-    ## Required parameters
-    parser.add_argument("--data_dir", default=None, type=str, required=True,
-                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
-    parser.add_argument("--model_type", default=None, type=str, required=True,
-                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
-    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
-                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
-    parser.add_argument("--task_name", default=None, type=str, required=True,
-                        help="The name of the task to train selected in the list: " + ", ".join(processors.keys()))
-    parser.add_argument("--output_dir", default=None, type=str, required=True,
-                        help="The output directory where the model predictions and checkpoints will be written.")
+    # Required parameters
+    parser.add_argument(
+        "--data_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The input data dir. Should contain the .tsv files (or other data files) for the task.",
+    )
+    parser.add_argument(
+        "--model_type",
+        default=None,
+        type=str,
+        required=True,
+        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
+    )
+    parser.add_argument(
+        "--model_name_or_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
+    )
+    parser.add_argument(
+        "--task_name",
+        default=None,
+        type=str,
+        required=True,
+        help="The name of the task to train selected in the list: " + ", ".join(processors.keys()),
+    )
+    parser.add_argument(
+        "--output_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )

-    ## Other parameters
-    parser.add_argument("--config_name", default="", type=str,
-                        help="Pretrained config name or path if not the same as model_name")
-    parser.add_argument("--tokenizer_name", default="", type=str,
-                        help="Pretrained tokenizer name or path if not the same as model_name")
-    parser.add_argument("--cache_dir", default="", type=str,
-                        help="Where do you want to store the pre-trained models downloaded from s3")
-    parser.add_argument("--max_seq_length", default=128, type=int,
-                        help="The maximum total input sequence length after tokenization. Sequences longer "
-                             "than this will be truncated, sequences shorter will be padded.")
-    parser.add_argument("--do_train", action='store_true',
-                        help="Whether to run training.")
-    parser.add_argument("--do_eval", action='store_true',
-                        help="Whether to run eval on the dev set.")
-    parser.add_argument("--do_test", action='store_true', help='Whether to run test on the test set')
-    parser.add_argument("--evaluate_during_training", action='store_true',
-                        help="Run evaluation during training at each logging step.")
-    parser.add_argument("--do_lower_case", action='store_true',
-                        help="Set this flag if you are using an uncased model.")
+    # Other parameters
+    parser.add_argument(
+        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        default="",
+        type=str,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        default="",
+        type=str,
+        help="Where do you want to store the pre-trained models downloaded from s3",
+    )
+    parser.add_argument(
+        "--max_seq_length",
+        default=128,
+        type=int,
+        help="The maximum total input sequence length after tokenization. Sequences longer "
+        "than this will be truncated, sequences shorter will be padded.",
+    )
+    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
+    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
+    parser.add_argument("--do_test", action="store_true", help="Whether to run test on the test set")
+    parser.add_argument(
+        "--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step."
+    )
+    parser.add_argument(
+        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
+    )

-    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
-                        help="Batch size per GPU/CPU for training.")
-    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
-                        help="Batch size per GPU/CPU for evaluation.")
-    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
-                        help="Number of updates steps to accumulate before performing a backward/update pass.")
-    parser.add_argument("--learning_rate", default=5e-5, type=float,
-                        help="The initial learning rate for Adam.")
-    parser.add_argument("--weight_decay", default=0.0, type=float,
-                        help="Weight deay if we apply some.")
-    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
-                        help="Epsilon for Adam optimizer.")
-    parser.add_argument("--max_grad_norm", default=1.0, type=float,
-                        help="Max gradient norm.")
-    parser.add_argument("--num_train_epochs", default=3.0, type=float,
-                        help="Total number of training epochs to perform.")
-    parser.add_argument("--max_steps", default=-1, type=int,
-                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
-    parser.add_argument("--warmup_steps", default=0, type=int,
-                        help="Linear warmup over warmup_steps.")
+    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
+    parser.add_argument(
+        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
+    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.")
+    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument(
+        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
+    )
+    parser.add_argument(
+        "--max_steps",
+        default=-1,
+        type=int,
+        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
+    )
+    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")

-    parser.add_argument('--logging_steps', type=int, default=50,
-                        help="Log every X updates steps.")
-    parser.add_argument('--save_steps', type=int, default=50,
-                        help="Save checkpoint every X updates steps.")
-    parser.add_argument("--eval_all_checkpoints", action='store_true',
-                        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
-    parser.add_argument("--no_cuda", action='store_true',
-                        help="Avoid using CUDA when available")
-    parser.add_argument('--overwrite_output_dir', action='store_true',
-                        help="Overwrite the content of the output directory")
-    parser.add_argument('--overwrite_cache', action='store_true',
-                        help="Overwrite the cached training and evaluation sets")
-    parser.add_argument('--seed', type=int, default=42,
-                        help="random seed for initialization")
+    parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.")
+    parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.")
+    parser.add_argument(
+        "--eval_all_checkpoints",
+        action="store_true",
+        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
+    )
+    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
+    parser.add_argument(
+        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
+    )
+    parser.add_argument(
+        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
+    )
+    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")

-    parser.add_argument('--fp16', action='store_true',
-                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
-    parser.add_argument('--fp16_opt_level', type=str, default='O1',
-                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
-                             "See details at https://nvidia.github.io/apex/amp.html")
-    parser.add_argument("--local_rank", type=int, default=-1,
-                        help="For distributed training: local_rank")
-    parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.")
-    parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
+    parser.add_argument(
+        "--fp16",
+        action="store_true",
+        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
+    )
+    parser.add_argument(
+        "--fp16_opt_level",
+        type=str,
+        default="O1",
+        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
+        "See details at https://nvidia.github.io/apex/amp.html",
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
+    parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
    args = parser.parse_args()

-    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
-        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
+    if (
+        os.path.exists(args.output_dir)
+        and os.listdir(args.output_dir)
+        and args.do_train
+        and not args.overwrite_output_dir
+    ):
+        raise ValueError(
+            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
+                args.output_dir
+            )
+        )

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
+
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()
@@ -431,16 +539,24 @@ def main():
    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
-        torch.distributed.init_process_group(backend='nccl')
+        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
-    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
-                        datefmt = '%m/%d/%Y %H:%M:%S',
-                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
-    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
-                    args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
+    )
+    logger.warning(
+        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+        args.local_rank,
+        device,
+        args.n_gpu,
+        bool(args.local_rank != -1),
+        args.fp16,
+    )

    # Set seed
    set_seed(args)
@@ -459,9 +575,23 @@ def main():

    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
-    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, num_labels=num_labels, finetuning_task=args.task_name)
-    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case)
-    model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config)
+    config = config_class.from_pretrained(
+        args.config_name if args.config_name else args.model_name_or_path,
+        num_labels=num_labels,
+        finetuning_task=args.task_name,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
+    tokenizer = tokenizer_class.from_pretrained(
+        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
+        do_lower_case=args.do_lower_case,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
+    model = model_class.from_pretrained(
+        args.model_name_or_path,
+        from_tf=bool(".ckpt" in args.model_name_or_path),
+        config=config,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )

    if args.local_rank == 0:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
@@ -477,7 +607,6 @@ def main():
        global_step, tr_loss, best_steps = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

-
    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Create output directory if needed
@@ -487,19 +616,20 @@ def main():
        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
-        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+        model_to_save = (
+            model.module if hasattr(model, "module") else model
+        )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
-        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
+        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
        model = model_class.from_pretrained(args.output_dir)
        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
        model.to(args.device)

-
    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
@@ -507,17 +637,19 @@ def main():
            args.output_dir = args.model_name_or_path
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
-            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
+            checkpoints = list(
+                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
+            )
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
-            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
-            prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""
-            
+            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
+            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""
+
            model = model_class.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, tokenizer, prefix=prefix)
-            result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
+            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
            results.update(result)

    if args.do_test and args.local_rank in [-1, 0]:
@@ -529,13 +661,13 @@ def main():
        #     logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
-            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
-            prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""
-            
+            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
+            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""
+
            model = model_class.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, tokenizer, prefix=prefix, test=True)
-            result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
+            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
            results.update(result)
    if best_steps:
        logger.info("best steps of eval acc is the following checkpoints: %s", best_steps)
--- a/examples/run_ner.py
+++ b/examples/run_ner.py
@@ -0,0 +1,688 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Fine-tuning the library models for named entity recognition on CoNLL-2003 (Bert or Roberta). """
+
+
+import argparse
+import glob
+import logging
+import os
+import random
+
+import numpy as np
+import torch
+from seqeval.metrics import f1_score, precision_score, recall_score
+from torch.nn import CrossEntropyLoss
+from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
+from torch.utils.data.distributed import DistributedSampler
+from tqdm import tqdm, trange
+
+from transformers import (
+    WEIGHTS_NAME,
+    AdamW,
+    BertConfig,
+    BertForTokenClassification,
+    BertTokenizer,
+    CamembertConfig,
+    CamembertForTokenClassification,
+    CamembertTokenizer,
+    DistilBertConfig,
+    DistilBertForTokenClassification,
+    DistilBertTokenizer,
+    RobertaConfig,
+    RobertaForTokenClassification,
+    RobertaTokenizer,
+    XLMRobertaConfig,
+    XLMRobertaForTokenClassification,
+    XLMRobertaTokenizer,
+    get_linear_schedule_with_warmup,
+)
+from utils_ner import convert_examples_to_features, get_labels, read_examples_from_file
+
+
+try:
+    from torch.utils.tensorboard import SummaryWriter
+except ImportError:
+    from tensorboardX import SummaryWriter
+
+
+logger = logging.getLogger(__name__)
+
+ALL_MODELS = sum(
+    (
+        tuple(conf.pretrained_config_archive_map.keys())
+        for conf in (BertConfig, RobertaConfig, DistilBertConfig, CamembertConfig, XLMRobertaConfig)
+    ),
+    (),
+)
+
+MODEL_CLASSES = {
+    "bert": (BertConfig, BertForTokenClassification, BertTokenizer),
+    "roberta": (RobertaConfig, RobertaForTokenClassification, RobertaTokenizer),
+    "distilbert": (DistilBertConfig, DistilBertForTokenClassification, DistilBertTokenizer),
+    "camembert": (CamembertConfig, CamembertForTokenClassification, CamembertTokenizer),
+    "xlmroberta": (XLMRobertaConfig, XLMRobertaForTokenClassification, XLMRobertaTokenizer),
+}
+
+
+def set_seed(args):
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+    if args.n_gpu > 0:
+        torch.cuda.manual_seed_all(args.seed)
+
+
+def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
+    """ Train the model """
+    if args.local_rank in [-1, 0]:
+        tb_writer = SummaryWriter()
+
+    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
+    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
+    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
+
+    if args.max_steps > 0:
+        t_total = args.max_steps
+        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
+    else:
+        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
+
+    # Prepare optimizer and schedule (linear warmup and decay)
+    no_decay = ["bias", "LayerNorm.weight"]
+    optimizer_grouped_parameters = [
+        {
+            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+            "weight_decay": args.weight_decay,
+        },
+        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
+    ]
+    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
+    scheduler = get_linear_schedule_with_warmup(
+        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
+    )
+
+    # Check if saved optimizer or scheduler states exist
+    if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
+        os.path.join(args.model_name_or_path, "scheduler.pt")
+    ):
+        # Load in optimizer and scheduler states
+        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
+        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))
+
+    if args.fp16:
+        try:
+            from apex import amp
+        except ImportError:
+            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
+        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
+
+    # multi-gpu training (should be after apex fp16 initialization)
+    if args.n_gpu > 1:
+        model = torch.nn.DataParallel(model)
+
+    # Distributed training (should be after apex fp16 initialization)
+    if args.local_rank != -1:
+        model = torch.nn.parallel.DistributedDataParallel(
+            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
+        )
+
+    # Train!
+    logger.info("***** Running training *****")
+    logger.info("  Num examples = %d", len(train_dataset))
+    logger.info("  Num Epochs = %d", args.num_train_epochs)
+    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
+    logger.info(
+        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
+        args.train_batch_size
+        * args.gradient_accumulation_steps
+        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
+    )
+    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
+    logger.info("  Total optimization steps = %d", t_total)
+
+    global_step = 0
+    epochs_trained = 0
+    steps_trained_in_current_epoch = 0
+    # Check if continuing training from a checkpoint
+    if os.path.exists(args.model_name_or_path):
+        # set global_step to gobal_step of last saved checkpoint from model path
+        try:
+            global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
+        except ValueError:
+            global_step = 0
+        epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
+        steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
+
+        logger.info("  Continuing training from checkpoint, will skip to saved global_step")
+        logger.info("  Continuing training from epoch %d", epochs_trained)
+        logger.info("  Continuing training from global step %d", global_step)
+        logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
+
+    tr_loss, logging_loss = 0.0, 0.0
+    model.zero_grad()
+    train_iterator = trange(
+        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
+    )
+    set_seed(args)  # Added here for reproductibility
+    for _ in train_iterator:
+        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
+        for step, batch in enumerate(epoch_iterator):
+
+            # Skip past any already trained steps if resuming training
+            if steps_trained_in_current_epoch > 0:
+                steps_trained_in_current_epoch -= 1
+                continue
+
+            model.train()
+            batch = tuple(t.to(args.device) for t in batch)
+            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
+            if args.model_type != "distilbert":
+                inputs["token_type_ids"] = (
+                    batch[2] if args.model_type in ["bert", "xlnet"] else None
+                )  # XLM and RoBERTa don"t use segment_ids
+
+            outputs = model(**inputs)
+            loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)
+
+            if args.n_gpu > 1:
+                loss = loss.mean()  # mean() to average on multi-gpu parallel training
+            if args.gradient_accumulation_steps > 1:
+                loss = loss / args.gradient_accumulation_steps
+
+            if args.fp16:
+                with amp.scale_loss(loss, optimizer) as scaled_loss:
+                    scaled_loss.backward()
+            else:
+                loss.backward()
+
+            tr_loss += loss.item()
+            if (step + 1) % args.gradient_accumulation_steps == 0:
+                if args.fp16:
+                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
+                else:
+                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+
+                scheduler.step()  # Update learning rate schedule
+                optimizer.step()
+                model.zero_grad()
+                global_step += 1
+
+                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
+                    # Log metrics
+                    if (
+                        args.local_rank == -1 and args.evaluate_during_training
+                    ):  # Only evaluate when single GPU otherwise metrics may not average well
+                        results, _ = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="dev")
+                        for key, value in results.items():
+                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
+                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
+                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
+                    logging_loss = tr_loss
+
+                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
+                    # Save model checkpoint
+                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
+                    if not os.path.exists(output_dir):
+                        os.makedirs(output_dir)
+                    model_to_save = (
+                        model.module if hasattr(model, "module") else model
+                    )  # Take care of distributed/parallel training
+                    model_to_save.save_pretrained(output_dir)
+                    tokenizer.save_pretrained(output_dir)
+
+                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
+                    logger.info("Saving model checkpoint to %s", output_dir)
+
+                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
+                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
+                    logger.info("Saving optimizer and scheduler states to %s", output_dir)
+
+            if args.max_steps > 0 and global_step > args.max_steps:
+                epoch_iterator.close()
+                break
+        if args.max_steps > 0 and global_step > args.max_steps:
+            train_iterator.close()
+            break
+
+    if args.local_rank in [-1, 0]:
+        tb_writer.close()
+
+    return global_step, tr_loss / global_step
+
+
+def evaluate(args, model, tokenizer, labels, pad_token_label_id, mode, prefix=""):
+    eval_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode=mode)
+
+    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
+    # Note that DistributedSampler samples randomly
+    eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
+    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
+
+    # multi-gpu evaluate
+    if args.n_gpu > 1:
+        model = torch.nn.DataParallel(model)
+
+    # Eval!
+    logger.info("***** Running evaluation %s *****", prefix)
+    logger.info("  Num examples = %d", len(eval_dataset))
+    logger.info("  Batch size = %d", args.eval_batch_size)
+    eval_loss = 0.0
+    nb_eval_steps = 0
+    preds = None
+    out_label_ids = None
+    model.eval()
+    for batch in tqdm(eval_dataloader, desc="Evaluating"):
+        batch = tuple(t.to(args.device) for t in batch)
+
+        with torch.no_grad():
+            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
+            if args.model_type != "distilbert":
+                inputs["token_type_ids"] = (
+                    batch[2] if args.model_type in ["bert", "xlnet"] else None
+                )  # XLM and RoBERTa don"t use segment_ids
+            outputs = model(**inputs)
+            tmp_eval_loss, logits = outputs[:2]
+
+            if args.n_gpu > 1:
+                tmp_eval_loss = tmp_eval_loss.mean()  # mean() to average on multi-gpu parallel evaluating
+
+            eval_loss += tmp_eval_loss.item()
+        nb_eval_steps += 1
+        if preds is None:
+            preds = logits.detach().cpu().numpy()
+            out_label_ids = inputs["labels"].detach().cpu().numpy()
+        else:
+            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
+            out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)
+
+    eval_loss = eval_loss / nb_eval_steps
+    preds = np.argmax(preds, axis=2)
+
+    label_map = {i: label for i, label in enumerate(labels)}
+
+    out_label_list = [[] for _ in range(out_label_ids.shape[0])]
+    preds_list = [[] for _ in range(out_label_ids.shape[0])]
+
+    for i in range(out_label_ids.shape[0]):
+        for j in range(out_label_ids.shape[1]):
+            if out_label_ids[i, j] != pad_token_label_id:
+                out_label_list[i].append(label_map[out_label_ids[i][j]])
+                preds_list[i].append(label_map[preds[i][j]])
+
+    results = {
+        "loss": eval_loss,
+        "precision": precision_score(out_label_list, preds_list),
+        "recall": recall_score(out_label_list, preds_list),
+        "f1": f1_score(out_label_list, preds_list),
+    }
+
+    logger.info("***** Eval results %s *****", prefix)
+    for key in sorted(results.keys()):
+        logger.info("  %s = %s", key, str(results[key]))
+
+    return results, preds_list
+
+
+def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode):
+    if args.local_rank not in [-1, 0] and not evaluate:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
+
+    # Load data features from cache or dataset file
+    cached_features_file = os.path.join(
+        args.data_dir,
+        "cached_{}_{}_{}".format(
+            mode, list(filter(None, args.model_name_or_path.split("/"))).pop(), str(args.max_seq_length)
+        ),
+    )
+    if os.path.exists(cached_features_file) and not args.overwrite_cache:
+        logger.info("Loading features from cached file %s", cached_features_file)
+        features = torch.load(cached_features_file)
+    else:
+        logger.info("Creating features from dataset file at %s", args.data_dir)
+        examples = read_examples_from_file(args.data_dir, mode)
+        features = convert_examples_to_features(
+            examples,
+            labels,
+            args.max_seq_length,
+            tokenizer,
+            cls_token_at_end=bool(args.model_type in ["xlnet"]),
+            # xlnet has a cls token at the end
+            cls_token=tokenizer.cls_token,
+            cls_token_segment_id=2 if args.model_type in ["xlnet"] else 0,
+            sep_token=tokenizer.sep_token,
+            sep_token_extra=bool(args.model_type in ["roberta"]),
+            # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
+            pad_on_left=bool(args.model_type in ["xlnet"]),
+            # pad on the left for xlnet
+            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
+            pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
+            pad_token_label_id=pad_token_label_id,
+        )
+        if args.local_rank in [-1, 0]:
+            logger.info("Saving features into cached file %s", cached_features_file)
+            torch.save(features, cached_features_file)
+
+    if args.local_rank == 0 and not evaluate:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
+
+    # Convert to Tensors and build dataset
+    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
+    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
+    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
+    all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.long)
+
+    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
+    return dataset
+
+
+def main():
+    parser = argparse.ArgumentParser()
+
+    # Required parameters
+    parser.add_argument(
+        "--data_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The input data dir. Should contain the training files for the CoNLL-2003 NER task.",
+    )
+    parser.add_argument(
+        "--model_type",
+        default=None,
+        type=str,
+        required=True,
+        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
+    )
+    parser.add_argument(
+        "--model_name_or_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
+    )
+    parser.add_argument(
+        "--output_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+
+    # Other parameters
+    parser.add_argument(
+        "--labels",
+        default="",
+        type=str,
+        help="Path to a file containing all labels. If not specified, CoNLL-2003 labels are used.",
+    )
+    parser.add_argument(
+        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        default="",
+        type=str,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        default="",
+        type=str,
+        help="Where do you want to store the pre-trained models downloaded from s3",
+    )
+    parser.add_argument(
+        "--max_seq_length",
+        default=128,
+        type=int,
+        help="The maximum total input sequence length after tokenization. Sequences longer "
+        "than this will be truncated, sequences shorter will be padded.",
+    )
+    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
+    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
+    parser.add_argument("--do_predict", action="store_true", help="Whether to run predictions on the test set.")
+    parser.add_argument(
+        "--evaluate_during_training",
+        action="store_true",
+        help="Whether to run evaluation during training at each logging step.",
+    )
+    parser.add_argument(
+        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
+    )
+
+    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
+    parser.add_argument(
+        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
+    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
+    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument(
+        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
+    )
+    parser.add_argument(
+        "--max_steps",
+        default=-1,
+        type=int,
+        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
+    )
+    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
+
+    parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.")
+    parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.")
+    parser.add_argument(
+        "--eval_all_checkpoints",
+        action="store_true",
+        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
+    )
+    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
+    parser.add_argument(
+        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
+    )
+    parser.add_argument(
+        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
+    )
+    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
+
+    parser.add_argument(
+        "--fp16",
+        action="store_true",
+        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
+    )
+    parser.add_argument(
+        "--fp16_opt_level",
+        type=str,
+        default="O1",
+        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
+        "See details at https://nvidia.github.io/apex/amp.html",
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
+    parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
+    args = parser.parse_args()
+
+    if (
+        os.path.exists(args.output_dir)
+        and os.listdir(args.output_dir)
+        and args.do_train
+        and not args.overwrite_output_dir
+    ):
+        raise ValueError(
+            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
+                args.output_dir
+            )
+        )
+
+    # Setup distant debugging if needed
+    if args.server_ip and args.server_port:
+        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
+        import ptvsd
+
+        print("Waiting for debugger attach")
+        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
+        ptvsd.wait_for_attach()
+
+    # Setup CUDA, GPU & distributed training
+    if args.local_rank == -1 or args.no_cuda:
+        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+        args.n_gpu = torch.cuda.device_count()
+    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
+        torch.cuda.set_device(args.local_rank)
+        device = torch.device("cuda", args.local_rank)
+        torch.distributed.init_process_group(backend="nccl")
+        args.n_gpu = 1
+    args.device = device
+
+    # Setup logging
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
+    )
+    logger.warning(
+        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+        args.local_rank,
+        device,
+        args.n_gpu,
+        bool(args.local_rank != -1),
+        args.fp16,
+    )
+
+    # Set seed
+    set_seed(args)
+
+    # Prepare CONLL-2003 task
+    labels = get_labels(args.labels)
+    num_labels = len(labels)
+    # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later
+    pad_token_label_id = CrossEntropyLoss().ignore_index
+
+    # Load pretrained model and tokenizer
+    if args.local_rank not in [-1, 0]:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
+
+    args.model_type = args.model_type.lower()
+    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
+    config = config_class.from_pretrained(
+        args.config_name if args.config_name else args.model_name_or_path,
+        num_labels=num_labels,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
+    tokenizer = tokenizer_class.from_pretrained(
+        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
+        do_lower_case=args.do_lower_case,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
+    model = model_class.from_pretrained(
+        args.model_name_or_path,
+        from_tf=bool(".ckpt" in args.model_name_or_path),
+        config=config,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
+
+    if args.local_rank == 0:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
+
+    model.to(args.device)
+
+    logger.info("Training/evaluation parameters %s", args)
+
+    # Training
+    if args.do_train:
+        train_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode="train")
+        global_step, tr_loss = train(args, train_dataset, model, tokenizer, labels, pad_token_label_id)
+        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
+
+    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
+    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
+        # Create output directory if needed
+        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
+            os.makedirs(args.output_dir)
+
+        logger.info("Saving model checkpoint to %s", args.output_dir)
+        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
+        # They can then be reloaded using `from_pretrained()`
+        model_to_save = (
+            model.module if hasattr(model, "module") else model
+        )  # Take care of distributed/parallel training
+        model_to_save.save_pretrained(args.output_dir)
+        tokenizer.save_pretrained(args.output_dir)
+
+        # Good practice: save your training arguments together with the trained model
+        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
+
+    # Evaluation
+    results = {}
+    if args.do_eval and args.local_rank in [-1, 0]:
+        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
+        checkpoints = [args.output_dir]
+        if args.eval_all_checkpoints:
+            checkpoints = list(
+                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
+            )
+            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
+        logger.info("Evaluate the following checkpoints: %s", checkpoints)
+        for checkpoint in checkpoints:
+            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
+            model = model_class.from_pretrained(checkpoint)
+            model.to(args.device)
+            result, _ = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="dev", prefix=global_step)
+            if global_step:
+                result = {"{}_{}".format(global_step, k): v for k, v in result.items()}
+            results.update(result)
+        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
+        with open(output_eval_file, "w") as writer:
+            for key in sorted(results.keys()):
+                writer.write("{} = {}\n".format(key, str(results[key])))
+
+    if args.do_predict and args.local_rank in [-1, 0]:
+        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
+        model = model_class.from_pretrained(args.output_dir)
+        model.to(args.device)
+        result, predictions = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="test")
+        # Save results
+        output_test_results_file = os.path.join(args.output_dir, "test_results.txt")
+        with open(output_test_results_file, "w") as writer:
+            for key in sorted(result.keys()):
+                writer.write("{} = {}\n".format(key, str(result[key])))
+        # Save predictions
+        output_test_predictions_file = os.path.join(args.output_dir, "test_predictions.txt")
+        with open(output_test_predictions_file, "w") as writer:
+            with open(os.path.join(args.data_dir, "test.txt"), "r") as f:
+                example_id = 0
+                for line in f:
+                    if line.startswith("-DOCSTART-") or line == "" or line == "\n":
+                        writer.write(line)
+                        if not predictions[example_id]:
+                            example_id += 1
+                    elif predictions[example_id]:
+                        output_line = line.split()[0] + " " + predictions[example_id].pop(0) + "\n"
+                        writer.write(output_line)
+                    else:
+                        logger.warning("Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0])
+
+    return results
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
--- a/examples/run_tf_glue.py
+++ b/examples/run_tf_glue.py
@@ -1,40 +1,105 @@
+import os
+
 import tensorflow as tf
 import tensorflow_datasets
-from transformers import BertTokenizer, TFBertForSequenceClassification, glue_convert_examples_to_features, BertForSequenceClassification

-# Load dataset, tokenizer, model from pretrained model/vocabulary
-tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
-model = TFBertForSequenceClassification.from_pretrained('bert-base-cased')
-data = tensorflow_datasets.load('glue/mrpc')
+from transformers import (
+    BertConfig,
+    BertForSequenceClassification,
+    BertTokenizer,
+    TFBertForSequenceClassification,
+    glue_convert_examples_to_features,
+    glue_processors,
+)
+
+
+# script parameters
+BATCH_SIZE = 32
+EVAL_BATCH_SIZE = BATCH_SIZE * 2
+USE_XLA = False
+USE_AMP = False
+EPOCHS = 3
+
+TASK = "mrpc"
+
+if TASK == "sst-2":
+    TFDS_TASK = "sst2"
+elif TASK == "sts-b":
+    TFDS_TASK = "stsb"
+else:
+    TFDS_TASK = TASK
+
+num_labels = len(glue_processors[TASK]().get_labels())
+print(num_labels)
+
+tf.config.optimizer.set_jit(USE_XLA)
+tf.config.optimizer.set_experimental_options({"auto_mixed_precision": USE_AMP})
+
+# Load tokenizer and model from pretrained model/vocabulary. Specify the number of labels to classify (2+: classification, 1: regression)
+config = BertConfig.from_pretrained("bert-base-cased", num_labels=num_labels)
+tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
+model = TFBertForSequenceClassification.from_pretrained("bert-base-cased", config=config)
+
+# Load dataset via TensorFlow Datasets
+data, info = tensorflow_datasets.load(f"glue/{TFDS_TASK}", with_info=True)
+train_examples = info.splits["train"].num_examples
+
+# MNLI expects either validation_matched or validation_mismatched
+valid_examples = info.splits["validation"].num_examples

 # Prepare dataset for GLUE as a tf.data.Dataset instance
-train_dataset = glue_convert_examples_to_features(data['train'], tokenizer, 128, 'mrpc')
-valid_dataset = glue_convert_examples_to_features(data['validation'], tokenizer, 128, 'mrpc')
-train_dataset = train_dataset.shuffle(100).batch(32).repeat(2)
-valid_dataset = valid_dataset.batch(64)
+train_dataset = glue_convert_examples_to_features(data["train"], tokenizer, 128, TASK)

-# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule 
-optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
-loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
-metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
-model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
+# MNLI expects either validation_matched or validation_mismatched
+valid_dataset = glue_convert_examples_to_features(data["validation"], tokenizer, 128, TASK)
+train_dataset = train_dataset.shuffle(128).batch(BATCH_SIZE).repeat(-1)
+valid_dataset = valid_dataset.batch(EVAL_BATCH_SIZE)
+
+# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule
+opt = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)
+if USE_AMP:
+    # loss scaling is currently required when using mixed precision
+    opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, "dynamic")
+
+
+if num_labels == 1:
+    loss = tf.keras.losses.MeanSquaredError()
+else:
+    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
+
+metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")
+model.compile(optimizer=opt, loss=loss, metrics=[metric])

 # Train and evaluate using tf.keras.Model.fit()
-history = model.fit(train_dataset, epochs=2, steps_per_epoch=115,
-                    validation_data=valid_dataset, validation_steps=7)
+train_steps = train_examples // BATCH_SIZE
+valid_steps = valid_examples // EVAL_BATCH_SIZE

-# Load the TensorFlow model in PyTorch for inspection
-model.save_pretrained('./save/')
-pytorch_model = BertForSequenceClassification.from_pretrained('./save/', from_tf=True)
+history = model.fit(
+    train_dataset,
+    epochs=EPOCHS,
+    steps_per_epoch=train_steps,
+    validation_data=valid_dataset,
+    validation_steps=valid_steps,
+)

-# Quickly test a few predictions - MRPC is a paraphrasing task, let's see if our model learned the task
-sentence_0 = "This research was consistent with his findings."
-sentence_1 = "His findings were compatible with this research."
-sentence_2 = "His findings were not compatible with this research."
-inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors='pt')
-inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors='pt')
+# Save TF2 model
+os.makedirs("./save/", exist_ok=True)
+model.save_pretrained("./save/")

-pred_1 = pytorch_model(**inputs_1)[0].argmax().item()
-pred_2 = pytorch_model(**inputs_2)[0].argmax().item()
-print("sentence_1 is", "a paraphrase" if pred_1 else "not a paraphrase", "of sentence_0")
-print("sentence_2 is", "a paraphrase" if pred_2 else "not a paraphrase", "of sentence_0")
+if TASK == "mrpc":
+    # Load the TensorFlow model in PyTorch for inspection
+    # This is to demo the interoperability between the two frameworks, you don't have to
+    # do this in real life (you can run the inference on the TF model).
+    pytorch_model = BertForSequenceClassification.from_pretrained("./save/", from_tf=True)
+
+    # Quickly test a few predictions - MRPC is a paraphrasing task, let's see if our model learned the task
+    sentence_0 = "This research was consistent with his findings."
+    sentence_1 = "His findings were compatible with this research."
+    sentence_2 = "His findings were not compatible with this research."
+    inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors="pt")
+    inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors="pt")
+
+    pred_1 = pytorch_model(**inputs_1)[0].argmax().item()
+    pred_2 = pytorch_model(**inputs_2)[0].argmax().item()
+    print("sentence_1 is", "a paraphrase" if pred_1 else "not a paraphrase", "of sentence_0")
+    print("sentence_2 is", "a paraphrase" if pred_2 else "not a paraphrase", "of sentence_0")
--- a/examples/run_tf_ner.py
+++ b/examples/run_tf_ner.py
@@ -0,0 +1,655 @@
+# coding=utf-8
+import collections
+import datetime
+import glob
+import math
+import os
+import re
+
+import numpy as np
+import tensorflow as tf
+from absl import app, flags, logging
+from seqeval import metrics
+
+from transformers import (
+    TF2_WEIGHTS_NAME,
+    BertConfig,
+    BertTokenizer,
+    DistilBertConfig,
+    DistilBertTokenizer,
+    GradientAccumulator,
+    RobertaConfig,
+    RobertaTokenizer,
+    TFBertForTokenClassification,
+    TFDistilBertForTokenClassification,
+    TFRobertaForTokenClassification,
+    create_optimizer,
+)
+from utils_ner import convert_examples_to_features, get_labels, read_examples_from_file
+
+
+try:
+    from fastprogress import master_bar, progress_bar
+except ImportError:
+    from fastprogress.fastprogress import master_bar, progress_bar
+
+
+ALL_MODELS = sum(
+    (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, RobertaConfig, DistilBertConfig)), ()
+)
+
+MODEL_CLASSES = {
+    "bert": (BertConfig, TFBertForTokenClassification, BertTokenizer),
+    "roberta": (RobertaConfig, TFRobertaForTokenClassification, RobertaTokenizer),
+    "distilbert": (DistilBertConfig, TFDistilBertForTokenClassification, DistilBertTokenizer),
+}
+
+
+flags.DEFINE_string(
+    "data_dir", None, "The input data dir. Should contain the .conll files (or other data files) " "for the task."
+)
+
+flags.DEFINE_string("model_type", None, "Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
+
+flags.DEFINE_string(
+    "model_name_or_path",
+    None,
+    "Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
+)
+
+flags.DEFINE_string("output_dir", None, "The output directory where the model checkpoints will be written.")
+
+flags.DEFINE_string(
+    "labels", "", "Path to a file containing all labels. If not specified, CoNLL-2003 labels are used."
+)
+
+flags.DEFINE_string("config_name", "", "Pretrained config name or path if not the same as model_name")
+
+flags.DEFINE_string("tokenizer_name", "", "Pretrained tokenizer name or path if not the same as model_name")
+
+flags.DEFINE_string("cache_dir", "", "Where do you want to store the pre-trained models downloaded from s3")
+
+flags.DEFINE_integer(
+    "max_seq_length",
+    128,
+    "The maximum total input sentence length after tokenization. "
+    "Sequences longer than this will be truncated, sequences shorter "
+    "will be padded.",
+)
+
+flags.DEFINE_string(
+    "tpu",
+    None,
+    "The Cloud TPU to use for training. This should be either the name "
+    "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 "
+    "url.",
+)
+
+flags.DEFINE_integer("num_tpu_cores", 8, "Total number of TPU cores to use.")
+
+flags.DEFINE_boolean("do_train", False, "Whether to run training.")
+
+flags.DEFINE_boolean("do_eval", False, "Whether to run eval on the dev set.")
+
+flags.DEFINE_boolean("do_predict", False, "Whether to run predictions on the test set.")
+
+flags.DEFINE_boolean(
+    "evaluate_during_training", False, "Whether to run evaluation during training at each logging step."
+)
+
+flags.DEFINE_boolean("do_lower_case", False, "Set this flag if you are using an uncased model.")
+
+flags.DEFINE_integer("per_device_train_batch_size", 8, "Batch size per GPU/CPU/TPU for training.")
+
+flags.DEFINE_integer("per_device_eval_batch_size", 8, "Batch size per GPU/CPU/TPU for evaluation.")
+
+flags.DEFINE_integer(
+    "gradient_accumulation_steps", 1, "Number of updates steps to accumulate before performing a backward/update pass."
+)
+
+flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.")
+
+flags.DEFINE_float("weight_decay", 0.0, "Weight decay if we apply some.")
+
+flags.DEFINE_float("adam_epsilon", 1e-8, "Epsilon for Adam optimizer.")
+
+flags.DEFINE_float("max_grad_norm", 1.0, "Max gradient norm.")
+
+flags.DEFINE_integer("num_train_epochs", 3, "Total number of training epochs to perform.")
+
+flags.DEFINE_integer(
+    "max_steps", -1, "If > 0: set total number of training steps to perform. Override num_train_epochs."
+)
+
+flags.DEFINE_integer("warmup_steps", 0, "Linear warmup over warmup_steps.")
+
+flags.DEFINE_integer("logging_steps", 50, "Log every X updates steps.")
+
+flags.DEFINE_integer("save_steps", 50, "Save checkpoint every X updates steps.")
+
+flags.DEFINE_boolean(
+    "eval_all_checkpoints",
+    False,
+    "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
+)
+
+flags.DEFINE_boolean("no_cuda", False, "Avoid using CUDA when available")
+
+flags.DEFINE_boolean("overwrite_output_dir", False, "Overwrite the content of the output directory")
+
+flags.DEFINE_boolean("overwrite_cache", False, "Overwrite the cached training and evaluation sets")
+
+flags.DEFINE_integer("seed", 42, "random seed for initialization")
+
+flags.DEFINE_boolean("fp16", False, "Whether to use 16-bit (mixed) precision instead of 32-bit")
+
+flags.DEFINE_string(
+    "gpus",
+    "0",
+    "Comma separated list of gpus devices. If only one, switch to single "
+    "gpu strategy, if None takes all the gpus available.",
+)
+
+
+def train(
+    args, strategy, train_dataset, tokenizer, model, num_train_examples, labels, train_batch_size, pad_token_label_id
+):
+    if args["max_steps"] > 0:
+        num_train_steps = args["max_steps"] * args["gradient_accumulation_steps"]
+        args["num_train_epochs"] = 1
+    else:
+        num_train_steps = (
+            math.ceil(num_train_examples / train_batch_size)
+            // args["gradient_accumulation_steps"]
+            * args["num_train_epochs"]
+        )
+
+    writer = tf.summary.create_file_writer("/tmp/mylogs")
+
+    with strategy.scope():
+        loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)
+        optimizer = create_optimizer(args["learning_rate"], num_train_steps, args["warmup_steps"])
+
+        if args["fp16"]:
+            optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, "dynamic")
+
+        loss_metric = tf.keras.metrics.Mean(name="loss", dtype=tf.float32)
+        gradient_accumulator = GradientAccumulator()
+
+    logging.info("***** Running training *****")
+    logging.info("  Num examples = %d", num_train_examples)
+    logging.info("  Num Epochs = %d", args["num_train_epochs"])
+    logging.info("  Instantaneous batch size per device = %d", args["per_device_train_batch_size"])
+    logging.info(
+        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
+        train_batch_size * args["gradient_accumulation_steps"],
+    )
+    logging.info("  Gradient Accumulation steps = %d", args["gradient_accumulation_steps"])
+    logging.info("  Total training steps = %d", num_train_steps)
+
+    model.summary()
+
+    @tf.function
+    def apply_gradients():
+        grads_and_vars = []
+
+        for gradient, variable in zip(gradient_accumulator.gradients, model.trainable_variables):
+            if gradient is not None:
+                scaled_gradient = gradient / (args["n_device"] * args["gradient_accumulation_steps"])
+                grads_and_vars.append((scaled_gradient, variable))
+            else:
+                grads_and_vars.append((gradient, variable))
+
+        optimizer.apply_gradients(grads_and_vars, args["max_grad_norm"])
+        gradient_accumulator.reset()
+
+    @tf.function
+    def train_step(train_features, train_labels):
+        def step_fn(train_features, train_labels):
+            inputs = {"attention_mask": train_features["input_mask"], "training": True}
+
+            if args["model_type"] != "distilbert":
+                inputs["token_type_ids"] = (
+                    train_features["segment_ids"] if args["model_type"] in ["bert", "xlnet"] else None
+                )
+
+            with tf.GradientTape() as tape:
+                logits = model(train_features["input_ids"], **inputs)[0]
+                logits = tf.reshape(logits, (-1, len(labels) + 1))
+                active_loss = tf.reshape(train_features["input_mask"], (-1,))
+                active_logits = tf.boolean_mask(logits, active_loss)
+                train_labels = tf.reshape(train_labels, (-1,))
+                active_labels = tf.boolean_mask(train_labels, active_loss)
+                cross_entropy = loss_fct(active_labels, active_logits)
+                loss = tf.reduce_sum(cross_entropy) * (1.0 / train_batch_size)
+                grads = tape.gradient(loss, model.trainable_variables)
+
+                gradient_accumulator(grads)
+
+            return cross_entropy
+
+        per_example_losses = strategy.experimental_run_v2(step_fn, args=(train_features, train_labels))
+        mean_loss = strategy.reduce(tf.distribute.ReduceOp.MEAN, per_example_losses, axis=0)
+
+        return mean_loss
+
+    current_time = datetime.datetime.now()
+    train_iterator = master_bar(range(args["num_train_epochs"]))
+    global_step = 0
+    logging_loss = 0.0
+
+    for epoch in train_iterator:
+        epoch_iterator = progress_bar(
+            train_dataset, total=num_train_steps, parent=train_iterator, display=args["n_device"] > 1
+        )
+        step = 1
+
+        with strategy.scope():
+            for train_features, train_labels in epoch_iterator:
+                loss = train_step(train_features, train_labels)
+
+                if step % args["gradient_accumulation_steps"] == 0:
+                    strategy.experimental_run_v2(apply_gradients)
+
+                    loss_metric(loss)
+
+                    global_step += 1
+
+                    if args["logging_steps"] > 0 and global_step % args["logging_steps"] == 0:
+                        # Log metrics
+                        if (
+                            args["n_device"] == 1 and args["evaluate_during_training"]
+                        ):  # Only evaluate when single GPU otherwise metrics may not average well
+                            y_true, y_pred, eval_loss = evaluate(
+                                args, strategy, model, tokenizer, labels, pad_token_label_id, mode="dev"
+                            )
+                            report = metrics.classification_report(y_true, y_pred, digits=4)
+
+                            logging.info("Eval at step " + str(global_step) + "\n" + report)
+                            logging.info("eval_loss: " + str(eval_loss))
+
+                            precision = metrics.precision_score(y_true, y_pred)
+                            recall = metrics.recall_score(y_true, y_pred)
+                            f1 = metrics.f1_score(y_true, y_pred)
+
+                            with writer.as_default():
+                                tf.summary.scalar("eval_loss", eval_loss, global_step)
+                                tf.summary.scalar("precision", precision, global_step)
+                                tf.summary.scalar("recall", recall, global_step)
+                                tf.summary.scalar("f1", f1, global_step)
+
+                        lr = optimizer.learning_rate
+                        learning_rate = lr(step)
+
+                        with writer.as_default():
+                            tf.summary.scalar("lr", learning_rate, global_step)
+                            tf.summary.scalar(
+                                "loss", (loss_metric.result() - logging_loss) / args["logging_steps"], global_step
+                            )
+
+                        logging_loss = loss_metric.result()
+
+                    with writer.as_default():
+                        tf.summary.scalar("loss", loss_metric.result(), step=step)
+
+                    if args["save_steps"] > 0 and global_step % args["save_steps"] == 0:
+                        # Save model checkpoint
+                        output_dir = os.path.join(args["output_dir"], "checkpoint-{}".format(global_step))
+
+                        if not os.path.exists(output_dir):
+                            os.makedirs(output_dir)
+
+                        model.save_pretrained(output_dir)
+                        logging.info("Saving model checkpoint to %s", output_dir)
+
+                train_iterator.child.comment = f"loss : {loss_metric.result()}"
+                step += 1
+
+        train_iterator.write(f"loss epoch {epoch + 1}: {loss_metric.result()}")
+
+        loss_metric.reset_states()
+
+    logging.info("  Training took time = {}".format(datetime.datetime.now() - current_time))
+
+
+def evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode):
+    eval_batch_size = args["per_device_eval_batch_size"] * args["n_device"]
+    eval_dataset, size = load_and_cache_examples(
+        args, tokenizer, labels, pad_token_label_id, eval_batch_size, mode=mode
+    )
+    eval_dataset = strategy.experimental_distribute_dataset(eval_dataset)
+    preds = None
+    num_eval_steps = math.ceil(size / eval_batch_size)
+    master = master_bar(range(1))
+    eval_iterator = progress_bar(eval_dataset, total=num_eval_steps, parent=master, display=args["n_device"] > 1)
+    loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)
+    loss = 0.0
+
+    logging.info("***** Running evaluation *****")
+    logging.info("  Num examples = %d", size)
+    logging.info("  Batch size = %d", eval_batch_size)
+
+    for eval_features, eval_labels in eval_iterator:
+        inputs = {"attention_mask": eval_features["input_mask"], "training": False}
+
+        if args["model_type"] != "distilbert":
+            inputs["token_type_ids"] = (
+                eval_features["segment_ids"] if args["model_type"] in ["bert", "xlnet"] else None
+            )
+
+        with strategy.scope():
+            logits = model(eval_features["input_ids"], **inputs)[0]
+            tmp_logits = tf.reshape(logits, (-1, len(labels) + 1))
+            active_loss = tf.reshape(eval_features["input_mask"], (-1,))
+            active_logits = tf.boolean_mask(tmp_logits, active_loss)
+            tmp_eval_labels = tf.reshape(eval_labels, (-1,))
+            active_labels = tf.boolean_mask(tmp_eval_labels, active_loss)
+            cross_entropy = loss_fct(active_labels, active_logits)
+            loss += tf.reduce_sum(cross_entropy) * (1.0 / eval_batch_size)
+
+        if preds is None:
+            preds = logits.numpy()
+            label_ids = eval_labels.numpy()
+        else:
+            preds = np.append(preds, logits.numpy(), axis=0)
+            label_ids = np.append(label_ids, eval_labels.numpy(), axis=0)
+
+    preds = np.argmax(preds, axis=2)
+    y_pred = [[] for _ in range(label_ids.shape[0])]
+    y_true = [[] for _ in range(label_ids.shape[0])]
+    loss = loss / num_eval_steps
+
+    for i in range(label_ids.shape[0]):
+        for j in range(label_ids.shape[1]):
+            if label_ids[i, j] != pad_token_label_id:
+                y_pred[i].append(labels[preds[i, j] - 1])
+                y_true[i].append(labels[label_ids[i, j] - 1])
+
+    return y_true, y_pred, loss.numpy()
+
+
+def load_cache(cached_file, max_seq_length):
+    name_to_features = {
+        "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
+        "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64),
+        "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
+        "label_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
+    }
+
+    def _decode_record(record):
+        example = tf.io.parse_single_example(record, name_to_features)
+        features = {}
+        features["input_ids"] = example["input_ids"]
+        features["input_mask"] = example["input_mask"]
+        features["segment_ids"] = example["segment_ids"]
+
+        return features, example["label_ids"]
+
+    d = tf.data.TFRecordDataset(cached_file)
+    d = d.map(_decode_record, num_parallel_calls=4)
+    count = d.reduce(0, lambda x, _: x + 1)
+
+    return d, count.numpy()
+
+
+def save_cache(features, cached_features_file):
+    writer = tf.io.TFRecordWriter(cached_features_file)
+
+    for (ex_index, feature) in enumerate(features):
+        if ex_index % 5000 == 0:
+            logging.info("Writing example %d of %d" % (ex_index, len(features)))
+
+        def create_int_feature(values):
+            f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
+            return f
+
+        record_feature = collections.OrderedDict()
+        record_feature["input_ids"] = create_int_feature(feature.input_ids)
+        record_feature["input_mask"] = create_int_feature(feature.input_mask)
+        record_feature["segment_ids"] = create_int_feature(feature.segment_ids)
+        record_feature["label_ids"] = create_int_feature(feature.label_ids)
+
+        tf_example = tf.train.Example(features=tf.train.Features(feature=record_feature))
+
+        writer.write(tf_example.SerializeToString())
+
+    writer.close()
+
+
+def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, batch_size, mode):
+    drop_remainder = True if args["tpu"] or mode == "train" else False
+
+    # Load data features from cache or dataset file
+    cached_features_file = os.path.join(
+        args["data_dir"],
+        "cached_{}_{}_{}.tf_record".format(
+            mode, list(filter(None, args["model_name_or_path"].split("/"))).pop(), str(args["max_seq_length"])
+        ),
+    )
+    if os.path.exists(cached_features_file) and not args["overwrite_cache"]:
+        logging.info("Loading features from cached file %s", cached_features_file)
+        dataset, size = load_cache(cached_features_file, args["max_seq_length"])
+    else:
+        logging.info("Creating features from dataset file at %s", args["data_dir"])
+        examples = read_examples_from_file(args["data_dir"], mode)
+        features = convert_examples_to_features(
+            examples,
+            labels,
+            args["max_seq_length"],
+            tokenizer,
+            cls_token_at_end=bool(args["model_type"] in ["xlnet"]),
+            # xlnet has a cls token at the end
+            cls_token=tokenizer.cls_token,
+            cls_token_segment_id=2 if args["model_type"] in ["xlnet"] else 0,
+            sep_token=tokenizer.sep_token,
+            sep_token_extra=bool(args["model_type"] in ["roberta"]),
+            # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
+            pad_on_left=bool(args["model_type"] in ["xlnet"]),
+            # pad on the left for xlnet
+            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
+            pad_token_segment_id=4 if args["model_type"] in ["xlnet"] else 0,
+            pad_token_label_id=pad_token_label_id,
+        )
+        logging.info("Saving features into cached file %s", cached_features_file)
+        save_cache(features, cached_features_file)
+        dataset, size = load_cache(cached_features_file, args["max_seq_length"])
+
+    if mode == "train":
+        dataset = dataset.repeat()
+        dataset = dataset.shuffle(buffer_size=8192, seed=args["seed"])
+
+    dataset = dataset.batch(batch_size, drop_remainder)
+    dataset = dataset.prefetch(buffer_size=batch_size)
+
+    return dataset, size
+
+
+def main(_):
+    logging.set_verbosity(logging.INFO)
+    args = flags.FLAGS.flag_values_dict()
+
+    if (
+        os.path.exists(args["output_dir"])
+        and os.listdir(args["output_dir"])
+        and args["do_train"]
+        and not args["overwrite_output_dir"]
+    ):
+        raise ValueError(
+            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
+                args["output_dir"]
+            )
+        )
+
+    if args["fp16"]:
+        tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})
+
+    if args["tpu"]:
+        resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=args["tpu"])
+        tf.config.experimental_connect_to_cluster(resolver)
+        tf.tpu.experimental.initialize_tpu_system(resolver)
+        strategy = tf.distribute.experimental.TPUStrategy(resolver)
+        args["n_device"] = args["num_tpu_cores"]
+    elif len(args["gpus"].split(",")) > 1:
+        args["n_device"] = len([f"/gpu:{gpu}" for gpu in args["gpus"].split(",")])
+        strategy = tf.distribute.MirroredStrategy(devices=[f"/gpu:{gpu}" for gpu in args["gpus"].split(",")])
+    elif args["no_cuda"]:
+        args["n_device"] = 1
+        strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0")
+    else:
+        args["n_device"] = len(args["gpus"].split(","))
+        strategy = tf.distribute.OneDeviceStrategy(device="/gpu:" + args["gpus"].split(",")[0])
+
+    logging.warning(
+        "n_device: %s, distributed training: %s, 16-bits training: %s",
+        args["n_device"],
+        bool(args["n_device"] > 1),
+        args["fp16"],
+    )
+
+    labels = get_labels(args["labels"])
+    num_labels = len(labels) + 1
+    pad_token_label_id = 0
+    config_class, model_class, tokenizer_class = MODEL_CLASSES[args["model_type"]]
+    config = config_class.from_pretrained(
+        args["config_name"] if args["config_name"] else args["model_name_or_path"],
+        num_labels=num_labels,
+        cache_dir=args["cache_dir"] if args["cache_dir"] else None,
+    )
+
+    logging.info("Training/evaluation parameters %s", args)
+
+    # Training
+    if args["do_train"]:
+        tokenizer = tokenizer_class.from_pretrained(
+            args["tokenizer_name"] if args["tokenizer_name"] else args["model_name_or_path"],
+            do_lower_case=args["do_lower_case"],
+            cache_dir=args["cache_dir"] if args["cache_dir"] else None,
+        )
+
+        with strategy.scope():
+            model = model_class.from_pretrained(
+                args["model_name_or_path"],
+                from_pt=bool(".bin" in args["model_name_or_path"]),
+                config=config,
+                cache_dir=args["cache_dir"] if args["cache_dir"] else None,
+            )
+            model.layers[-1].activation = tf.keras.activations.softmax
+
+        train_batch_size = args["per_device_train_batch_size"] * args["n_device"]
+        train_dataset, num_train_examples = load_and_cache_examples(
+            args, tokenizer, labels, pad_token_label_id, train_batch_size, mode="train"
+        )
+        train_dataset = strategy.experimental_distribute_dataset(train_dataset)
+        train(
+            args,
+            strategy,
+            train_dataset,
+            tokenizer,
+            model,
+            num_train_examples,
+            labels,
+            train_batch_size,
+            pad_token_label_id,
+        )
+
+        if not os.path.exists(args["output_dir"]):
+            os.makedirs(args["output_dir"])
+
+        logging.info("Saving model to %s", args["output_dir"])
+
+        model.save_pretrained(args["output_dir"])
+        tokenizer.save_pretrained(args["output_dir"])
+
+    # Evaluation
+    if args["do_eval"]:
+        tokenizer = tokenizer_class.from_pretrained(args["output_dir"], do_lower_case=args["do_lower_case"])
+        checkpoints = []
+        results = []
+
+        if args["eval_all_checkpoints"]:
+            checkpoints = list(
+                os.path.dirname(c)
+                for c in sorted(
+                    glob.glob(args["output_dir"] + "/**/" + TF2_WEIGHTS_NAME, recursive=True),
+                    key=lambda f: int("".join(filter(str.isdigit, f)) or -1),
+                )
+            )
+
+        logging.info("Evaluate the following checkpoints: %s", checkpoints)
+
+        if len(checkpoints) == 0:
+            checkpoints.append(args["output_dir"])
+
+        for checkpoint in checkpoints:
+            global_step = checkpoint.split("-")[-1] if re.match(".*checkpoint-[0-9]", checkpoint) else "final"
+
+            with strategy.scope():
+                model = model_class.from_pretrained(checkpoint)
+
+            y_true, y_pred, eval_loss = evaluate(
+                args, strategy, model, tokenizer, labels, pad_token_label_id, mode="dev"
+            )
+            report = metrics.classification_report(y_true, y_pred, digits=4)
+
+            if global_step:
+                results.append({global_step + "_report": report, global_step + "_loss": eval_loss})
+
+        output_eval_file = os.path.join(args["output_dir"], "eval_results.txt")
+
+        with tf.io.gfile.GFile(output_eval_file, "w") as writer:
+            for res in results:
+                for key, val in res.items():
+                    if "loss" in key:
+                        logging.info(key + " = " + str(val))
+                        writer.write(key + " = " + str(val))
+                        writer.write("\n")
+                    else:
+                        logging.info(key)
+                        logging.info("\n" + report)
+                        writer.write(key + "\n")
+                        writer.write(report)
+                        writer.write("\n")
+
+    if args["do_predict"]:
+        tokenizer = tokenizer_class.from_pretrained(args["output_dir"], do_lower_case=args["do_lower_case"])
+        model = model_class.from_pretrained(args["output_dir"])
+        eval_batch_size = args["per_device_eval_batch_size"] * args["n_device"]
+        predict_dataset, _ = load_and_cache_examples(
+            args, tokenizer, labels, pad_token_label_id, eval_batch_size, mode="test"
+        )
+        y_true, y_pred, pred_loss = evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode="test")
+        output_test_results_file = os.path.join(args["output_dir"], "test_results.txt")
+        output_test_predictions_file = os.path.join(args["output_dir"], "test_predictions.txt")
+        report = metrics.classification_report(y_true, y_pred, digits=4)
+
+        with tf.io.gfile.GFile(output_test_results_file, "w") as writer:
+            report = metrics.classification_report(y_true, y_pred, digits=4)
+
+            logging.info("\n" + report)
+
+            writer.write(report)
+            writer.write("\n\nloss = " + str(pred_loss))
+
+        with tf.io.gfile.GFile(output_test_predictions_file, "w") as writer:
+            with tf.io.gfile.GFile(os.path.join(args["data_dir"], "test.txt"), "r") as f:
+                example_id = 0
+
+                for line in f:
+                    if line.startswith("-DOCSTART-") or line == "" or line == "\n":
+                        writer.write(line)
+
+                        if not y_pred[example_id]:
+                            example_id += 1
+                    elif y_pred[example_id]:
+                        output_line = line.split()[0] + " " + y_pred[example_id].pop(0) + "\n"
+                        writer.write(output_line)
+                    else:
+                        logging.warning("Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0])
+
+
+if __name__ == "__main__":
+    flags.mark_flag_as_required("data_dir")
+    flags.mark_flag_as_required("output_dir")
+    flags.mark_flag_as_required("model_name_or_path")
+    flags.mark_flag_as_required("model_type")
+    app.run(main)
--- a/examples/run_xnli.py
+++ b/examples/run_xnli.py
@@ -0,0 +1,653 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Finetuning multi-lingual models on XNLI (Bert, DistilBERT, XLM).
+    Adapted from `examples/run_glue.py`"""
+
+
+import argparse
+import glob
+import logging
+import os
+import random
+
+import numpy as np
+import torch
+from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
+from torch.utils.data.distributed import DistributedSampler
+from tqdm import tqdm, trange
+
+from transformers import (
+    WEIGHTS_NAME,
+    AdamW,
+    BertConfig,
+    BertForSequenceClassification,
+    BertTokenizer,
+    DistilBertConfig,
+    DistilBertForSequenceClassification,
+    DistilBertTokenizer,
+    XLMConfig,
+    XLMForSequenceClassification,
+    XLMTokenizer,
+    get_linear_schedule_with_warmup,
+)
+from transformers import glue_convert_examples_to_features as convert_examples_to_features
+from transformers import xnli_compute_metrics as compute_metrics
+from transformers import xnli_output_modes as output_modes
+from transformers import xnli_processors as processors
+
+
+try:
+    from torch.utils.tensorboard import SummaryWriter
+except ImportError:
+    from tensorboardX import SummaryWriter
+
+
+logger = logging.getLogger(__name__)
+
+ALL_MODELS = sum(
+    (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, DistilBertConfig, XLMConfig)), ()
+)
+
+MODEL_CLASSES = {
+    "bert": (BertConfig, BertForSequenceClassification, BertTokenizer),
+    "xlm": (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
+    "distilbert": (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer),
+}
+
+
+def set_seed(args):
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+    if args.n_gpu > 0:
+        torch.cuda.manual_seed_all(args.seed)
+
+
+def train(args, train_dataset, model, tokenizer):
+    """ Train the model """
+    if args.local_rank in [-1, 0]:
+        tb_writer = SummaryWriter()
+
+    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
+    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
+    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
+
+    if args.max_steps > 0:
+        t_total = args.max_steps
+        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
+    else:
+        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
+
+    # Prepare optimizer and schedule (linear warmup and decay)
+    no_decay = ["bias", "LayerNorm.weight"]
+    optimizer_grouped_parameters = [
+        {
+            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+            "weight_decay": args.weight_decay,
+        },
+        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
+    ]
+    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
+    scheduler = get_linear_schedule_with_warmup(
+        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
+    )
+
+    # Check if saved optimizer or scheduler states exist
+    if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
+        os.path.join(args.model_name_or_path, "scheduler.pt")
+    ):
+        # Load in optimizer and scheduler states
+        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
+        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))
+
+    if args.fp16:
+        try:
+            from apex import amp
+        except ImportError:
+            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
+        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
+
+    # multi-gpu training (should be after apex fp16 initialization)
+    if args.n_gpu > 1:
+        model = torch.nn.DataParallel(model)
+
+    # Distributed training (should be after apex fp16 initialization)
+    if args.local_rank != -1:
+        model = torch.nn.parallel.DistributedDataParallel(
+            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
+        )
+
+    # Train!
+    logger.info("***** Running training *****")
+    logger.info("  Num examples = %d", len(train_dataset))
+    logger.info("  Num Epochs = %d", args.num_train_epochs)
+    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
+    logger.info(
+        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
+        args.train_batch_size
+        * args.gradient_accumulation_steps
+        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
+    )
+    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
+    logger.info("  Total optimization steps = %d", t_total)
+
+    global_step = 0
+    epochs_trained = 0
+    steps_trained_in_current_epoch = 0
+    # Check if continuing training from a checkpoint
+    if os.path.exists(args.model_name_or_path):
+        # set global_step to gobal_step of last saved checkpoint from model path
+        global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
+        epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
+        steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
+
+        logger.info("  Continuing training from checkpoint, will skip to saved global_step")
+        logger.info("  Continuing training from epoch %d", epochs_trained)
+        logger.info("  Continuing training from global step %d", global_step)
+        logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
+
+    tr_loss, logging_loss = 0.0, 0.0
+    model.zero_grad()
+    train_iterator = trange(
+        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
+    )
+    set_seed(args)  # Added here for reproductibility
+    for _ in train_iterator:
+        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
+        for step, batch in enumerate(epoch_iterator):
+            # Skip past any already trained steps if resuming training
+            if steps_trained_in_current_epoch > 0:
+                steps_trained_in_current_epoch -= 1
+                continue
+
+            model.train()
+            batch = tuple(t.to(args.device) for t in batch)
+            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
+            if args.model_type != "distilbert":
+                inputs["token_type_ids"] = (
+                    batch[2] if args.model_type in ["bert"] else None
+                )  # XLM and DistilBERT don't use segment_ids
+            outputs = model(**inputs)
+            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
+
+            if args.n_gpu > 1:
+                loss = loss.mean()  # mean() to average on multi-gpu parallel training
+            if args.gradient_accumulation_steps > 1:
+                loss = loss / args.gradient_accumulation_steps
+
+            if args.fp16:
+                with amp.scale_loss(loss, optimizer) as scaled_loss:
+                    scaled_loss.backward()
+            else:
+                loss.backward()
+
+            tr_loss += loss.item()
+            if (step + 1) % args.gradient_accumulation_steps == 0:
+                if args.fp16:
+                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
+                else:
+                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+
+                optimizer.step()
+                scheduler.step()  # Update learning rate schedule
+                model.zero_grad()
+                global_step += 1
+
+                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
+                    # Log metrics
+                    if (
+                        args.local_rank == -1 and args.evaluate_during_training
+                    ):  # Only evaluate when single GPU otherwise metrics may not average well
+                        results = evaluate(args, model, tokenizer)
+                        for key, value in results.items():
+                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
+                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
+                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
+                    logging_loss = tr_loss
+
+                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
+                    # Save model checkpoint
+                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
+                    if not os.path.exists(output_dir):
+                        os.makedirs(output_dir)
+                    model_to_save = (
+                        model.module if hasattr(model, "module") else model
+                    )  # Take care of distributed/parallel training
+                    model_to_save.save_pretrained(output_dir)
+                    tokenizer.save_pretrained(output_dir)
+
+                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
+                    logger.info("Saving model checkpoint to %s", output_dir)
+
+                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
+                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
+                    logger.info("Saving optimizer and scheduler states to %s", output_dir)
+
+            if args.max_steps > 0 and global_step > args.max_steps:
+                epoch_iterator.close()
+                break
+        if args.max_steps > 0 and global_step > args.max_steps:
+            train_iterator.close()
+            break
+
+    if args.local_rank in [-1, 0]:
+        tb_writer.close()
+
+    return global_step, tr_loss / global_step
+
+
+def evaluate(args, model, tokenizer, prefix=""):
+    eval_task_names = (args.task_name,)
+    eval_outputs_dirs = (args.output_dir,)
+
+    results = {}
+    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
+        eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)
+
+        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
+            os.makedirs(eval_output_dir)
+
+        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
+        # Note that DistributedSampler samples randomly
+        eval_sampler = SequentialSampler(eval_dataset)
+        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
+
+        # multi-gpu eval
+        if args.n_gpu > 1:
+            model = torch.nn.DataParallel(model)
+
+        # Eval!
+        logger.info("***** Running evaluation {} *****".format(prefix))
+        logger.info("  Num examples = %d", len(eval_dataset))
+        logger.info("  Batch size = %d", args.eval_batch_size)
+        eval_loss = 0.0
+        nb_eval_steps = 0
+        preds = None
+        out_label_ids = None
+        for batch in tqdm(eval_dataloader, desc="Evaluating"):
+            model.eval()
+            batch = tuple(t.to(args.device) for t in batch)
+
+            with torch.no_grad():
+                inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
+                if args.model_type != "distilbert":
+                    inputs["token_type_ids"] = (
+                        batch[2] if args.model_type in ["bert"] else None
+                    )  # XLM and DistilBERT don't use segment_ids
+                outputs = model(**inputs)
+                tmp_eval_loss, logits = outputs[:2]
+
+                eval_loss += tmp_eval_loss.mean().item()
+            nb_eval_steps += 1
+            if preds is None:
+                preds = logits.detach().cpu().numpy()
+                out_label_ids = inputs["labels"].detach().cpu().numpy()
+            else:
+                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
+                out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)
+
+        eval_loss = eval_loss / nb_eval_steps
+        if args.output_mode == "classification":
+            preds = np.argmax(preds, axis=1)
+        else:
+            raise ValueError("No other `output_mode` for XNLI.")
+        result = compute_metrics(eval_task, preds, out_label_ids)
+        results.update(result)
+
+        output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
+        with open(output_eval_file, "w") as writer:
+            logger.info("***** Eval results {} *****".format(prefix))
+            for key in sorted(result.keys()):
+                logger.info("  %s = %s", key, str(result[key]))
+                writer.write("%s = %s\n" % (key, str(result[key])))
+
+    return results
+
+
+def load_and_cache_examples(args, task, tokenizer, evaluate=False):
+    if args.local_rank not in [-1, 0] and not evaluate:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
+
+    processor = processors[task](language=args.language, train_language=args.train_language)
+    output_mode = output_modes[task]
+    # Load data features from cache or dataset file
+    cached_features_file = os.path.join(
+        args.data_dir,
+        "cached_{}_{}_{}_{}_{}".format(
+            "test" if evaluate else "train",
+            list(filter(None, args.model_name_or_path.split("/"))).pop(),
+            str(args.max_seq_length),
+            str(task),
+            str(args.train_language if (not evaluate and args.train_language is not None) else args.language),
+        ),
+    )
+    if os.path.exists(cached_features_file) and not args.overwrite_cache:
+        logger.info("Loading features from cached file %s", cached_features_file)
+        features = torch.load(cached_features_file)
+    else:
+        logger.info("Creating features from dataset file at %s", args.data_dir)
+        label_list = processor.get_labels()
+        examples = (
+            processor.get_test_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
+        )
+        features = convert_examples_to_features(
+            examples,
+            tokenizer,
+            label_list=label_list,
+            max_length=args.max_seq_length,
+            output_mode=output_mode,
+            pad_on_left=False,
+            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
+            pad_token_segment_id=0,
+        )
+        if args.local_rank in [-1, 0]:
+            logger.info("Saving features into cached file %s", cached_features_file)
+            torch.save(features, cached_features_file)
+
+    if args.local_rank == 0 and not evaluate:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
+
+    # Convert to Tensors and build dataset
+    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
+    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
+    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
+    if output_mode == "classification":
+        all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
+    else:
+        raise ValueError("No other `output_mode` for XNLI.")
+
+    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
+    return dataset
+
+
+def main():
+    parser = argparse.ArgumentParser()
+
+    # Required parameters
+    parser.add_argument(
+        "--data_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The input data dir. Should contain the .tsv files (or other data files) for the task.",
+    )
+    parser.add_argument(
+        "--model_type",
+        default=None,
+        type=str,
+        required=True,
+        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
+    )
+    parser.add_argument(
+        "--model_name_or_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
+    )
+    parser.add_argument(
+        "--language",
+        default=None,
+        type=str,
+        required=True,
+        help="Evaluation language. Also train language if `train_language` is set to None.",
+    )
+    parser.add_argument(
+        "--train_language", default=None, type=str, help="Train language if is different of the evaluation language."
+    )
+    parser.add_argument(
+        "--output_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+
+    # Other parameters
+    parser.add_argument(
+        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        default="",
+        type=str,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        default="",
+        type=str,
+        help="Where do you want to store the pre-trained models downloaded from s3",
+    )
+    parser.add_argument(
+        "--max_seq_length",
+        default=128,
+        type=int,
+        help="The maximum total input sequence length after tokenization. Sequences longer "
+        "than this will be truncated, sequences shorter will be padded.",
+    )
+    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
+    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the test set.")
+    parser.add_argument(
+        "--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step."
+    )
+    parser.add_argument(
+        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
+    )
+
+    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
+    parser.add_argument(
+        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
+    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
+    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument(
+        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
+    )
+    parser.add_argument(
+        "--max_steps",
+        default=-1,
+        type=int,
+        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
+    )
+    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
+
+    parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.")
+    parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.")
+    parser.add_argument(
+        "--eval_all_checkpoints",
+        action="store_true",
+        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
+    )
+    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
+    parser.add_argument(
+        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
+    )
+    parser.add_argument(
+        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
+    )
+    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
+
+    parser.add_argument(
+        "--fp16",
+        action="store_true",
+        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
+    )
+    parser.add_argument(
+        "--fp16_opt_level",
+        type=str,
+        default="O1",
+        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
+        "See details at https://nvidia.github.io/apex/amp.html",
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
+    parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
+    args = parser.parse_args()
+
+    if (
+        os.path.exists(args.output_dir)
+        and os.listdir(args.output_dir)
+        and args.do_train
+        and not args.overwrite_output_dir
+    ):
+        raise ValueError(
+            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
+                args.output_dir
+            )
+        )
+
+    # Setup distant debugging if needed
+    if args.server_ip and args.server_port:
+        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
+        import ptvsd
+
+        print("Waiting for debugger attach")
+        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
+        ptvsd.wait_for_attach()
+
+    # Setup CUDA, GPU & distributed training
+    if args.local_rank == -1 or args.no_cuda:
+        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+        args.n_gpu = torch.cuda.device_count()
+    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
+        torch.cuda.set_device(args.local_rank)
+        device = torch.device("cuda", args.local_rank)
+        torch.distributed.init_process_group(backend="nccl")
+        args.n_gpu = 1
+    args.device = device
+
+    # Setup logging
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
+    )
+    logger.warning(
+        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+        args.local_rank,
+        device,
+        args.n_gpu,
+        bool(args.local_rank != -1),
+        args.fp16,
+    )
+
+    # Set seed
+    set_seed(args)
+
+    # Prepare XNLI task
+    args.task_name = "xnli"
+    if args.task_name not in processors:
+        raise ValueError("Task not found: %s" % (args.task_name))
+    processor = processors[args.task_name](language=args.language, train_language=args.train_language)
+    args.output_mode = output_modes[args.task_name]
+    label_list = processor.get_labels()
+    num_labels = len(label_list)
+
+    # Load pretrained model and tokenizer
+    if args.local_rank not in [-1, 0]:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
+
+    args.model_type = args.model_type.lower()
+    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
+    config = config_class.from_pretrained(
+        args.config_name if args.config_name else args.model_name_or_path,
+        num_labels=num_labels,
+        finetuning_task=args.task_name,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
+    tokenizer = tokenizer_class.from_pretrained(
+        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
+        do_lower_case=args.do_lower_case,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
+    model = model_class.from_pretrained(
+        args.model_name_or_path,
+        from_tf=bool(".ckpt" in args.model_name_or_path),
+        config=config,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
+
+    if args.local_rank == 0:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
+
+    model.to(args.device)
+
+    logger.info("Training/evaluation parameters %s", args)
+
+    # Training
+    if args.do_train:
+        train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
+        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
+        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
+
+    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
+    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
+        # Create output directory if needed
+        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
+            os.makedirs(args.output_dir)
+
+        logger.info("Saving model checkpoint to %s", args.output_dir)
+        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
+        # They can then be reloaded using `from_pretrained()`
+        model_to_save = (
+            model.module if hasattr(model, "module") else model
+        )  # Take care of distributed/parallel training
+        model_to_save.save_pretrained(args.output_dir)
+        tokenizer.save_pretrained(args.output_dir)
+
+        # Good practice: save your training arguments together with the trained model
+        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
+
+        # Load a trained model and vocabulary that you have fine-tuned
+        model = model_class.from_pretrained(args.output_dir)
+        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
+        model.to(args.device)
+
+    # Evaluation
+    results = {}
+    if args.do_eval and args.local_rank in [-1, 0]:
+        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
+        checkpoints = [args.output_dir]
+        if args.eval_all_checkpoints:
+            checkpoints = list(
+                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
+            )
+            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
+        logger.info("Evaluate the following checkpoints: %s", checkpoints)
+        for checkpoint in checkpoints:
+            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
+            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""
+
+            model = model_class.from_pretrained(checkpoint)
+            model.to(args.device)
+            result = evaluate(args, model, tokenizer, prefix=prefix)
+            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
+            results.update(result)
+
+    return results
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/summarization/README.md
+++ b/examples/summarization/README.md
@@ -0,0 +1,61 @@
+# Text Summarization with Pretrained Encoders
+
+This folder contains part of the code necessary to reproduce the results on abstractive summarization from the article [Text Summarization with Pretrained Encoders](https://arxiv.org/pdf/1908.08345.pdf) by [Yang Liu](https://nlp-yang.github.io/) and [Mirella Lapata](https://homepages.inf.ed.ac.uk/mlap/). It can also be used to summarize any document.
+
+The original code can be found on the Yang Liu's [github repository](https://github.com/nlpyang/PreSumm).
+
+The model is loaded with the pre-trained weights for the abstractive summarization model trained on the CNN/Daily Mail dataset with an extractive and then abstractive tasks.
+
+## Setup
+
+```
+git clone https://github.com/huggingface/transformers && cd transformers
+pip install .
+pip install nltk py-rouge
+cd examples/summarization
+```
+
+## Reproduce the authors' results on ROUGE
+
+To be able to reproduce the authors' results on the CNN/Daily Mail dataset you first need to download both CNN and Daily Mail datasets [from Kyunghyun Cho's website](https://cs.nyu.edu/~kcho/DMQA/) (the links next to "Stories") in the same folder. Then uncompress the archives by running:
+
+```bash
+tar -xvf cnn_stories.tgz && tar -xvf dailymail_stories.tgz
+```
+
+And move all the stories to the same folder. We will refer as `$DATA_PATH` the path to where you uncompressed both archive. Then run the following in the same folder as `run_summarization.py`:
+
+```bash
+python run_summarization.py \
+    --documents_dir $DATA_PATH \
+    --summaries_output_dir $SUMMARIES_PATH \ # optional
+    --no_cuda false \
+    --batch_size 4 \
+    --min_length 50 \
+    --max_length 200 \
+    --beam_size 5 \
+    --alpha 0.95 \
+    --block_trigram true \
+    --compute_rouge true
+```
+
+The scripts executes on GPU if one is available and if `no_cuda` is not set to `true`. Inference on multiple GPUs is not suported yet. The ROUGE scores will be displayed in the console at the end of evaluation and written in a `rouge_scores.txt` file. The script takes 30 hours to compute with a single Tesla V100 GPU and a batch size of 10 (300,000 texts to summarize).
+
+## Summarize any text
+
+Put the documents that you would like to summarize in a folder (the path to which is referred to as `$DATA_PATH` below) and run the following in the same folder as `run_summarization.py`:
+
+```bash
+python run_summarization.py \
+    --documents_dir $DATA_PATH \
+    --summaries_output_dir $SUMMARIES_PATH \ # optional
+    --no_cuda false \
+    --batch_size 4 \
+    --min_length 50 \
+    --max_length 200 \
+    --beam_size 5 \
+    --alpha 0.95 \
+    --block_trigram true \
+```
+
+You may want to play around with `min_length`, `max_length` and `alpha` to suit your use case. If you want to compute ROUGE on another dataset you will need to tweak the stories/summaries import in `utils_summarization.py` and tell it where to fetch the reference summaries.
--- a/examples/summarization/configuration_bertabs.py
+++ b/examples/summarization/configuration_bertabs.py
@@ -0,0 +1,98 @@
+# coding=utf-8
+# Copyright 2019 The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" BertAbs configuration """
+import logging
+
+from transformers import PretrainedConfig
+
+
+logger = logging.getLogger(__name__)
+
+
+BERTABS_FINETUNED_CONFIG_MAP = {
+    "bertabs-finetuned-cnndm": "https://s3.amazonaws.com/models.huggingface.co/bert/remi/bertabs-finetuned-cnndm-extractive-abstractive-summarization-config.json",
+}
+
+
+class BertAbsConfig(PretrainedConfig):
+    r""" Class to store the configuration of the BertAbs model.
+
+    Arguments:
+        vocab_size: int
+            Number of tokens in the vocabulary.
+        max_pos: int
+            The maximum sequence length that this model will be used with.
+        enc_layer: int
+            The numner of hidden layers in the Transformer encoder.
+        enc_hidden_size: int
+            The size of the encoder's layers.
+        enc_heads: int
+            The number of attention heads for each attention layer in the encoder.
+        enc_ff_size: int
+            The size of the encoder's feed-forward layers.
+        enc_dropout: int
+            The dropout probabilitiy for all fully connected layers in the
+            embeddings, layers, pooler and also the attention probabilities in
+            the encoder.
+        dec_layer: int
+            The numner of hidden layers in the decoder.
+        dec_hidden_size: int
+            The size of the decoder's layers.
+        dec_heads: int
+            The number of attention heads for each attention layer in the decoder.
+        dec_ff_size: int
+            The size of the decoder's feed-forward layers.
+        dec_dropout: int
+            The dropout probabilitiy for all fully connected layers in the
+            embeddings, layers, pooler and also the attention probabilities in
+            the decoder.
+    """
+
+    pretrained_config_archive_map = BERTABS_FINETUNED_CONFIG_MAP
+    model_type = "bertabs"
+
+    def __init__(
+        self,
+        vocab_size=30522,
+        max_pos=512,
+        enc_layers=6,
+        enc_hidden_size=512,
+        enc_heads=8,
+        enc_ff_size=512,
+        enc_dropout=0.2,
+        dec_layers=6,
+        dec_hidden_size=768,
+        dec_heads=8,
+        dec_ff_size=2048,
+        dec_dropout=0.2,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.vocab_size = vocab_size
+        self.max_pos = max_pos
+
+        self.enc_layers = enc_layers
+        self.enc_hidden_size = enc_hidden_size
+        self.enc_heads = enc_heads
+        self.enc_ff_size = enc_ff_size
+        self.enc_dropout = enc_dropout
+
+        self.dec_layers = dec_layers
+        self.dec_hidden_size = dec_hidden_size
+        self.dec_heads = dec_heads
+        self.dec_ff_size = dec_ff_size
+        self.dec_dropout = dec_dropout
--- a/examples/summarization/convert_bertabs_original_pytorch_checkpoint.py
+++ b/examples/summarization/convert_bertabs_original_pytorch_checkpoint.py
@@ -0,0 +1,176 @@
+# coding=utf-8
+# Copyright 2018 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Convert BertExtAbs's checkpoints.
+
+The script looks like it is doing something trivial but it is not. The "weights"
+proposed by the authors are actually the entire model pickled. We need to load
+the model within the original codebase to be able to only save its `state_dict`.
+"""
+
+import argparse
+import logging
+from collections import namedtuple
+
+import torch
+
+from model_bertabs import BertAbsSummarizer
+from models.model_builder import AbsSummarizer  # The authors' implementation
+from transformers import BertTokenizer
+
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+SAMPLE_TEXT = "Hello world! cécé herlolip"
+
+
+BertAbsConfig = namedtuple(
+    "BertAbsConfig",
+    [
+        "temp_dir",
+        "large",
+        "use_bert_emb",
+        "finetune_bert",
+        "encoder",
+        "share_emb",
+        "max_pos",
+        "enc_layers",
+        "enc_hidden_size",
+        "enc_heads",
+        "enc_ff_size",
+        "enc_dropout",
+        "dec_layers",
+        "dec_hidden_size",
+        "dec_heads",
+        "dec_ff_size",
+        "dec_dropout",
+    ],
+)
+
+
+def convert_bertabs_checkpoints(path_to_checkpoints, dump_path):
+    """ Copy/paste and tweak the pre-trained weights provided by the creators
+    of BertAbs for the internal architecture.
+    """
+
+    # Instantiate the authors' model with the pre-trained weights
+    config = BertAbsConfig(
+        temp_dir=".",
+        finetune_bert=False,
+        large=False,
+        share_emb=True,
+        use_bert_emb=False,
+        encoder="bert",
+        max_pos=512,
+        enc_layers=6,
+        enc_hidden_size=512,
+        enc_heads=8,
+        enc_ff_size=512,
+        enc_dropout=0.2,
+        dec_layers=6,
+        dec_hidden_size=768,
+        dec_heads=8,
+        dec_ff_size=2048,
+        dec_dropout=0.2,
+    )
+    checkpoints = torch.load(path_to_checkpoints, lambda storage, loc: storage)
+    original = AbsSummarizer(config, torch.device("cpu"), checkpoints)
+    original.eval()
+
+    new_model = BertAbsSummarizer(config, torch.device("cpu"))
+    new_model.eval()
+
+    # -------------------
+    # Convert the weights
+    # -------------------
+
+    logging.info("convert the model")
+    new_model.bert.load_state_dict(original.bert.state_dict())
+    new_model.decoder.load_state_dict(original.decoder.state_dict())
+    new_model.generator.load_state_dict(original.generator.state_dict())
+
+    # ----------------------------------
+    # Make sure the outpus are identical
+    # ----------------------------------
+
+    logging.info("Make sure that the models' outputs are identical")
+    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+
+    # prepare the model inputs
+    encoder_input_ids = tokenizer.encode("This is sample éàalj'-.")
+    encoder_input_ids.extend([tokenizer.pad_token_id] * (512 - len(encoder_input_ids)))
+    encoder_input_ids = torch.tensor(encoder_input_ids).unsqueeze(0)
+    decoder_input_ids = tokenizer.encode("This is sample 3 éàalj'-.")
+    decoder_input_ids.extend([tokenizer.pad_token_id] * (512 - len(decoder_input_ids)))
+    decoder_input_ids = torch.tensor(decoder_input_ids).unsqueeze(0)
+
+    # failsafe to make sure the weights reset does not affect the
+    # loaded weights.
+    assert torch.max(torch.abs(original.generator[0].weight - new_model.generator[0].weight)) == 0
+
+    # forward pass
+    src = encoder_input_ids
+    tgt = decoder_input_ids
+    segs = token_type_ids = None
+    clss = None
+    mask_src = encoder_attention_mask = None
+    mask_tgt = decoder_attention_mask = None
+    mask_cls = None
+
+    # The original model does not apply the geneator layer immediatly but rather in
+    # the beam search (where it combines softmax + linear layer). Since we already
+    # apply the softmax in our generation process we only apply the linear layer here.
+    # We make sure that the outputs of the full stack are identical
+    output_original_model = original(src, tgt, segs, clss, mask_src, mask_tgt, mask_cls)[0]
+    output_original_generator = original.generator(output_original_model)
+
+    output_converted_model = new_model(
+        encoder_input_ids, decoder_input_ids, token_type_ids, encoder_attention_mask, decoder_attention_mask
+    )[0]
+    output_converted_generator = new_model.generator(output_converted_model)
+
+    maximum_absolute_difference = torch.max(torch.abs(output_converted_model - output_original_model)).item()
+    print("Maximum absolute difference beween weights: {:.2f}".format(maximum_absolute_difference))
+    maximum_absolute_difference = torch.max(torch.abs(output_converted_generator - output_original_generator)).item()
+    print("Maximum absolute difference beween weights: {:.2f}".format(maximum_absolute_difference))
+
+    are_identical = torch.allclose(output_converted_model, output_original_model, atol=1e-3)
+    if are_identical:
+        logging.info("all weights are equal up to 1e-3")
+    else:
+        raise ValueError("the weights are different. The new model is likely different from the original one.")
+
+    # The model has been saved with torch.save(model) and this is bound to the exact
+    # directory structure. We save the state_dict instead.
+    logging.info("saving the model's state dictionary")
+    torch.save(
+        new_model.state_dict(), "bertabs-finetuned-cnndm-extractive-abstractive-summarization-pytorch_model.bin"
+    )
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--bertabs_checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump.",
+    )
+    parser.add_argument(
+        "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model.",
+    )
+    args = parser.parse_args()
+
+    convert_bertabs_checkpoints(
+        args.bertabs_checkpoint_path, args.pytorch_dump_folder_path,
+    )
--- a/examples/summarization/modeling_bertabs.py
+++ b/examples/summarization/modeling_bertabs.py
--- a/examples/summarization/requirements.txt
+++ b/examples/summarization/requirements.txt
@@ -0,0 +1,5 @@
+transformers
+
+# For ROUGE
+nltk
+py-rouge
--- a/examples/summarization/run_summarization.py
+++ b/examples/summarization/run_summarization.py
@@ -0,0 +1,323 @@
+#! /usr/bin/python3
+import argparse
+import logging
+import os
+import sys
+from collections import namedtuple
+
+import torch
+from torch.utils.data import DataLoader, SequentialSampler
+from tqdm import tqdm
+
+from modeling_bertabs import BertAbs, build_predictor
+from transformers import BertTokenizer
+from utils_summarization import (
+    SummarizationDataset,
+    build_mask,
+    compute_token_type_ids,
+    encode_for_summarization,
+    fit_to_block_size,
+)
+
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+
+
+Batch = namedtuple("Batch", ["document_names", "batch_size", "src", "segs", "mask_src", "tgt_str"])
+
+
+def evaluate(args):
+    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
+    model = BertAbs.from_pretrained("bertabs-finetuned-cnndm")
+    model.to(args.device)
+    model.eval()
+
+    symbols = {
+        "BOS": tokenizer.vocab["[unused0]"],
+        "EOS": tokenizer.vocab["[unused1]"],
+        "PAD": tokenizer.vocab["[PAD]"],
+    }
+
+    if args.compute_rouge:
+        reference_summaries = []
+        generated_summaries = []
+
+        import rouge
+        import nltk
+
+        nltk.download("punkt")
+        rouge_evaluator = rouge.Rouge(
+            metrics=["rouge-n", "rouge-l"],
+            max_n=2,
+            limit_length=True,
+            length_limit=args.beam_size,
+            length_limit_type="words",
+            apply_avg=True,
+            apply_best=False,
+            alpha=0.5,  # Default F1_score
+            weight_factor=1.2,
+            stemming=True,
+        )
+
+    # these (unused) arguments are defined to keep the compatibility
+    # with the legacy code and will be deleted in a next iteration.
+    args.result_path = ""
+    args.temp_dir = ""
+
+    data_iterator = build_data_iterator(args, tokenizer)
+    predictor = build_predictor(args, tokenizer, symbols, model)
+
+    logger.info("***** Running evaluation *****")
+    logger.info("  Number examples = %d", len(data_iterator.dataset))
+    logger.info("  Batch size = %d", args.batch_size)
+    logger.info("")
+    logger.info("***** Beam Search parameters *****")
+    logger.info("  Beam size = %d", args.beam_size)
+    logger.info("  Minimum length = %d", args.min_length)
+    logger.info("  Maximum length = %d", args.max_length)
+    logger.info("  Alpha (length penalty) = %.2f", args.alpha)
+    logger.info("  Trigrams %s be blocked", ("will" if args.block_trigram else "will NOT"))
+
+    for batch in tqdm(data_iterator):
+        batch_data = predictor.translate_batch(batch)
+        translations = predictor.from_batch(batch_data)
+        summaries = [format_summary(t) for t in translations]
+        save_summaries(summaries, args.summaries_output_dir, batch.document_names)
+
+        if args.compute_rouge:
+            reference_summaries += batch.tgt_str
+            generated_summaries += summaries
+
+    if args.compute_rouge:
+        scores = rouge_evaluator.get_scores(generated_summaries, reference_summaries)
+        str_scores = format_rouge_scores(scores)
+        save_rouge_scores(str_scores)
+        print(str_scores)
+
+
+def save_summaries(summaries, path, original_document_name):
+    """ Write the summaries in fies that are prefixed by the original
+    files' name with the `_summary` appended.
+
+    Attributes:
+        original_document_names: List[string]
+            Name of the document that was summarized.
+        path: string
+            Path were the summaries will be written
+        summaries: List[string]
+            The summaries that we produced.
+    """
+    for summary, document_name in zip(summaries, original_document_name):
+        # Prepare the summary file's name
+        if "." in document_name:
+            bare_document_name = ".".join(document_name.split(".")[:-1])
+            extension = document_name.split(".")[-1]
+            name = bare_document_name + "_summary." + extension
+        else:
+            name = document_name + "_summary"
+
+        file_path = os.path.join(path, name)
+        with open(file_path, "w") as output:
+            output.write(summary)
+
+
+def format_summary(translation):
+    """ Transforms the output of the `from_batch` function
+    into nicely formatted summaries.
+    """
+    raw_summary, _, _ = translation
+    summary = (
+        raw_summary.replace("[unused0]", "")
+        .replace("[unused3]", "")
+        .replace("[PAD]", "")
+        .replace("[unused1]", "")
+        .replace(r" +", " ")
+        .replace(" [unused2] ", ". ")
+        .replace("[unused2]", "")
+        .strip()
+    )
+
+    return summary
+
+
+def format_rouge_scores(scores):
+    return """\n
+****** ROUGE SCORES ******
+
+** ROUGE 1
+F1        >> {:.3f}
+Precision >> {:.3f}
+Recall    >> {:.3f}
+
+** ROUGE 2
+F1        >> {:.3f}
+Precision >> {:.3f}
+Recall    >> {:.3f}
+
+** ROUGE L
+F1        >> {:.3f}
+Precision >> {:.3f}
+Recall    >> {:.3f}""".format(
+        scores["rouge-1"]["f"],
+        scores["rouge-1"]["p"],
+        scores["rouge-1"]["r"],
+        scores["rouge-2"]["f"],
+        scores["rouge-2"]["p"],
+        scores["rouge-2"]["r"],
+        scores["rouge-l"]["f"],
+        scores["rouge-l"]["p"],
+        scores["rouge-l"]["r"],
+    )
+
+
+def save_rouge_scores(str_scores):
+    with open("rouge_scores.txt", "w") as output:
+        output.write(str_scores)
+
+
+#
+# LOAD the dataset
+#
+
+
+def build_data_iterator(args, tokenizer):
+    dataset = load_and_cache_examples(args, tokenizer)
+    sampler = SequentialSampler(dataset)
+
+    def collate_fn(data):
+        return collate(data, tokenizer, block_size=512, device=args.device)
+
+    iterator = DataLoader(dataset, sampler=sampler, batch_size=args.batch_size, collate_fn=collate_fn,)
+
+    return iterator
+
+
+def load_and_cache_examples(args, tokenizer):
+    dataset = SummarizationDataset(args.documents_dir)
+    return dataset
+
+
+def collate(data, tokenizer, block_size, device):
+    """ Collate formats the data passed to the data loader.
+
+    In particular we tokenize the data batch after batch to avoid keeping them
+    all in memory. We output the data as a namedtuple to fit the original BertAbs's
+    API.
+    """
+    data = [x for x in data if not len(x[1]) == 0]  # remove empty_files
+    names = [name for name, _, _ in data]
+    summaries = [" ".join(summary_list) for _, _, summary_list in data]
+
+    encoded_text = [encode_for_summarization(story, summary, tokenizer) for _, story, summary in data]
+    encoded_stories = torch.tensor(
+        [fit_to_block_size(story, block_size, tokenizer.pad_token_id) for story, _ in encoded_text]
+    )
+    encoder_token_type_ids = compute_token_type_ids(encoded_stories, tokenizer.cls_token_id)
+    encoder_mask = build_mask(encoded_stories, tokenizer.pad_token_id)
+
+    batch = Batch(
+        document_names=names,
+        batch_size=len(encoded_stories),
+        src=encoded_stories.to(device),
+        segs=encoder_token_type_ids.to(device),
+        mask_src=encoder_mask.to(device),
+        tgt_str=summaries,
+    )
+
+    return batch
+
+
+def decode_summary(summary_tokens, tokenizer):
+    """ Decode the summary and return it in a format
+    suitable for evaluation.
+    """
+    summary_tokens = summary_tokens.to("cpu").numpy()
+    summary = tokenizer.decode(summary_tokens)
+    sentences = summary.split(".")
+    sentences = [s + "." for s in sentences]
+    return sentences
+
+
+def main():
+    """ The main function defines the interface with the users.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--documents_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The folder where the documents to summarize are located.",
+    )
+    parser.add_argument(
+        "--summaries_output_dir",
+        default=None,
+        type=str,
+        required=False,
+        help="The folder in wich the summaries should be written. Defaults to the folder where the documents are",
+    )
+    parser.add_argument(
+        "--compute_rouge",
+        default=False,
+        type=bool,
+        required=False,
+        help="Compute the ROUGE metrics during evaluation. Only available for the CNN/DailyMail dataset.",
+    )
+    # EVALUATION options
+    parser.add_argument(
+        "--no_cuda", default=False, type=bool, help="Whether to force the execution on CPU.",
+    )
+    parser.add_argument(
+        "--batch_size", default=4, type=int, help="Batch size per GPU/CPU for training.",
+    )
+    # BEAM SEARCH arguments
+    parser.add_argument(
+        "--min_length", default=50, type=int, help="Minimum number of tokens for the summaries.",
+    )
+    parser.add_argument(
+        "--max_length", default=200, type=int, help="Maixmum number of tokens for the summaries.",
+    )
+    parser.add_argument(
+        "--beam_size", default=5, type=int, help="The number of beams to start with for each example.",
+    )
+    parser.add_argument(
+        "--alpha", default=0.95, type=float, help="The value of alpha for the length penalty in the beam search.",
+    )
+    parser.add_argument(
+        "--block_trigram",
+        default=True,
+        type=bool,
+        help="Whether to block the existence of repeating trigrams in the text generated by beam search.",
+    )
+    args = parser.parse_args()
+
+    # Select device (distibuted not available)
+    args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+
+    # Check the existence of directories
+    if not args.summaries_output_dir:
+        args.summaries_output_dir = args.documents_dir
+
+    if not documents_dir_is_valid(args.documents_dir):
+        raise FileNotFoundError(
+            "We could not find the directory you specified for the documents to summarize, or it was empty. Please specify a valid path."
+        )
+    os.makedirs(args.summaries_output_dir, exist_ok=True)
+
+    evaluate(args)
+
+
+def documents_dir_is_valid(path):
+    if not os.path.exists(path):
+        return False
+
+    file_list = os.listdir(path)
+    if len(file_list) == 0:
+        return False
+
+    return True
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/summarization/test_utils_summarization.py
+++ b/examples/summarization/test_utils_summarization.py
@@ -0,0 +1,100 @@
+# coding=utf-8
+# Copyright 2019 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+
+import numpy as np
+import torch
+
+from utils_summarization import build_mask, compute_token_type_ids, fit_to_block_size, process_story
+
+
+class SummarizationDataProcessingTest(unittest.TestCase):
+    def setUp(self):
+        self.block_size = 10
+
+    def test_fit_to_block_sequence_too_small(self):
+        """ Pad the sequence with 0 if the sequence is smaller than the block size."""
+        sequence = [1, 2, 3, 4]
+        expected_output = [1, 2, 3, 4, 0, 0, 0, 0, 0, 0]
+        self.assertEqual(fit_to_block_size(sequence, self.block_size, 0), expected_output)
+
+    def test_fit_to_block_sequence_fit_exactly(self):
+        """ Do nothing if the sequence is the right size. """
+        sequence = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+        expected_output = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+        self.assertEqual(fit_to_block_size(sequence, self.block_size, 0), expected_output)
+
+    def test_fit_to_block_sequence_too_big(self):
+        """ Truncate the sequence if it is too long. """
+        sequence = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
+        expected_output = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+        self.assertEqual(fit_to_block_size(sequence, self.block_size, 0), expected_output)
+
+    def test_process_story_no_highlights(self):
+        """ Processing a story with no highlights returns an empty list for the summary.
+        """
+        raw_story = """It was the year of Our Lord one thousand seven hundred and
+        seventy-five.\n\nSpiritual revelations were conceded to England at that
+        favoured period, as at this."""
+        _, summary_lines = process_story(raw_story)
+        self.assertEqual(summary_lines, [])
+
+    def test_process_empty_story(self):
+        """ An empty story returns an empty collection of lines.
+        """
+        raw_story = ""
+        story_lines, summary_lines = process_story(raw_story)
+        self.assertEqual(story_lines, [])
+        self.assertEqual(summary_lines, [])
+
+    def test_process_story_with_missing_period(self):
+        raw_story = (
+            "It was the year of Our Lord one thousand seven hundred and "
+            "seventy-five\n\nSpiritual revelations were conceded to England "
+            "at that favoured period, as at this.\n@highlight\n\nIt was the best of times"
+        )
+        story_lines, summary_lines = process_story(raw_story)
+
+        expected_story_lines = [
+            "It was the year of Our Lord one thousand seven hundred and seventy-five.",
+            "Spiritual revelations were conceded to England at that favoured period, as at this.",
+        ]
+        self.assertEqual(expected_story_lines, story_lines)
+
+        expected_summary_lines = ["It was the best of times."]
+        self.assertEqual(expected_summary_lines, summary_lines)
+
+    def test_build_mask_no_padding(self):
+        sequence = torch.tensor([1, 2, 3, 4])
+        expected = torch.tensor([1, 1, 1, 1])
+        np.testing.assert_array_equal(build_mask(sequence, 0).numpy(), expected.numpy())
+
+    def test_build_mask(self):
+        sequence = torch.tensor([1, 2, 3, 4, 23, 23, 23])
+        expected = torch.tensor([1, 1, 1, 1, 0, 0, 0])
+        np.testing.assert_array_equal(build_mask(sequence, 23).numpy(), expected.numpy())
+
+    def test_build_mask_with_padding_equal_to_one(self):
+        sequence = torch.tensor([8, 2, 3, 4, 1, 1, 1])
+        expected = torch.tensor([1, 1, 1, 1, 0, 0, 0])
+        np.testing.assert_array_equal(build_mask(sequence, 1).numpy(), expected.numpy())
+
+    def test_compute_token_type_ids(self):
+        separator = 101
+        batch = torch.tensor([[1, 2, 3, 4, 5, 6], [1, 2, 3, 101, 5, 6], [1, 101, 3, 4, 101, 6]])
+        expected = torch.tensor([[1, 1, 1, 1, 1, 1], [1, 1, 1, 0, 0, 0], [1, 0, 0, 0, 1, 1]])
+
+        result = compute_token_type_ids(batch, separator)
+        np.testing.assert_array_equal(result, expected)
--- a/examples/summarization/utils_summarization.py
+++ b/examples/summarization/utils_summarization.py
@@ -0,0 +1,167 @@
+import os
+from collections import deque
+
+import torch
+from torch.utils.data import Dataset
+
+
+# ------------
+# Data loading
+# ------------
+
+
+class SummarizationDataset(Dataset):
+    """ Abstracts the dataset used to train seq2seq models.
+
+    The class will process the documents that are located in the specified
+    folder. The preprocessing will work on any document that is reasonably
+    formatted. On the CNN/DailyMail dataset it will extract both the story
+    and the summary.
+
+    CNN/Daily News:
+
+    The CNN/Daily News raw datasets are downloaded from [1]. The stories are
+    stored in different files; the summary appears at the end of the story as
+    sentences that are prefixed by the special `@highlight` line. To process
+    the data, untar both datasets in the same folder, and pass the path to this
+    folder as the "data_dir argument. The formatting code was inspired by [2].
+
+    [1] https://cs.nyu.edu/~kcho/
+    [2] https://github.com/abisee/cnn-dailymail/
+    """
+
+    def __init__(self, path="", prefix="train"):
+        """ We initialize the class by listing all the documents to summarize.
+        Files are not read in memory due to the size of some datasets (like CNN/DailyMail).
+        """
+        assert os.path.isdir(path)
+
+        self.documents = []
+        story_filenames_list = os.listdir(path)
+        for story_filename in story_filenames_list:
+            if "summary" in story_filename:
+                continue
+            path_to_story = os.path.join(path, story_filename)
+            if not os.path.isfile(path_to_story):
+                continue
+            self.documents.append(path_to_story)
+
+    def __len__(self):
+        """ Returns the number of documents. """
+        return len(self.documents)
+
+    def __getitem__(self, idx):
+        document_path = self.documents[idx]
+        document_name = document_path.split("/")[-1]
+        with open(document_path, encoding="utf-8") as source:
+            raw_story = source.read()
+            story_lines, summary_lines = process_story(raw_story)
+        return document_name, story_lines, summary_lines
+
+
+def process_story(raw_story):
+    """ Extract the story and summary from a story file.
+
+    Attributes:
+        raw_story (str): content of the story file as an utf-8 encoded string.
+
+    Raises:
+        IndexError: If the stoy is empty or contains no highlights.
+    """
+    nonempty_lines = list(filter(lambda x: len(x) != 0, [line.strip() for line in raw_story.split("\n")]))
+
+    # for some unknown reason some lines miss a period, add it
+    nonempty_lines = [_add_missing_period(line) for line in nonempty_lines]
+
+    # gather article lines
+    story_lines = []
+    lines = deque(nonempty_lines)
+    while True:
+        try:
+            element = lines.popleft()
+            if element.startswith("@highlight"):
+                break
+            story_lines.append(element)
+        except IndexError:
+            # if "@highlight" is absent from the file we pop
+            # all elements until there is None, raising an exception.
+            return story_lines, []
+
+    # gather summary lines
+    summary_lines = list(filter(lambda t: not t.startswith("@highlight"), lines))
+
+    return story_lines, summary_lines
+
+
+def _add_missing_period(line):
+    END_TOKENS = [".", "!", "?", "...", "'", "`", '"', "\u2019", "\u2019", ")"]
+    if line.startswith("@highlight"):
+        return line
+    if line[-1] in END_TOKENS:
+        return line
+    return line + "."
+
+
+# --------------------------
+# Encoding and preprocessing
+# --------------------------
+
+
+def fit_to_block_size(sequence, block_size, pad_token_id):
+    """ Adapt the source and target sequences' lengths to the block size.
+    If the sequence is shorter we append padding token to the right of the sequence.
+    """
+    if len(sequence) > block_size:
+        return sequence[:block_size]
+    else:
+        sequence.extend([pad_token_id] * (block_size - len(sequence)))
+        return sequence
+
+
+def build_mask(sequence, pad_token_id):
+    """ Builds the mask. The attention mechanism will only attend to positions
+    with value 1. """
+    mask = torch.ones_like(sequence)
+    idx_pad_tokens = sequence == pad_token_id
+    mask[idx_pad_tokens] = 0
+    return mask
+
+
+def encode_for_summarization(story_lines, summary_lines, tokenizer):
+    """ Encode the story and summary lines, and join them
+    as specified in [1] by using `[SEP] [CLS]` tokens to separate
+    sentences.
+    """
+    story_lines_token_ids = [tokenizer.encode(line) for line in story_lines]
+    story_token_ids = [token for sentence in story_lines_token_ids for token in sentence]
+    summary_lines_token_ids = [tokenizer.encode(line) for line in summary_lines]
+    summary_token_ids = [token for sentence in summary_lines_token_ids for token in sentence]
+
+    return story_token_ids, summary_token_ids
+
+
+def compute_token_type_ids(batch, separator_token_id):
+    """ Segment embeddings as described in [1]
+
+    The values {0,1} were found in the repository [2].
+
+    Attributes:
+        batch: torch.Tensor, size [batch_size, block_size]
+            Batch of input.
+        separator_token_id: int
+            The value of the token that separates the segments.
+
+    [1] Liu, Yang, and Mirella Lapata. "Text summarization with pretrained encoders."
+        arXiv preprint arXiv:1908.08345 (2019).
+    [2] https://github.com/nlpyang/PreSumm (/src/prepro/data_builder.py, commit fac1217)
+    """
+    batch_embeddings = []
+    for sequence in batch:
+        sentence_num = -1
+        embeddings = []
+        for s in sequence:
+            if s == separator_token_id:
+                sentence_num += 1
+            embeddings.append(sentence_num % 2)
+        batch_embeddings.append(embeddings)
+    return torch.tensor(batch_embeddings)
--- a/examples/test_examples.py
+++ b/examples/test_examples.py
@@ -12,57 +12,53 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function

-import sys
-import unittest
+
 import argparse
 import logging
+import sys
+import unittest
+from unittest.mock import patch

-try:
-    # python 3.4+ can use builtin unittest.mock instead of mock package
-    from unittest.mock import patch
-except ImportError:
-    from mock import patch
-
+import run_generation
 import run_glue
 import run_squad
-import run_generation
+

 logging.basicConfig(level=logging.DEBUG)

 logger = logging.getLogger()

+
 def get_setup_file():
    parser = argparse.ArgumentParser()
-    parser.add_argument('-f')
+    parser.add_argument("-f")
    args = parser.parse_args()
    return args.f

-class ExamplesTests(unittest.TestCase):

+class ExamplesTests(unittest.TestCase):
    def test_run_glue(self):
        stream_handler = logging.StreamHandler(sys.stdout)
        logger.addHandler(stream_handler)

-        testargs = ["run_glue.py",
-                    "--data_dir=./examples/tests_samples/MRPC/",
-                    "--task_name=mrpc",
-                    "--do_train",
-                    "--do_eval",
-                    "--output_dir=./examples/tests_samples/temp_dir",
-                    "--per_gpu_train_batch_size=2",
-                    "--per_gpu_eval_batch_size=1",
-                    "--learning_rate=1e-4",
-                    "--max_steps=10",
-                    "--warmup_steps=2",
-                    "--overwrite_output_dir",
-                    "--seed=42"]
-        model_type, model_name = ("--model_type=bert",
-                                  "--model_name_or_path=bert-base-uncased")
-        with patch.object(sys, 'argv', testargs + [model_type, model_name]):
+        testargs = [
+            "run_glue.py",
+            "--data_dir=./examples/tests_samples/MRPC/",
+            "--task_name=mrpc",
+            "--do_train",
+            "--do_eval",
+            "--output_dir=./examples/tests_samples/temp_dir",
+            "--per_gpu_train_batch_size=2",
+            "--per_gpu_eval_batch_size=1",
+            "--learning_rate=1e-4",
+            "--max_steps=10",
+            "--warmup_steps=2",
+            "--overwrite_output_dir",
+            "--seed=42",
+        ]
+        model_type, model_name = ("--model_type=bert", "--model_name_or_path=bert-base-uncased")
+        with patch.object(sys, "argv", testargs + [model_type, model_name]):
            result = run_glue.main()
            for value in result.values():
                self.assertGreaterEqual(value, 0.75)
@@ -71,41 +67,34 @@ class ExamplesTests(unittest.TestCase):
        stream_handler = logging.StreamHandler(sys.stdout)
        logger.addHandler(stream_handler)

-        testargs = ["run_squad.py",
-                    "--train_file=./examples/tests_samples/SQUAD/dev-v2.0-small.json",
-                    "--predict_file=./examples/tests_samples/SQUAD/dev-v2.0-small.json",
-                    "--model_name=bert-base-uncased",
-                    "--output_dir=./examples/tests_samples/temp_dir",
-                    "--max_steps=10",
-                    "--warmup_steps=2",
-                    "--do_train",
-                    "--do_eval",
-                    "--version_2_with_negative",
-                    "--learning_rate=2e-4",
-                    "--per_gpu_train_batch_size=2",
-                    "--per_gpu_eval_batch_size=1",
-                    "--overwrite_output_dir",
-                    "--seed=42"]
-        model_type, model_name = ("--model_type=bert",
-                                  "--model_name_or_path=bert-base-uncased")
-        with patch.object(sys, 'argv', testargs + [model_type, model_name]):
+        testargs = [
+            "run_squad.py",
+            "--data_dir=./examples/tests_samples/SQUAD",
+            "--model_name=bert-base-uncased",
+            "--output_dir=./examples/tests_samples/temp_dir",
+            "--max_steps=10",
+            "--warmup_steps=2",
+            "--do_train",
+            "--do_eval",
+            "--version_2_with_negative",
+            "--learning_rate=2e-4",
+            "--per_gpu_train_batch_size=2",
+            "--per_gpu_eval_batch_size=1",
+            "--overwrite_output_dir",
+            "--seed=42",
+        ]
+        model_type, model_name = ("--model_type=bert", "--model_name_or_path=bert-base-uncased")
+        with patch.object(sys, "argv", testargs + [model_type, model_name]):
            result = run_squad.main()
-            self.assertGreaterEqual(result['f1'], 30)
-            self.assertGreaterEqual(result['exact'], 30)
+            self.assertGreaterEqual(result["f1"], 30)
+            self.assertGreaterEqual(result["exact"], 30)

    def test_generation(self):
        stream_handler = logging.StreamHandler(sys.stdout)
        logger.addHandler(stream_handler)

-        testargs = ["run_generation.py",
-                    "--prompt=Hello",
-                    "--length=10",
-                    "--seed=42"]
-        model_type, model_name = ("--model_type=openai-gpt",
-                                  "--model_name_or_path=openai-gpt")
-        with patch.object(sys, 'argv', testargs + [model_type, model_name]):
+        testargs = ["run_generation.py", "--prompt=Hello", "--length=10", "--seed=42"]
+        model_type, model_name = ("--model_type=openai-gpt", "--model_name_or_path=openai-gpt")
+        with patch.object(sys, "argv", testargs + [model_type, model_name]):
            result = run_generation.main()
            self.assertGreaterEqual(len(result), 10)
-
-if __name__ == "__main__":
-    unittest.main()
--- a/examples/tests_samples/SQUAD/dev-v2.0-small.json
+++ b/examples/tests_samples/SQUAD/dev-v2.0-small.json
--- a/examples/tests_samples/SQUAD/train-v2.0.json
+++ b/examples/tests_samples/SQUAD/train-v2.0.json
@@ -0,0 +1,140 @@
+{
+    "version": "v2.0",
+    "data": [{
+        "title": "Normans",
+        "paragraphs": [{
+            "qas": [{
+                "question": "In what country is Normandy located?",
+                "id": "56ddde6b9a695914005b9628",
+                "answers": [{
+                    "text": "France",
+                    "answer_start": 159
+                }],
+                "is_impossible": false
+            }, {
+                "question": "When were the Normans in Normandy?",
+                "id": "56ddde6b9a695914005b9629",
+                "answers": [{
+                    "text": "10th and 11th centuries",
+                    "answer_start": 94
+                }],
+                "is_impossible": false
+            }, {
+                "question": "From which countries did the Norse originate?",
+                "id": "56ddde6b9a695914005b962a",
+                "answers": [{
+                    "text": "Denmark, Iceland and Norway",
+                    "answer_start": 256
+                }],
+                "is_impossible": false
+            }, {
+                "plausible_answers": [{
+                    "text": "Rollo",
+                    "answer_start": 308
+                }],
+                "question": "Who did King Charles III swear fealty to?",
+                "id": "5ad39d53604f3c001a3fe8d3",
+                "answers": [],
+                "is_impossible": true
+            }, {
+                "plausible_answers": [{
+                    "text": "10th century",
+                    "answer_start": 671
+                }],
+                "question": "When did the Frankish identity emerge?",
+                "id": "5ad39d53604f3c001a3fe8d4",
+                "answers": [],
+                "is_impossible": true
+            }],
+            "context": "The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (\"Norman\" comes from \"Norseman\") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries."
+        }, {
+            "qas": [{
+                "question": "Who was the duke in the battle of Hastings?",
+                "id": "56dddf4066d3e219004dad5f",
+                "answers": [{
+                    "text": "William the Conqueror",
+                    "answer_start": 1022
+                }],
+                "is_impossible": false
+            }, {
+                "plausible_answers": [{
+                    "text": "Antioch",
+                    "answer_start": 1295
+                }],
+                "question": "What principality did William the conquerer found?",
+                "id": "5ad3a266604f3c001a3fea2b",
+                "answers": [],
+                "is_impossible": true
+            }],
+            "context": "The Norman dynasty had a major political, cultural and military impact on medieval Europe and even the Near East. The Normans were famed for their martial spirit and eventually for their Christian piety, becoming exponents of the Catholic orthodoxy into which they assimilated. They adopted the Gallo-Romance language of the Frankish land they settled, their dialect becoming known as Norman, Normaund or Norman French, an important literary language. The Duchy of Normandy, which they formed by treaty with the French crown, was a great fief of medieval France, and under Richard I of Normandy was forged into a cohesive and formidable principality in feudal tenure. The Normans are noted both for their culture, such as their unique Romanesque architecture and musical traditions, and for their significant military accomplishments and innovations. Norman adventurers founded the Kingdom of Sicily under Roger II after conquering southern Italy on the Saracens and Byzantines, and an expedition on behalf of their duke, William the Conqueror, led to the Norman conquest of England at the Battle of Hastings in 1066. Norman cultural and military influence spread from these new European centres to the Crusader states of the Near East, where their prince Bohemond I founded the Principality of Antioch in the Levant, to Scotland and Wales in Great Britain, to Ireland, and to the coasts of north Africa and the Canary Islands."
+        }]
+    }, {
+        "title": "Computational_complexity_theory",
+        "paragraphs": [{
+            "qas": [{
+                "question": "What branch of theoretical computer science deals with broadly classifying computational problems by difficulty and class of relationship?",
+                "id": "56e16182e3433e1400422e28",
+                "answers": [{
+                    "text": "Computational complexity theory",
+                    "answer_start": 0
+                }],
+                "is_impossible": false
+            }, {
+                "plausible_answers": [{
+                    "text": "algorithm",
+                    "answer_start": 472
+                }],
+                "question": "What is a manual application of mathematical steps?",
+                "id": "5ad5316b5b96ef001a10ab76",
+                "answers": [],
+                "is_impossible": true
+            }],
+            "context": "Computational complexity theory is a branch of the theory of computation in theoretical computer science that focuses on classifying computational problems according to their inherent difficulty, and relating those classes to each other. A computational problem is understood to be a task that is in principle amenable to being solved by a computer, which is equivalent to stating that the problem may be solved by mechanical application of mathematical steps, such as an algorithm."
+        }, {
+            "qas": [{
+                "question": "What measure of a computational problem broadly defines the inherent difficulty of the solution?",
+                "id": "56e16839cd28a01900c67887",
+                "answers": [{
+                    "text": "if its solution requires significant resources",
+                    "answer_start": 46
+                }],
+                "is_impossible": false
+            }, {
+                "question": "What method is used to intuitively assess or quantify the amount of resources required to solve a computational problem?",
+                "id": "56e16839cd28a01900c67888",
+                "answers": [{
+                    "text": "mathematical models of computation",
+                    "answer_start": 176
+                }],
+                "is_impossible": false
+            }, {
+                "question": "What are two basic primary resources used to guage complexity?",
+                "id": "56e16839cd28a01900c67889",
+                "answers": [{
+                    "text": "time and storage",
+                    "answer_start": 305
+                }],
+                "is_impossible": false
+            }, {
+                "plausible_answers": [{
+                    "text": "the number of gates in a circuit",
+                    "answer_start": 436
+                }],
+                "question": "What unit is measured to determine circuit simplicity?",
+                "id": "5ad532575b96ef001a10ab7f",
+                "answers": [],
+                "is_impossible": true
+            }, {
+                "plausible_answers": [{
+                    "text": "the number of processors",
+                    "answer_start": 502
+                }],
+                "question": "What number is used in perpendicular computing?",
+                "id": "5ad532575b96ef001a10ab80",
+                "answers": [],
+                "is_impossible": true
+            }],
+            "context": "A problem is regarded as inherently difficult if its solution requires significant resources, whatever the algorithm used. The theory formalizes this intuition, by introducing mathematical models of computation to study these problems and quantifying the amount of resources needed to solve them, such as time and storage. Other complexity measures are also used, such as the amount of communication (used in communication complexity), the number of gates in a circuit (used in circuit complexity) and the number of processors (used in parallel computing). One of the roles of computational complexity theory is to determine the practical limits on what computers can and cannot do."
+        }]
+    }]
+}
--- a/examples/utils_multiple_choice.py
+++ b/examples/utils_multiple_choice.py
@@ -15,18 +15,16 @@
 # limitations under the License.
 """ Multiple choice fine-tuning: utilities to work with multiple choice tasks of reading comprehension  """

-from __future__ import absolute_import, division, print_function

-
-import logging
-import os
-import sys
-from io import open
-import json
 import csv
 import glob
-import tqdm
+import json
+import logging
+import os
 from typing import List
+
+import tqdm
+
 from transformers import PreTrainedTokenizer


@@ -55,19 +53,10 @@ class InputExample(object):


 class InputFeatures(object):
-    def __init__(self,
-                 example_id,
-                 choices_features,
-                 label
-
-    ):
+    def __init__(self, example_id, choices_features, label):
        self.example_id = example_id
        self.choices_features = [
-            {
-                'input_ids': input_ids,
-                'input_mask': input_mask,
-                'segment_ids': segment_ids
-            }
+            {"input_ids": input_ids, "input_mask": input_mask, "segment_ids": segment_ids}
            for input_ids, input_mask, segment_ids in choices_features
        ]
        self.label = label
@@ -99,29 +88,29 @@ class RaceProcessor(DataProcessor):
    def get_train_examples(self, data_dir):
        """See base class."""
        logger.info("LOOKING AT {} train".format(data_dir))
-        high = os.path.join(data_dir, 'train/high')
-        middle = os.path.join(data_dir, 'train/middle')
+        high = os.path.join(data_dir, "train/high")
+        middle = os.path.join(data_dir, "train/middle")
        high = self._read_txt(high)
        middle = self._read_txt(middle)
-        return self._create_examples(high + middle, 'train')
+        return self._create_examples(high + middle, "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        logger.info("LOOKING AT {} dev".format(data_dir))
-        high = os.path.join(data_dir, 'dev/high')
-        middle = os.path.join(data_dir, 'dev/middle')
+        high = os.path.join(data_dir, "dev/high")
+        middle = os.path.join(data_dir, "dev/middle")
        high = self._read_txt(high)
        middle = self._read_txt(middle)
-        return self._create_examples(high + middle, 'dev')
+        return self._create_examples(high + middle, "dev")

    def get_test_examples(self, data_dir):
        """See base class."""
        logger.info("LOOKING AT {} test".format(data_dir))
-        high = os.path.join(data_dir, 'test/high')
-        middle = os.path.join(data_dir, 'test/middle')
+        high = os.path.join(data_dir, "test/high")
+        middle = os.path.join(data_dir, "test/middle")
        high = self._read_txt(high)
        middle = self._read_txt(middle)
-        return self._create_examples(high + middle, 'test')
+        return self._create_examples(high + middle, "test")

    def get_labels(self):
        """See base class."""
@@ -131,13 +120,12 @@ class RaceProcessor(DataProcessor):
        lines = []
        files = glob.glob(input_dir + "/*txt")
        for file in tqdm.tqdm(files, desc="read files"):
-            with open(file, 'r', encoding='utf-8') as fin:
+            with open(file, "r", encoding="utf-8") as fin:
                data_raw = json.load(fin)
                data_raw["race_id"] = file
                lines.append(data_raw)
        return lines

-
    def _create_examples(self, lines, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
@@ -145,19 +133,22 @@ class RaceProcessor(DataProcessor):
            race_id = "%s-%s" % (set_type, data_raw["race_id"])
            article = data_raw["article"]
            for i in range(len(data_raw["answers"])):
-                truth = str(ord(data_raw['answers'][i]) - ord('A'))
-                question = data_raw['questions'][i]
-                options = data_raw['options'][i]
+                truth = str(ord(data_raw["answers"][i]) - ord("A"))
+                question = data_raw["questions"][i]
+                options = data_raw["options"][i]

                examples.append(
                    InputExample(
                        example_id=race_id,
                        question=question,
-                        contexts=[article, article, article, article], # this is not efficient but convenient
+                        contexts=[article, article, article, article],  # this is not efficient but convenient
                        endings=[options[0], options[1], options[2], options[3]],
-                        label=truth))
+                        label=truth,
+                    )
+                )
        return examples

+
 class SwagProcessor(DataProcessor):
    """Processor for the SWAG data set."""

@@ -179,27 +170,19 @@ class SwagProcessor(DataProcessor):
            "setting!"
        )
        return self._create_examples(self._read_csv(os.path.join(data_dir, "test.csv")), "test")
+
    def get_labels(self):
        """See base class."""
        return ["0", "1", "2", "3"]

    def _read_csv(self, input_file):
-        with open(input_file, 'r', encoding='utf-8') as f:
-            reader = csv.reader(f)
-            lines = []
-            for line in reader:
-                if sys.version_info[0] == 2:
-                    line = list(unicode(cell, 'utf-8') for cell in line)
-                lines.append(line)
-            return lines
-
+        with open(input_file, "r", encoding="utf-8") as f:
+            return list(csv.reader(f))

    def _create_examples(self, lines: List[List[str]], type: str):
        """Creates examples for the training and dev sets."""
-        if type == "train" and lines[0][-1] != 'label':
-            raise ValueError(
-                "For training, the input file must contain a label column."
-            )
+        if type == "train" and lines[0][-1] != "label":
+            raise ValueError("For training, the input file must contain a label column.")

        examples = [
            InputExample(
@@ -207,10 +190,11 @@ class SwagProcessor(DataProcessor):
                question=line[5],  # in the swag dataset, the
                # common beginning of each
                # choice is stored in "sent2".
-                contexts = [line[4], line[4], line[4], line[4]],
-                endings = [line[7], line[8], line[9], line[10]],
-                label=line[11]
-            ) for line in lines[1:]  # we skip the line with the column names
+                contexts=[line[4], line[4], line[4], line[4]],
+                endings=[line[7], line[8], line[9], line[10]],
+                label=line[11],
+            )
+            for line in lines[1:]  # we skip the line with the column names
        ]

        return examples
@@ -238,15 +222,14 @@ class ArcProcessor(DataProcessor):
        return ["0", "1", "2", "3"]

    def _read_json(self, input_file):
-        with open(input_file, 'r', encoding='utf-8') as fin:
+        with open(input_file, "r", encoding="utf-8") as fin:
            lines = fin.readlines()
            return lines

-
    def _create_examples(self, lines, type):
        """Creates examples for the training and dev sets."""

-        #There are two types of labels. They should be normalized
+        # There are two types of labels. They should be normalized
        def normalize(truth):
            if truth in "ABCD":
                return ord(truth) - ord("A")
@@ -283,12 +266,18 @@ class ArcProcessor(DataProcessor):
            if len(options) == 4:
                examples.append(
                    InputExample(
-                        example_id = id,
+                        example_id=id,
                        question=question,
-                        contexts=[options[0]["para"].replace("_", ""), options[1]["para"].replace("_", ""),
-                                  options[2]["para"].replace("_", ""), options[3]["para"].replace("_", "")],
+                        contexts=[
+                            options[0]["para"].replace("_", ""),
+                            options[1]["para"].replace("_", ""),
+                            options[2]["para"].replace("_", ""),
+                            options[3]["para"].replace("_", ""),
+                        ],
                        endings=[options[0]["text"], options[1]["text"], options[2]["text"], options[3]["text"]],
-                        label=truth))
+                        label=truth,
+                    )
+                )

        if type == "train":
            assert len(examples) > 1
@@ -316,7 +305,7 @@ def convert_examples_to_features(
    Loads a data file into a list of `InputFeatures`
    """

-    label_map = {label : i for i, label in enumerate(label_list)}
+    label_map = {label: i for i, label in enumerate(label_list)}

    features = []
    for (ex_index, example) in tqdm.tqdm(enumerate(examples), desc="convert examples to features"):
@@ -331,16 +320,13 @@ def convert_examples_to_features(
            else:
                text_b = example.question + " " + ending

-            inputs = tokenizer.encode_plus(
-                text_a,
-                text_b,
-                add_special_tokens=True,
-                max_length=max_length,
-            )
-            if 'num_truncated_tokens' in inputs and inputs['num_truncated_tokens'] > 0:
-                logger.info('Attention! you are cropping tokens (swag task is ok). '
-                        'If you are training ARC and RACE and you are poping question + options,'
-                        'you need to try to use a bigger max seq length!')
+            inputs = tokenizer.encode_plus(text_a, text_b, add_special_tokens=True, max_length=max_length,)
+            if "num_truncated_tokens" in inputs and inputs["num_truncated_tokens"] > 0:
+                logger.info(
+                    "Attention! you are cropping tokens (swag task is ok). "
+                    "If you are training ARC and RACE and you are poping question + options,"
+                    "you need to try to use a bigger max seq length!"
+                )

            input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]

@@ -364,7 +350,6 @@ def convert_examples_to_features(
            assert len(token_type_ids) == max_length
            choices_features.append((input_ids, attention_mask, token_type_ids))

-
        label = label_map[example.label]

        if ex_index < 2:
@@ -372,33 +357,17 @@ def convert_examples_to_features(
            logger.info("race_id: {}".format(example.example_id))
            for choice_idx, (input_ids, attention_mask, token_type_ids) in enumerate(choices_features):
                logger.info("choice: {}".format(choice_idx))
-                logger.info("input_ids: {}".format(' '.join(map(str, input_ids))))
-                logger.info("attention_mask: {}".format(' '.join(map(str, attention_mask))))
-                logger.info("token_type_ids: {}".format(' '.join(map(str, token_type_ids))))
+                logger.info("input_ids: {}".format(" ".join(map(str, input_ids))))
+                logger.info("attention_mask: {}".format(" ".join(map(str, attention_mask))))
+                logger.info("token_type_ids: {}".format(" ".join(map(str, token_type_ids))))
                logger.info("label: {}".format(label))

-        features.append(
-            InputFeatures(
-                example_id=example.example_id,
-                choices_features=choices_features,
-                label=label,
-            )
-        )
+        features.append(InputFeatures(example_id=example.example_id, choices_features=choices_features, label=label,))

    return features


+processors = {"race": RaceProcessor, "swag": SwagProcessor, "arc": ArcProcessor}


-processors = {
-    "race": RaceProcessor,
-    "swag": SwagProcessor,
-    "arc": ArcProcessor
-}
-
-
-MULTIPLE_CHOICE_TASKS_NUM_LABELS = {
-    "race", 4,
-    "swag", 4,
-    "arc", 4
-}
+MULTIPLE_CHOICE_TASKS_NUM_LABELS = {"race", 4, "swag", 4, "arc", 4}
--- a/Show More
+++ b/Show More