Release: 2.1.0

Update CTRL documentation
CTRL added to the documentation
2019-10-09 12:14:03 -04:00 · 2019-10-09 12:12:36 -04:00 · 2019-10-09 12:04:06 -04:00 · 2019-10-09 11:32:21 -04:00 · 2019-10-09 11:31:05 -04:00 · 2019-10-09 17:25:08 +02:00
190 changed files with 21125 additions and 6190 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -1,33 +1,77 @@
 version: 2
 jobs:
-    build_py3:
-        working_directory: ~/pytorch-transformers
+    build_py3_torch_and_tf:
+        working_directory: ~/transformers
        docker:
            - image: circleci/python:3.5
        resource_class: xlarge
        parallelism: 1
        steps:
            - checkout
+            - run: sudo pip install torch
+            - run: sudo pip install tensorflow==2.0.0-rc0
            - run: sudo pip install --progress-bar off .
            - run: sudo pip install pytest codecov pytest-cov
            - run: sudo pip install tensorboardX scikit-learn
-            - run: python -m pytest -sv ./pytorch_transformers/tests/ --cov
+            - run: python -m pytest -sv ./transformers/tests/ --cov
+            - run: codecov
+    build_py3_torch:
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.5
+        resource_class: xlarge
+        parallelism: 1
+        steps:
+            - checkout
+            - run: sudo pip install torch
+            - run: sudo pip install --progress-bar off .
+            - run: sudo pip install pytest codecov pytest-cov
+            - run: sudo pip install tensorboardX scikit-learn
+            - run: python -m pytest -sv ./transformers/tests/ --cov
            - run: python -m pytest -sv ./examples/
            - run: codecov
-    build_py2:
-        working_directory: ~/pytorch-transformers
+    build_py3_tf:
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.5
+        resource_class: xlarge
+        parallelism: 1
+        steps:
+            - checkout
+            - run: sudo pip install tensorflow==2.0.0-rc0
+            - run: sudo pip install --progress-bar off .
+            - run: sudo pip install pytest codecov pytest-cov
+            - run: sudo pip install tensorboardX scikit-learn
+            - run: python -m pytest -sv ./transformers/tests/ --cov
+            - run: codecov
+    build_py2_torch:
+        working_directory: ~/transformers
        resource_class: large
        parallelism: 1
        docker:
            - image: circleci/python:2.7
        steps:
            - checkout
+            - run: sudo pip install torch
            - run: sudo pip install --progress-bar off .
            - run: sudo pip install pytest codecov pytest-cov
-            - run: python -m pytest -sv ./pytorch_transformers/tests/ --cov
+            - run: python -m pytest -sv ./transformers/tests/ --cov
+            - run: codecov
+    build_py2_tf:
+        working_directory: ~/transformers
+        resource_class: large
+        parallelism: 1
+        docker:
+            - image: circleci/python:2.7
+        steps:
+            - checkout
+            - run: sudo pip install tensorflow==2.0.0-rc0
+            - run: sudo pip install --progress-bar off .
+            - run: sudo pip install pytest codecov pytest-cov
+            - run: python -m pytest -sv ./transformers/tests/ --cov
            - run: codecov
    deploy_doc:
-        working_directory: ~/pytorch-transformers
+        working_directory: ~/transformers
        docker:
            - image: circleci/python:3.5
        steps:
@@ -47,6 +91,9 @@ workflows:
    version: 2
    build_and_test:
        jobs:
-            - build_py3
-            - build_py2
+            - build_py3_torch_and_tf
+            - build_py3_torch
+            - build_py3_tf
+            - build_py2_torch
+            - build_py2_tf
            - deploy_doc: *workflow_filters
--- a/.coveragerc
+++ b/.coveragerc
@@ -1,5 +1,5 @@
 [run]
-source=pytorch_transformers
+source=transformers
 omit =
    # skip convertion scripts from testing for now
    */convert_*
--- a/.github/ISSUE_TEMPLATE/--new-model-addition.md
+++ b/.github/ISSUE_TEMPLATE/--new-model-addition.md
@@ -0,0 +1,23 @@
+---
+name: "\U0001F31FNew model addition"
+about: Submit a proposal/request to implement a new Transformer-based model
+title: ''
+labels: ''
+assignees: ''
+
+---
+
+# 🌟New model addition
+
+## Model description
+
+<!-- Important information -->
+
+## Open Source status
+
+* [ ] the model implementation is available: (give details)
+* [ ] the model weights are available: (give details)
+
+## Additional context
+
+<!-- Add any other context about the problem here. -->
--- a/.github/ISSUE_TEMPLATE/bug-report.md
+++ b/.github/ISSUE_TEMPLATE/bug-report.md
@@ -1,6 +1,10 @@
 ---
 name: "\U0001F41B Bug Report"
 about: Submit a bug report to help us improve PyTorch Transformers
+title: ''
+labels: ''
+assignees: ''
+
 ---

 ## 🐛 Bug
@@ -45,4 +49,4 @@ Steps to reproduce the behavior:

 ## Additional context

-<!-- Add any other context about the problem here. -->
+<!-- Add any other context about the problem here. -->
--- a/.github/ISSUE_TEMPLATE/feature-request.md
+++ b/.github/ISSUE_TEMPLATE/feature-request.md
@@ -1,6 +1,10 @@
 ---
 name: "\U0001F680 Feature Request"
 about: Submit a proposal/request for a new PyTorch Transformers feature
+title: ''
+labels: ''
+assignees: ''
+
 ---

 ## 🚀 Feature
@@ -13,4 +17,4 @@ about: Submit a proposal/request for a new PyTorch Transformers feature

 ## Additional context

-<!-- Add any other context or screenshots about the feature request here. -->
+<!-- Add any other context or screenshots about the feature request here. -->
--- a/.github/ISSUE_TEMPLATE/migration.md
+++ b/.github/ISSUE_TEMPLATE/migration.md
@@ -1,6 +1,10 @@
 ---
 name: "\U0001F4DA Migration from PyTorch-pretrained-Bert"
-about: Report a problem when migrating from PyTorch-pretrained-Bert to PyTorch-Transformers
+about: Report a problem when migrating from PyTorch-pretrained-Bert to Transformers
+title: ''
+labels: ''
+assignees: ''
+
 ---

 ## 📚 Migration
@@ -40,4 +44,4 @@ Details of the issue:

 ## Additional context

-<!-- Add any other context about the problem here. -->
+<!-- Add any other context about the problem here. -->
--- a/.github/ISSUE_TEMPLATE/question-help.md
+++ b/.github/ISSUE_TEMPLATE/question-help.md
@@ -1,8 +1,12 @@
 ---
 name: "❓Questions & Help"
 about: Start a general discussion related to PyTorch Transformers
+title: ''
+labels: ''
+assignees: ''
+
 ---

 ## ❓ Questions & Help

-<!-- A clear and concise description of the question. -->
+<!-- A clear and concise description of the question. -->
--- a/.gitignore
+++ b/.gitignore
@@ -118,6 +118,9 @@ dmypy.json
 # vscode
 .vscode

+# Pycharm
+.idea
+
 # TF code
 tensorflow_code

@@ -130,4 +133,8 @@ runs
 examples/runs

 # data
-data
+/data
+serialization_dir
+
+# emacs
+*.*~
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -0,0 +1,175 @@
+# How to contribute to transformers?
+
+Everyone is welcome to contribute, and we value everybody's contribution. Code
+is thus not the only way to help the community. Answering questions, helping
+others, reaching out and improving the documentations are immensely valuable to
+the community.
+
+It also helps us if you spread the word: reference the library from blog posts
+on the awesome projects it made possible, shout out on Twitter every time it has
+helped you, or simply star the repo to say "thank you".
+
+## You can contribute in so many ways!
+
+There are 4 ways you can contribute to transformers:
+* Fixing outstanding issues with the existing code;
+* Implementing new models;
+* Contributing to the examples or to the documentation;
+* Submitting issues related to bugs or desired new features.
+
+*All are equally valuable to the community.*
+
+## Submitting a new issue or feature request
+
+Do your best to follow these guidelines when submitting an issue or a feature
+request. It will make it easier for us to come back to you quickly and with good
+feedback.
+
+### Did you find a bug?
+
+The transformers are robust and reliable thanks to the users who notify us of
+the problems they encounter. So thank you for reporting an issue.
+
+First, we would really appreciate it if you could **make sure the bug was not
+already reported** (use the search bar on Github under Issues).
+
+Did not find it? :( So we can act quickly on it, please follow these steps:
+
+* Include your **OS type and version**, the versions of **Python**, **PyTorch** and
+  **Tensorflow** when applicable;
+* A short, self-contained, code snippet that allows us to reproduce the bug in
+  less than 30s;
+* Provide the *full* traceback if an exception is raised.
+
+To get the OS and software versions, execute the following code and copy-paste
+the output:
+
+```
+import platform; print("Platform", platform.platform())
+import sys; print("Python", sys.version)
+import torch; print("PyTorch", torch.__version__)
+import tensorflow; print("Tensorflow", tensorflow.__version__)
+```
+
+### Do you want to implement a new model?
+
+Awesome! Please provide the following information:
+
+* Short description of the model and link to the paper;
+* Link to the implementation if it is open-source;
+* Link to the model weights if they are available.
+
+If you are willing to contribute the model yourself, let us know so we can best
+guide you.
+
+### Do you want a new feature (that is not a model)?
+
+A world-class feature request addresses the following points:
+
+1. Motivation first:
+  * Is it related to a problem/frustration with the library? If so, please explain
+    why. Providing a code snippet that demonstrates the problem is best.
+  * Is it related to something you would need for a project? We'd love to hear
+    about it!
+  * Is it something you worked on and think could benefit the community?
+    Awesome! Tell us what problem it solved for you.
+2. Write a *full paragraph* describing the feature;
+3. Provide a **code snippet** that demonstrates its future use;
+4. In case this is related to a paper, please attach a link;
+5. Attach any additional information (drawings, screenshots, etc.) you think may help.
+
+If your issue is well written we're already 80% of the way there by the time you
+post it.
+
+## Start contributing! (Pull Requests)
+
+Before writing code, we strongly advise you to search through the exising PRs or
+issues to make sure that nobody is already working on the same thing. If you are
+unsure, it is always a good idea to open an issue to get some feedback.
+
+You will need basic `git` proficiency to be able to contribute to
+`transformers`. `git` is not the easiest tool to use but it has the greatest
+manual. Type `git --help` in a shell and enjoy. If you prefer books, [Pro
+Git](https://git-scm.com/book/en/v2) is a very good reference.
+
+Follow these steps to start contributing:
+
+1. Fork the [repository](https://github.com/huggingface/transformers) by
+   clicking on the 'Fork' button on the repository's page. This creates a copy of the code
+   under your github user account.
+2. Clone your fork to your local disk, and add the base repository as a remote:
+   
+   ```bash
+   $ git clone git@github.com:<your Github handle>/transformers.git
+   $ cd transformers
+   $ git remote add upstream git@github.com:huggingface/transformers.git
+   ```
+
+3. Create a new branch to hold your development changes:
+
+   ```bash
+   $ git checkout -b a-descriptive-name-for-my-changes
+   ```
+   
+   **do not** work on the `master` branch.
+   
+4. Set up a development environment by running the following command in a virtual environment:
+
+   ```bash
+   $ pip install -r requirements-dev.txt
+   ```
+
+5. Develop the features on your branch. Add changed files using `git add` and
+   then `git commit` to record your changes locally:
+   
+   ```bash
+   $ git add modified_file.py
+   $ git commit
+   ```
+   
+   Please write [good commit
+   messages](https://chris.beams.io/posts/git-commit/). It
+   is a good idea to sync your copy of the code with the original repository
+   regularly. This way you can quickly account for changes:
+   
+   ```bash
+   $ git fetch upstream
+   $ git rebase upstream/master
+   ```
+   
+   Push the changes to your account using:
+   
+   ```bash
+   $ git push -u origin a-descriptive-name-for-my-changes
+   ```
+   
+6. Once you are satisfied (**and the checklist below is happy too**), go to the
+   webpage of your fork on Github. Click on 'Pull request' to send your changes
+   to the project maintainers for review.
+   
+7. It's ok if maintainers ask you for changes. It happens to core contributors
+   too! So everyone can see the changes in the Pull request, work in your local
+   branch and push the changes to your fork. They will automatically appear in
+   the pull request.
+
+
+### Checklist
+
+1. The title of your pull request should be a summary of its contribution;
+2. If your pull request adresses an issue, please mention the issue number in
+   the pull request description to make sure they are linked (and people
+   consulting the issue know you are working on it);
+3. To indicate a work in progress please prefix the title with `[WIP]`. These
+   are useful to avoid duplicated work, and to differentiate it from PRs ready
+   to be merged;
+4. Make sure pre-existing tests still pass;
+5. Add high-coverage tests. No quality test, no merge;
+6. All public methods must have informative doctrings;
+
+
+### Style guide
+
+For documentation strings, `transformers` follows the [google
+style](https://google.github.io/styleguide/pyguide.html).
+
+#### This guide was heavily inspired by the awesome [scikit-learn guide to contributing](https://github.com/scikit-learn/scikit-learn/blob/master/CONTRIBUTING.md)
--- a/README.md
+++ b/README.md
@@ -1,10 +1,117 @@
-# 👾 PyTorch-Transformers
+<p align="center">
+    <br>
+    <img src="https://raw.githubusercontent.com/huggingface/transformers/master/docs/source/imgs/transformers_logo_name.png" width="400"/>
+    <br>
+<p>
+<p align="center">
+    <a href="https://circleci.com/gh/huggingface/transformers">
+        <img alt="Build" src="https://img.shields.io/circleci/build/github/huggingface/transformers/master">
+    </a>
+    <a href="https://github.com/huggingface/transformers/blob/master/LICENSE">
+        <img alt="GitHub" src="https://img.shields.io/github/license/huggingface/transformers.svg?color=blue">
+    </a>
+    <a href="https://huggingface.co/transformers/index.html">
+        <img alt="Documentation" src="https://img.shields.io/website/http/huggingface.co/transformers/index.html.svg?down_color=red&down_message=offline&up_message=online">
+    </a>
+    <a href="https://github.com/huggingface/transformers/releases">
+        <img alt="GitHub release" src="https://img.shields.io/github/release/huggingface/transformers.svg">
+    </a>
+</p>

-[![CircleCI](https://circleci.com/gh/huggingface/pytorch-transformers.svg?style=svg)](https://circleci.com/gh/huggingface/pytorch-transformers)
+<h3 align="center">
+<p>State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch
+</h3>

-PyTorch-Transformers (formerly known as `pytorch-pretrained-bert`) is a library of state-of-the-art pre-trained models for Natural Language Processing (NLP).
+🤗 Transformers (formerly known as `pytorch-transformers` and `pytorch-pretrained-bert`) provides state-of-the-art general-purpose architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet, CTRL...) for Natural Language Understanding (NLU) and Natural Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between TensorFlow 2.0 and PyTorch.

-The library currently contains PyTorch implementations, pre-trained model weights, usage scripts and conversion utilities for the following models:
+### Features
+
+- As easy to use as pytorch-transformers
+- As powerful and concise as Keras
+- High performance on NLU and NLG tasks
+- Low barrier to entry for educators and practitioners
+
+State-of-the-art NLP for everyone
+- Deep learning researchers
+- Hands-on practitioners
+- AI/ML/NLP teachers and educators
+
+Lower compute costs, smaller carbon footprint
+- Researchers can share trained models instead of always retraining
+- Practitioners can reduce compute time and production costs
+- 8 architectures with over 30 pretrained models, some in more than 100 languages
+
+Choose the right framework for every part of a model's lifetime
+- Train state-of-the-art models in 3 lines of code
+- Deep interoperability between TensorFlow 2.0 and PyTorch models
+- Move a single model between TF2.0/PyTorch frameworks at will
+- Seamlessly pick the right framework for training, evaluation, production
+
+
+| Section | Description |
+|-|-|
+| [Installation](#installation) | How to install the package |
+| [Model architectures](#model-architectures) | Architectures (with pretrained weights) |
+| [Online demo](#online-demo) | Experimenting with this repo’s text generation capabilities |
+| [Quick tour: Usage](#quick-tour) | Tokenizers & models usage: Bert and GPT-2 |
+| [Quick tour: TF 2.0 and PyTorch ](#Quick-tour-TF-20-training-and-PyTorch-interoperability) | Train a TF 2.0 model in 10 lines of code, load it in PyTorch |
+| [Quick tour: Fine-tuning/usage scripts](#quick-tour-of-the-fine-tuningusage-scripts) | Using provided scripts: GLUE, SQuAD and Text generation |
+| [Migrating from pytorch-transformers to transformers](#Migrating-from-pytorch-transformers-to-transformers) | Migrating your code from pytorch-transformers to transformers |
+| [Migrating from pytorch-pretrained-bert to pytorch-transformers](#Migrating-from-pytorch-pretrained-bert-to-transformers) | Migrating your code from pytorch-pretrained-bert to transformers |
+| [Documentation](https://huggingface.co/transformers/) | Full API documentation and more |
+
+## Installation
+
+This repo is tested on Python 2.7 and 3.5+ (examples are tested only on python 3.5+), PyTorch 1.0.0+ and TensorFlow 2.0.0-rc1
+
+### With pip
+
+First you need to install one of, or both, TensorFlow 2.0 and PyTorch.
+Please refer to [TensorFlow installation page](https://www.tensorflow.org/install/pip#tensorflow-2.0-rc-is-available) and/or [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) regarding the specific install command for your platform.
+
+When TensorFlow 2.0 and/or PyTorch has been installed, 🤗 Transformers can be installed using pip as follows:
+
+```bash
+pip install transformers
+```
+
+### From source
+
+Here also, you first need to install one of, or both, TensorFlow 2.0 and PyTorch.
+Please refer to [TensorFlow installation page](https://www.tensorflow.org/install/pip#tensorflow-2.0-rc-is-available) and/or [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) regarding the specific install command for your platform.
+
+When TensorFlow 2.0 and/or PyTorch has been installed, you can install from source by cloning the repository and running:
+
+```bash
+pip install [--editable] .
+```
+
+### Tests
+
+A series of tests are included for the library and the example scripts. Library tests can be found in the [tests folder](https://github.com/huggingface/transformers/tree/master/transformers/tests) and examples tests in the [examples folder](https://github.com/huggingface/transformers/tree/master/examples).
+
+These tests can be run using `pytest` (install pytest if needed with `pip install pytest`).
+
+Depending on which framework is installed (TensorFlow 2.0 and/or PyTorch), the irrelevant tests will be skipped. Ensure that both frameworks are installed if you want to execute all tests.
+
+You can run the tests from the root of the cloned repository with the commands:
+
+```bash
+python -m pytest -sv ./transformers/tests/
+python -m pytest -sv ./examples/
+```
+
+### Do you want to run a Transformer model on a mobile device?
+
+You should check out our [`swift-coreml-transformers`](https://github.com/huggingface/swift-coreml-transformers) repo.
+
+It contains a set of tools to convert PyTorch or TensorFlow 2.0 trained Transformer models (currently contains `GPT-2`, `DistilGPT-2`, `BERT`, and `DistilBERT`) to CoreML models that run on iOS devices.
+
+At some point in the future, you'll be able to seamlessly move from pre-training or fine-tuning models to productizing them in CoreML, or prototype a model or an app in CoreML then research its hyperparameters or architecture from TensorFlow 2.0 and/or PyTorch. Super exciting!
+
+## Model architectures
+
+🤗 Transformers currently provides 8 NLU/NLG architectures:

 1. **[BERT](https://github.com/google-research/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
 2. **[GPT](https://github.com/openai/finetune-transformer-lm)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
@@ -13,80 +120,42 @@ The library currently contains PyTorch implementations, pre-trained model weight
 5. **[XLNet](https://github.com/zihangdai/xlnet/)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
 6. **[XLM](https://github.com/facebookresearch/XLM/)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
 7. **[RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/roberta)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
-8. **[DistilBERT](https://github.com/huggingface/pytorch-transformers/tree/master/examples/distillation)** (from HuggingFace), released together with the blogpost [Smaller, faster, cheaper, lighter: Introducing DistilBERT, a distilled version of BERT](https://medium.com/huggingface/distilbert-8cf3380435b5
-) by Victor Sanh, Lysandre Debut and Thomas Wolf.
+8. **[DistilBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/distillation).
+9. **[CTRL](https://github.com/salesforce/ctrl/)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.

-These implementations have been tested on several datasets (see the example scripts) and should match the performances of the original implementations (e.g. ~93 F1 on SQuAD for BERT Whole-Word-Masking, ~88 F1 on RocStories for OpenAI GPT, ~18.3 perplexity on WikiText 103 for Transformer-XL, ~0.916 Peason R coefficient on STS-B for XLNet). You can find more details on the performances in the Examples section of the [documentation](https://huggingface.co/pytorch-transformers/examples.html).
+These implementations have been tested on several datasets (see the example scripts) and should match the performances of the original implementations (e.g. ~93 F1 on SQuAD for BERT Whole-Word-Masking, ~88 F1 on RocStories for OpenAI GPT, ~18.3 perplexity on WikiText 103 for Transformer-XL, ~0.916 Peason R coefficient on STS-B for XLNet). You can find more details on the performances in the Examples section of the [documentation](https://huggingface.co/transformers/examples.html).

-| Section | Description |
-|-|-|
-| [Installation](#installation) | How to install the package |
-| [Quick tour: Usage](#quick-tour) | Tokenizers & models usage: Bert and GPT-2 |
-| [Quick tour: Fine-tuning/usage scripts](#quick-tour-of-the-fine-tuningusage-scripts) | Using provided scripts: GLUE, SQuAD and Text generation |
-| [Migrating from pytorch-pretrained-bert to pytorch-transformers](#Migrating-from-pytorch-pretrained-bert-to-pytorch-transformers) | Migrating your code from pytorch-pretrained-bert to pytorch-transformers |
-| [Documentation](https://huggingface.co/pytorch-transformers/) | Full API documentation and more |
+## Online demo

-## Installation
+**[Write With Transformer](https://transformer.huggingface.co)**, built by the Hugging Face team at transformer.huggingface.co, is the official demo of this repo’s text generation capabilities.
+You can use it to experiment with completions generated by `GPT2Model`, `TransfoXLModel`, and `XLNetModel`.

-This repo is tested on Python 2.7 and 3.5+ (examples are tested only on python 3.5+) and PyTorch 1.0.0+
-
-### With pip
-
-PyTorch-Transformers can be installed by pip as follows:
-
-```bash
-pip install pytorch-transformers
-```
-
-### From source
-
-Clone the repository and run:
-
-```bash
-pip install [--editable] .
-```
-
-### Tests
-
-A series of tests is included for the library and the example scripts. Library tests can be found in the [tests folder](https://github.com/huggingface/pytorch-transformers/tree/master/pytorch_transformers/tests) and examples tests in the [examples folder](https://github.com/huggingface/pytorch-transformers/tree/master/examples).
-
-These tests can be run using `pytest` (install pytest if needed with `pip install pytest`).
-
-You can run the tests from the root of the cloned repository with the commands:
-
-```bash
-python -m pytest -sv ./pytorch_transformers/tests/
-python -m pytest -sv ./examples/
-```
-
-### Do you want to run a Transformer model on a mobile device?
-
-You should check out our [`swift-coreml-transformers`](https://github.com/huggingface/swift-coreml-transformers) repo.
-
-It contains an example of a conversion script from a Pytorch trained Transformer model (here, `GPT-2`) to a CoreML model that runs on iOS devices.
-
-At some point in the future, you'll be able to seamlessly move from pre-training or fine-tuning models in PyTorch to productizing them in CoreML,
-or prototype a model or an app in CoreML then research its hyperparameters or architecture from PyTorch. Super exciting!
+> “🦄 Write with transformer is to writing what calculators are to calculus.”

+![write_with_transformer](https://transformer.huggingface.co/front/assets/thumbnail-large.png)

 ## Quick tour

-Let's do a very quick overview of PyTorch-Transformers. Detailed examples for each model architecture (Bert, GPT, GPT-2, Transformer-XL, XLNet and XLM) can be found in the [full documentation](https://huggingface.co/pytorch-transformers/).
+Let's do a very quick overview of the model architectures in 🤗 Transformers. Detailed examples for each model architecture (Bert, GPT, GPT-2, Transformer-XL, XLNet and XLM) can be found in the [full documentation](https://huggingface.co/transformers/).

 ```python
 import torch
-from pytorch_transformers import *
+from transformers import *

-# PyTorch-Transformers has a unified API
-# for 7 transformer architectures and 30 pretrained weights.
+# Transformers has a unified API
+# for 8 transformer architectures and 30 pretrained weights.
 #          Model          | Tokenizer          | Pretrained weights shortcut
-MODELS = [(BertModel,       BertTokenizer,      'bert-base-uncased'),
-          (OpenAIGPTModel,  OpenAIGPTTokenizer, 'openai-gpt'),
-          (GPT2Model,       GPT2Tokenizer,      'gpt2'),
-          (TransfoXLModel,  TransfoXLTokenizer, 'transfo-xl-wt103'),
-          (XLNetModel,      XLNetTokenizer,     'xlnet-base-cased'),
-          (XLMModel,        XLMTokenizer,       'xlm-mlm-enfr-1024'),
-          (RobertaModel,    RobertaTokenizer,   'roberta-base')]
+MODELS = [(BertModel,       BertTokenizer,       'bert-base-uncased'),
+          (OpenAIGPTModel,  OpenAIGPTTokenizer,  'openai-gpt'),
+          (GPT2Model,       GPT2Tokenizer,       'gpt2'),
+          (CTRLModel,       CTRLTokenizer,       'ctrl'),
+          (TransfoXLModel,  TransfoXLTokenizer,  'transfo-xl-wt103'),
+          (XLNetModel,      XLNetTokenizer,      'xlnet-base-cased'),
+          (XLMModel,        XLMTokenizer,        'xlm-mlm-enfr-1024'),
+          (DistilBertModel, DistilBertTokenizer, 'distilbert-base-uncased'),
+          (RobertaModel,    RobertaTokenizer,    'roberta-base')]
+
+# To use TensorFlow 2.0 versions of the models, simply prefix the class names with 'TF', e.g. `TFRobertaModel` is the TF 2.0 counterpart of the PyTorch model `RobertaModel`

 # Let's encode some text in a sequence of hidden-states using each model:
 for model_class, tokenizer_class, pretrained_weights in MODELS:
@@ -112,24 +181,71 @@ for model_class in BERT_MODEL_CLASSES:
    # Load pretrained model/tokenizer
    model = model_class.from_pretrained('bert-base-uncased')

-# Models can return full list of hidden-states & attentions weights at each layer
-model = model_class.from_pretrained(pretrained_weights,
-                                    output_hidden_states=True,
-                                    output_attentions=True)
-input_ids = torch.tensor([tokenizer.encode("Let's see all hidden-states and attentions on this text")])
-all_hidden_states, all_attentions = model(input_ids)[-2:]
+    # Models can return full list of hidden-states & attentions weights at each layer
+    model = model_class.from_pretrained(pretrained_weights,
+                                        output_hidden_states=True,
+                                        output_attentions=True)
+    input_ids = torch.tensor([tokenizer.encode("Let's see all hidden-states and attentions on this text")])
+    all_hidden_states, all_attentions = model(input_ids)[-2:]

-# Models are compatible with Torchscript
-model = model_class.from_pretrained(pretrained_weights, torchscript=True)
-traced_model = torch.jit.trace(model, (input_ids,))
+    # Models are compatible with Torchscript
+    model = model_class.from_pretrained(pretrained_weights, torchscript=True)
+    traced_model = torch.jit.trace(model, (input_ids,))

-# Simple serialization for models and tokenizers
-model.save_pretrained('./directory/to/save/')  # save
-model = model_class.from_pretrained('./directory/to/save/')  # re-load
-tokenizer.save_pretrained('./directory/to/save/')  # save
-tokenizer = tokenizer_class.from_pretrained('./directory/to/save/')  # re-load
+    # Simple serialization for models and tokenizers
+    model.save_pretrained('./directory/to/save/')  # save
+    model = model_class.from_pretrained('./directory/to/save/')  # re-load
+    tokenizer.save_pretrained('./directory/to/save/')  # save
+    tokenizer = BertTokenizer.from_pretrained('./directory/to/save/')  # re-load

-# SOTA examples for GLUE, SQUAD, text generation...
+    # SOTA examples for GLUE, SQUAD, text generation...
+```
+
+## Quick tour TF 2.0 training and PyTorch interoperability
+
+Let's do a quick example of how a TensorFlow 2.0 model can be trained in 12 lines of code with 🤗 Transformers and then loaded in PyTorch for fast inspection/tests.
+
+```python
+import tensorflow as tf
+import tensorflow_datasets
+from transformers import *
+
+# Load dataset, tokenizer, model from pretrained model/vocabulary
+tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
+model = TFBertForSequenceClassification.from_pretrained('bert-base-cased')
+data = tensorflow_datasets.load('glue/mrpc')
+
+# Prepare dataset for GLUE as a tf.data.Dataset instance
+train_dataset = glue_convert_examples_to_features(data['train'], tokenizer, max_length=128, task='mrpc')
+valid_dataset = glue_convert_examples_to_features(data['validation'], tokenizer, max_length=128, task='mrpc')
+train_dataset = train_dataset.shuffle(100).batch(32).repeat(2)
+valid_dataset = valid_dataset.batch(64)
+
+# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule 
+optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
+loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
+metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
+model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
+
+# Train and evaluate using tf.keras.Model.fit()
+history = model.fit(train_dataset, epochs=2, steps_per_epoch=115,
+                    validation_data=valid_dataset, validation_steps=7)
+
+# Load the TensorFlow model in PyTorch for inspection
+model.save_pretrained('./save/')
+pytorch_model = BertForSequenceClassification.from_pretrained('./save/', from_tf=True)
+
+# Quickly test a few predictions - MRPC is a paraphrasing task, let's see if our model learned the task
+sentence_0 = "This research was consistent with his findings."
+sentence_1 = "His findings were compatible with this research."
+sentence_2 = "His findings were not compatible with this research."
+inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors='pt')
+inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors='pt')
+
+pred_1 = pytorch_model(**inputs_1)[0].argmax().item()
+pred_2 = pytorch_model(**inputs_2)[0].argmax().item()
+print("sentence_1 is", "a paraphrase" if pred_1 else "not a paraphrase", "of sentence_0")
+print("sentence_2 is", "a paraphrase" if pred_2 else "not a paraphrase", "of sentence_0")
 ```

 ## Quick tour of the fine-tuning/usage scripts
@@ -138,7 +254,7 @@ The library comprises several example scripts with SOTA performances for NLU and

 - `run_glue.py`: an example fine-tuning Bert, XLNet and XLM on nine different GLUE tasks (*sequence-level classification*)
 - `run_squad.py`: an example fine-tuning Bert, XLNet and XLM on the question answering dataset SQuAD 2.0 (*token-level classification*)
- `run_generation.py`: an example using GPT, GPT-2, Transformer-XL and XLNet for conditional language generation
+- `run_generation.py`: an example using GPT, GPT-2, CTRL, Transformer-XL and XLNet for conditional language generation
 - other model-specific examples (see the documentation).

 Here are three quick usage examples for these scripts:
@@ -276,10 +392,10 @@ python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json ../models/wwm_uncase

 This is the model provided as `bert-large-uncased-whole-word-masking-finetuned-squad`.

-### `run_generation.py`: Text generation with GPT, GPT-2, Transformer-XL and XLNet
+### `run_generation.py`: Text generation with GPT, GPT-2, CTRL, Transformer-XL and XLNet

 A conditional generation script is also included to generate text from a prompt.
-The generation script includes the [tricks](https://github.com/rusiaaman/XLNet-gen#methodology) proposed by by Aman Rusia to get high quality generation with memory models like Transformer-XL and XLNet (include a predefined text to make short inputs longer).
+The generation script includes the [tricks](https://github.com/rusiaaman/XLNet-gen#methodology) proposed by Aman Rusia to get high-quality generation with memory models like Transformer-XL and XLNet (include a predefined text to make short inputs longer).

 Here is how to run the script with the small version of OpenAI GPT-2 model:

@@ -290,19 +406,42 @@ python ./examples/run_generation.py \
    --model_name_or_path=gpt2 \
 ```

-## Migrating from pytorch-pretrained-bert to pytorch-transformers
+and from the Salesforce CTRL model: 
+```shell
+python ./examples/run_generation.py \
+    --model_type=ctrl \
+    --length=20 \
+    --model_name_or_path=gpt2 \
+    --temperature=0 \
+    --repetition_penalty=1.2 \
+```

-Here is a quick summary of what you should take care of when migrating from `pytorch-pretrained-bert` to `pytorch-transformers`
+## Migrating from pytorch-transformers to transformers
+
+Here is a quick summary of what you should take care of when migrating from `pytorch-transformers` to `transformers`.
+
+### Positional order of some models' keywords inputs (`attention_mask`, `token_type_ids`...) changed
+
+To be able to use Torchscript (see #1010, #1204 and #1195) the specific order of some models **keywords inputs** (`attention_mask`, `token_type_ids`...) has been changed.
+
+If you used to call the models with keyword names for keyword arguments, e.g. `model(inputs_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)`, this should not cause any change.
+
+If you used to call the models with positional inputs for keyword arguments, e.g. `model(inputs_ids, attention_mask, token_type_ids)`, you may have to double check the exact order of input arguments.
+
+
+## Migrating from pytorch-pretrained-bert to transformers
+
+Here is a quick summary of what you should take care of when migrating from `pytorch-pretrained-bert` to `transformers`.

 ### Models always output `tuples`

-The main breaking change when migrating from `pytorch-pretrained-bert` to `pytorch-transformers` is that the models forward method always outputs a `tuple` with various elements depending on the model and the configuration parameters.
+The main breaking change when migrating from `pytorch-pretrained-bert` to `transformers` is that every model's forward method always outputs a `tuple` with various elements depending on the model and the configuration parameters.

-The exact content of the tuples for each model are detailed in the models' docstrings and the [documentation](https://huggingface.co/pytorch-transformers/).
+The exact content of the tuples for each model is detailed in the models' docstrings and the [documentation](https://huggingface.co/transformers/).

 In pretty much every case, you will be fine by taking the first element of the output as the output you previously used in `pytorch-pretrained-bert`.

-Here is a `pytorch-pretrained-bert` to `pytorch-transformers` conversion example for a `BertForSequenceClassification` classification model:
+Here is a `pytorch-pretrained-bert` to `transformers` conversion example for a `BertForSequenceClassification` classification model:

 ```python
 # Let's load our model
@@ -311,11 +450,11 @@ model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
 # If you used to have this line in pytorch-pretrained-bert:
 loss = model(input_ids, labels=labels)

-# Now just use this line in pytorch-transformers to extract the loss from the output tuple:
+# Now just use this line in transformers to extract the loss from the output tuple:
 outputs = model(input_ids, labels=labels)
 loss = outputs[0]

-# In pytorch-transformers you can also have access to the logits:
+# In transformers you can also have access to the logits:
 loss, logits = outputs[:2]

 # And even the attention weights if you configure the model to output them (and other outputs too, see the docstrings and documentation)
@@ -324,13 +463,17 @@ outputs = model(input_ids, labels=labels)
 loss, logits, attentions = outputs
 ```

+### Using hidden states
+
+By enabling the configuration option `output_hidden_states`, it was possible to retrieve the last hidden states of the encoder. In `pytorch-transformers` as well as `transformers` the return value has changed slightly: `all_hidden_states` now also includes the hidden state of the embeddings in addition to those of the encoding layers. This allows users to easily access the embeddings final state.
+
 ### Serialization

-Breaking change in the `from_pretrained()`method:
+Breaking change in the `from_pretrained()` method:

-1. Models are now set in evaluation mode by default when instantiated with the `from_pretrained()` method. To train them don't forget to set them back in training mode (`model.train()`) to activate the dropout modules.
+1. Models are now set in evaluation mode by default when instantiated with the `from_pretrained()` method. To train them, don't forget to set them back in training mode (`model.train()`) to activate the dropout modules.

-2. The additional `*input` and `**kwargs` arguments supplied to the `from_pretrained()` method used to be directly passed to the underlying model's class `__init__()` method. They are now used to update the model configuration attribute instead which can break derived model classes build based on the previous `BertForSequenceClassification` examples. We are working on a way to mitigate this breaking change in [#866](https://github.com/huggingface/pytorch-transformers/pull/866) by forwarding the the model `__init__()` method (i) the provided positional arguments and (ii) the keyword arguments which do not match any configuration class attributes.
+2. The additional `*input` and `**kwargs` arguments supplied to the `from_pretrained()` method used to be directly passed to the underlying model's class `__init__()` method. They are now used to update the model configuration attribute instead, which can break derived model classes built based on the previous `BertForSequenceClassification` examples. We are working on a way to mitigate this breaking change in [#866](https://github.com/huggingface/transformers/pull/866) by forwarding the the model's `__init__()` method (i) the provided positional arguments and (ii) the keyword arguments which do not match any configuration class attributes.

 Also, while not a breaking change, the serialization methods have been standardized and you probably should switch to the new method `save_pretrained(save_directory)` if you were using any other serialization method before.

@@ -387,7 +530,7 @@ for batch in train_data:
    loss.backward()
    optimizer.step()

-### In PyTorch-Transformers, optimizer and schedules are splitted and instantiated like this:
+### In Transformers, optimizer and schedules are splitted and instantiated like this:
 optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False)  # To reproduce BertAdam specific behavior set correct_bias=False
 scheduler = WarmupLinearSchedule(optimizer, warmup_steps=num_warmup_steps, t_total=num_total_steps)  # PyTorch scheduler
 ### and used like this:
@@ -402,4 +545,4 @@ for batch in train_data:

 ## Citation

-At the moment, there is no paper associated to PyTorch-Transformers but we are working on preparing one. In the meantime, please include a mention of the library and a link to the present repository if you use this work in a published or open-source project.
+At the moment, there is no paper associated with Transformers but we are working on preparing one. In the meantime, please include a mention of the library and a link to the present repository if you use this work in a published or open-source project.
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -2,6 +2,6 @@ FROM pytorch/pytorch:latest

 RUN git clone https://github.com/NVIDIA/apex.git && cd apex && python setup.py install --cuda_ext --cpp_ext

-RUN pip install pytorch_transformers
+RUN pip install transformers

 WORKDIR /workspace
--- a/docs/README.md
+++ b/docs/README.md
@@ -34,6 +34,13 @@ pip install recommonmark

 ## Building the documentation

+Make sure that there is a symlink from the `example` file (in /examples) inside the source folder. Run the following 
+command to generate it:
+
+```bash
+ln -s ../../examples/README.md examples.md
+```
+
 Once you have setup `sphinx`, you can build the documentation by running the following command in the `/docs` folder:

 ```bash
@@ -43,7 +50,7 @@ make html
 ---
 **NOTE**

-If you are adding/removing elements from the toc-tree or from any strutural item, it is recommended to clean the build
+If you are adding/removing elements from the toc-tree or from any structural item, it is recommended to clean the build
 directory before rebuilding. Run the following command to clean and build:

 ```bash
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -26,3 +26,7 @@ sphinxcontrib-jsmath==1.0.1
 sphinxcontrib-qthelp==1.0.2
 sphinxcontrib-serializinghtml==1.1.3
 urllib3==1.25.3
+sphinx-markdown-tables==0.0.9
+numpy==1.17.2
+tensorflow==2.0.0rc2
+torch==1.2.0
--- a/docs/source/_static/css/huggingface.css
+++ b/docs/source/_static/css/huggingface.css
@@ -1,5 +1,3 @@
-huggingface.css
-
 /* The literal code blocks */
 .rst-content tt.literal, .rst-content tt.literal, .rst-content code.literal {
    color: #6670FF;
@@ -44,11 +42,11 @@ huggingface.css
 /* The text items on the toc tree */
 .wy-menu-vertical a {
    color: #FFFFDD;
-    font-family: Calibre-Light;
+    font-family: Calibre-Light, sans-serif;
 }
 .wy-menu-vertical header, .wy-menu-vertical p.caption{
    color: white;
-    font-family: Calibre-Light;
+    font-family: Calibre-Light, sans-serif;
 }

 /* The color inside the selected toc tree block */
@@ -85,7 +83,7 @@ a {
    border-right: solid 2px #FB8D68;
    border-left: solid 2px #FB8D68;
    color: #FB8D68;
-    font-family: Calibre-Light;
+    font-family: Calibre-Light, sans-serif;
    border-top: none;
    font-style: normal !important;
 }
@@ -136,14 +134,14 @@ a {

 /* class and method names in doc */
 .rst-content dl:not(.docutils) tt.descname, .rst-content dl:not(.docutils) tt.descclassname, .rst-content dl:not(.docutils) tt.descname, .rst-content dl:not(.docutils) code.descname, .rst-content dl:not(.docutils) tt.descclassname, .rst-content dl:not(.docutils) code.descclassname{
-    font-family: Calibre;
+    font-family: Calibre, sans-serif;
    font-size: 20px !important;
 }

 /* class name in doc*/
 .rst-content dl:not(.docutils) tt.descname, .rst-content dl:not(.docutils) tt.descname, .rst-content dl:not(.docutils) code.descname{
    margin-right: 10px;
-    font-family: Calibre-Medium;
+    font-family: Calibre-Medium, sans-serif;
 }

 /* Method and class parameters */
@@ -160,17 +158,17 @@ a {

 /* FONTS */
 body{
-    font-family: Calibre;
+    font-family: Calibre, sans-serif;
    font-size: 16px;
 }

 h1 {
-    font-family: Calibre-Thin;
+    font-family: Calibre-Thin, sans-serif;
    font-size: 70px;
 }

 h2, .rst-content .toctree-wrapper p.caption, h3, h4, h5, h6, legend{
-    font-family: Calibre-Medium;
+    font-family: Calibre-Medium, sans-serif;
 }

@font-face {
@@ -196,4 +194,3 @@ h2, .rst-content .toctree-wrapper p.caption, h3, h4, h5, h6, legend{
    src: url(./Calibre-Thin.otf);
    font-weight:400;
 }
-
--- a/docs/source/_static/js/custom.js
+++ b/docs/source/_static/js/custom.js
--- a/docs/source/bertology.rst
+++ b/docs/source/bertology.rst
@@ -15,4 +15,4 @@ In order to help this new field develop, we have included a few additional featu
 * accessing all the attention weights for each head of BERT/GPT/GPT-2,
 * retrieving heads output values and gradients to be able to compute head importance score and prune head as explained in https://arxiv.org/abs/1905.10650.

-To help you understand and use these features, we have added a specific example script: `bertology.py <https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_bertology.py>`_ while extract information and prune a model pre-trained on GLUE.
+To help you understand and use these features, we have added a specific example script: `bertology.py <https://github.com/huggingface/transformers/blob/master/examples/run_bertology.py>`_ while extract information and prune a model pre-trained on GLUE.
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -19,14 +19,14 @@ sys.path.insert(0, os.path.abspath('../..'))

 # -- Project information -----------------------------------------------------

-project = u'pytorch-transformers'
+project = u'transformers'
 copyright = u'2019, huggingface'
 author = u'huggingface'

 # The short X.Y version
 version = u''
 # The full version, including alpha/beta/rc tags
-release = u'1.0.0'
+release = u'2.1.0'


 # -- General configuration ---------------------------------------------------
@@ -43,7 +43,8 @@ extensions = [
    'sphinx.ext.coverage',
    'sphinx.ext.napoleon',
    'recommonmark',
-    'sphinx.ext.viewcode'
+    'sphinx.ext.viewcode',
+    'sphinx_markdown_tables'
 ]

 # Add any paths that contain templates here, relative to this directory.
@@ -108,7 +109,7 @@ html_static_path = ['_static']
 # -- Options for HTMLHelp output ---------------------------------------------

 # Output file base name for HTML help builder.
-htmlhelp_basename = 'pytorch-transformersdoc'
+htmlhelp_basename = 'transformersdoc'


 # -- Options for LaTeX output ------------------------------------------------
@@ -135,7 +136,7 @@ latex_elements = {
 # (source start file, target name, title,
 #  author, documentclass [howto, manual, or own class]).
 latex_documents = [
-    (master_doc, 'pytorch-transformers.tex', u'pytorch-transformers Documentation',
+    (master_doc, 'transformers.tex', u'transformers Documentation',
     u'huggingface', 'manual'),
 ]

@@ -145,7 +146,7 @@ latex_documents = [
 # One entry per manual page. List of tuples
 # (source start file, name, description, authors, manual section).
 man_pages = [
-    (master_doc, 'pytorch-transformers', u'pytorch-transformers Documentation',
+    (master_doc, 'transformers', u'transformers Documentation',
     [author], 1)
 ]

@@ -156,8 +157,8 @@ man_pages = [
 # (source start file, target name, title, author,
 #  dir menu entry, description, category)
 texinfo_documents = [
-    (master_doc, 'pytorch-transformers', u'pytorch-transformers Documentation',
-     author, 'pytorch-transformers', 'One line description of project.',
+    (master_doc, 'transformers', u'transformers Documentation',
+     author, 'transformers', 'One line description of project.',
     'Miscellaneous'),
 ]

--- a/docs/source/converting_tensorflow_models.rst
+++ b/docs/source/converting_tensorflow_models.rst
@@ -6,7 +6,7 @@ A command-line interface is provided to convert original Bert/GPT/GPT-2/Transfor
 BERT
 ^^^^

-You can convert any TensorFlow checkpoint for BERT (in particular `the pre-trained models released by Google <https://github.com/google-research/bert#pre-trained-models>`_\ ) in a PyTorch save file by using the `convert_tf_checkpoint_to_pytorch.py <https://github.com/huggingface/pytorch-transformers/blob/master/pytorch_transformers/convert_tf_checkpoint_to_pytorch.py>`_ script.
+You can convert any TensorFlow checkpoint for BERT (in particular `the pre-trained models released by Google <https://github.com/google-research/bert#pre-trained-models>`_\ ) in a PyTorch save file by using the `convert_tf_checkpoint_to_pytorch.py <https://github.com/huggingface/transformers/blob/master/transformers/convert_tf_checkpoint_to_pytorch.py>`_ script.

 This CLI takes as input a TensorFlow checkpoint (three files starting with ``bert_model.ckpt``\ ) and the associated configuration file (\ ``bert_config.json``\ ), and creates a PyTorch model for this configuration, loads the weights from the TensorFlow checkpoint in the PyTorch model and saves the resulting model in a standard PyTorch save file that can be imported using ``torch.load()`` (see examples in `run_bert_extract_features.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_extract_features.py>`_\ , `run_bert_classifier.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_classifier.py>`_ and `run_bert_squad.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_squad.py>`_\ ).

@@ -20,7 +20,7 @@ Here is an example of the conversion process for a pre-trained ``BERT-Base Uncas

   export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12

-   pytorch_transformers bert \
+   transformers bert \
     $BERT_BASE_DIR/bert_model.ckpt \
     $BERT_BASE_DIR/bert_config.json \
     $BERT_BASE_DIR/pytorch_model.bin
@@ -36,7 +36,7 @@ Here is an example of the conversion process for a pre-trained OpenAI GPT model,

   export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights

-   pytorch_transformers gpt \
+   transformers gpt \
     $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \
     $PYTORCH_DUMP_OUTPUT \
     [OPENAI_GPT_CONFIG]
@@ -50,7 +50,7 @@ Here is an example of the conversion process for a pre-trained OpenAI GPT-2 mode

   export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/gpt2/pretrained/weights

-   pytorch_transformers gpt2 \
+   transformers gpt2 \
     $OPENAI_GPT2_CHECKPOINT_PATH \
     $PYTORCH_DUMP_OUTPUT \
     [OPENAI_GPT2_CONFIG]
@@ -64,7 +64,7 @@ Here is an example of the conversion process for a pre-trained Transformer-XL mo

   export TRANSFO_XL_CHECKPOINT_FOLDER_PATH=/path/to/transfo/xl/checkpoint

-   pytorch_transformers transfo_xl \
+   transformers transfo_xl \
     $TRANSFO_XL_CHECKPOINT_FOLDER_PATH \
     $PYTORCH_DUMP_OUTPUT \
     [TRANSFO_XL_CONFIG]
@@ -80,7 +80,7 @@ Here is an example of the conversion process for a pre-trained XLNet model, fine
   export TRANSFO_XL_CHECKPOINT_PATH=/path/to/xlnet/checkpoint
   export TRANSFO_XL_CONFIG_PATH=/path/to/xlnet/config

-   pytorch_transformers xlnet \
+   transformers xlnet \
     $TRANSFO_XL_CHECKPOINT_PATH \
     $TRANSFO_XL_CONFIG_PATH \
     $PYTORCH_DUMP_OUTPUT \
@@ -96,6 +96,6 @@ Here is an example of the conversion process for a pre-trained XLM model:

   export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint

-   pytorch_transformers xlm \
+   transformers xlm \
     $XLM_CHECKPOINT_PATH \
     $PYTORCH_DUMP_OUTPUT \
--- a/docs/source/examples.md
+++ b/docs/source/examples.md
@@ -0,0 +1 @@
+../../examples/README.md
--- a/docs/source/examples.rst
+++ b/docs/source/examples.rst
@@ -1,686 +0,0 @@
-examples.rst
-
-Examples
-================================================
-
-.. list-table::
-   :header-rows: 1
-
-   * - Sub-section
-     - Description
-   * - `Training large models: introduction, tools and examples <#introduction>`_
-     - How to use gradient-accumulation, multi-gpu training, distributed training, optimize on CPU and 16-bits training to train Bert models
-   * - `Fine-tuning with BERT: running the examples <#fine-tuning-bert-examples>`_
-     - Running the examples in `examples <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples>`_\ : ``extract_classif.py``\ , ``run_bert_classifier.py``\ , ``run_bert_squad.py`` and ``run_lm_finetuning.py``
-   * - `Fine-tuning with OpenAI GPT, Transformer-XL, GPT-2 as well as BERT and RoBERTa <#fine-tuning>`_
-     - Running the examples in `examples <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples>`_\ : ``run_openai_gpt.py``\ , ``run_transfo_xl.py``, ``run_gpt2.py`` and ``run_lm_finetuning.py``
-   * - `Fine-tuning BERT-large on GPUs <#fine-tuning-bert-large>`_
-     - How to fine tune ``BERT large``
-
-
-.. _introduction:
-
-Training large models: introduction, tools and examples
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-BERT-base and BERT-large are respectively 110M and 340M parameters models and it can be difficult to fine-tune them on a single GPU with the recommended batch size for good performance (in most case a batch size of 32).
-
-To help with fine-tuning these models, we have included several techniques that you can activate in the fine-tuning scripts `run_bert_classifier.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_classifier.py>`_ and `run_bert_squad.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_squad.py>`_\ : gradient-accumulation, multi-gpu training, distributed training and 16-bits training . For more details on how to use these techniques you can read `the tips on training large batches in PyTorch <https://medium.com/huggingface/training-larger-batches-practical-tips-on-1-gpu-multi-gpu-distributed-setups-ec88c3e51255>`_ that I published earlier this year.
-
-Here is how to use these techniques in our scripts:
-
-
-* **Gradient Accumulation**\ : Gradient accumulation can be used by supplying a integer greater than 1 to the ``--gradient_accumulation_steps`` argument. The batch at each step will be divided by this integer and gradient will be accumulated over ``gradient_accumulation_steps`` steps.
-* **Multi-GPU**\ : Multi-GPU is automatically activated when several GPUs are detected and the batches are splitted over the GPUs.
-* **Distributed training**\ : Distributed training can be activated by supplying an integer greater or equal to 0 to the ``--local_rank`` argument (see below).
-* **16-bits training**\ : 16-bits training, also called mixed-precision training, can reduce the memory requirement of your model on the GPU by using half-precision training, basically allowing to double the batch size. If you have a recent GPU (starting from NVIDIA Volta architecture) you should see no decrease in speed. A good introduction to Mixed precision training can be found `here <https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/>`__ and a full documentation is `here <https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html>`__. In our scripts, this option can be activated by setting the ``--fp16`` flag and you can play with loss scaling using the ``--loss_scale`` flag (see the previously linked documentation for details on loss scaling). The loss scale can be zero in which case the scale is dynamically adjusted or a positive power of two in which case the scaling is static.
-
-To use 16-bits training and distributed training, you need to install NVIDIA's apex extension `as detailed here <https://github.com/nvidia/apex>`__. You will find more information regarding the internals of ``apex`` and how to use ``apex`` in `the doc and the associated repository <https://github.com/nvidia/apex>`_. The results of the tests performed on pytorch-BERT by the NVIDIA team (and my trials at reproducing them) can be consulted in `the relevant PR of the present repository <https://github.com/huggingface/pytorch-pretrained-BERT/pull/116>`_.
-
-Note: To use *Distributed Training*\ , you will need to run one training script on each of your machines. This can be done for example by running the following command on each server (see `the above mentioned blog post <https://medium.com/huggingface/training-larger-batches-practical-tips-on-1-gpu-multi-gpu-distributed-setups-ec88c3e51255>`_\ ) for more details):
-
-.. code-block:: bash
-
-    python -m torch.distributed.launch \
-        --nproc_per_node=4 \
-        --nnodes=2 \
-        --node_rank=$THIS_MACHINE_INDEX \
-        --master_addr="192.168.1.1" \
-        --master_port=1234 run_bert_classifier.py \
-        (--arg1 --arg2 --arg3 and all other arguments of the run_classifier script)
-
-Where ``$THIS_MACHINE_INDEX`` is an sequential index assigned to each of your machine (0, 1, 2...) and the machine with rank 0 has an IP address ``192.168.1.1`` and an open port ``1234``.
-
-.. _fine-tuning-bert-examples:
-
-Fine-tuning with BERT: running the examples
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-We showcase several fine-tuning examples based on (and extended from) `the original implementation <https://github.com/google-research/bert/>`_\ :
-
-
-* a *sequence-level classifier* on nine different GLUE tasks,
-* a *token-level classifier* on the question answering dataset SQuAD, and
-* a *sequence-level multiple-choice classifier* on the SWAG classification corpus.
-* a *BERT language model* on another target corpus
-
-GLUE results on dev set
-~~~~~~~~~~~~~~~~~~~~~~~
-
-We get the following results on the dev set of GLUE benchmark with an uncased BERT base
-model (`bert-base-uncased`). All experiments ran on 8 V100 GPUs with a total train batch size of 24. Some of 
-these tasks have a small dataset and training can lead to high variance in the results between different runs.
-We report the median on 5 runs (with different seeds) for each of the metrics.
-
-.. list-table::
-   :header-rows: 1
-
-   * - Task
-     - Metric
-     - Result
-   * - CoLA
-     - Matthew's corr.
-     - 55.75
-   * - SST-2
-     - accuracy
-     - 92.09
-   * - MRPC
-     - F1/accuracy
-     - 90.48/86.27
-   * - STS-B
-     - Pearson/Spearman corr.
-     - 89.03/88.64
-   * - QQP
-     - accuracy/F1
-     - 90.92/87.72
-   * - MNLI
-     - matched acc./mismatched acc.
-     - 83.74/84.06
-   * - QNLI
-     - accuracy
-     - 91.07
-   * - RTE
-     - accuracy
-     - 68.59
-   * - WNLI
-     - accuracy
-     - 43.66
-
-
-Some of these results are significantly different from the ones reported on the test set
-of GLUE benchmark on the website. For QQP and WNLI, please refer to `FAQ #12 <https://gluebenchmark.com/faq>`_ on the webite.
-
-Before running anyone of these GLUE tasks you should download the
-`GLUE data <https://gluebenchmark.com/tasks>`_ by running
-`this script <https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e>`_
-and unpack it to some directory ``$GLUE_DIR``.
-
-.. code-block:: shell
-
-   export GLUE_DIR=/path/to/glue
-   export TASK_NAME=MRPC
-
-   python run_bert_classifier.py \
-     --task_name $TASK_NAME \
-     --do_train \
-     --do_eval \
-     --do_lower_case \
-     --data_dir $GLUE_DIR/$TASK_NAME \
-     --bert_model bert-base-uncased \
-     --max_seq_length 128 \
-     --train_batch_size 32 \
-     --learning_rate 2e-5 \
-     --num_train_epochs 3.0 \
-     --output_dir /tmp/$TASK_NAME/
-
-where task name can be one of CoLA, SST-2, MRPC, STS-B, QQP, MNLI, QNLI, RTE, WNLI.
-
-The dev set results will be present within the text file 'eval_results.txt' in the specified output_dir. In case of MNLI, since there are two separate dev sets, matched and mismatched, there will be a separate output folder called '/tmp/MNLI-MM/' in addition to '/tmp/MNLI/'.
-
-The code has not been tested with half-precision training with apex on any GLUE task apart from MRPC, MNLI, CoLA, SST-2. The following section provides details on how to run half-precision training with MRPC. With that being said, there shouldn't be any issues in running half-precision training with the remaining GLUE tasks as well, since the data processor for each task inherits from the base class DataProcessor.
-
-MRPC
-~~~~
-
-This example code fine-tunes BERT on the Microsoft Research Paraphrase
-Corpus (MRPC) corpus and runs in less than 10 minutes on a single K-80 and in 27 seconds (!) on single tesla V100 16GB with apex installed.
-
-Before running this example you should download the
-`GLUE data <https://gluebenchmark.com/tasks>`_ by running
-`this script <https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e>`_
-and unpack it to some directory ``$GLUE_DIR``.
-
-.. code-block:: shell
-
-   export GLUE_DIR=/path/to/glue
-
-   python run_bert_classifier.py \
-     --task_name MRPC \
-     --do_train \
-     --do_eval \
-     --do_lower_case \
-     --data_dir $GLUE_DIR/MRPC/ \
-     --bert_model bert-base-uncased \
-     --max_seq_length 128 \
-     --train_batch_size 32 \
-     --learning_rate 2e-5 \
-     --num_train_epochs 3.0 \
-     --output_dir /tmp/mrpc_output/
-
-Our test ran on a few seeds with `the original implementation hyper-parameters <https://github.com/google-research/bert#sentence-and-sentence-pair-classification-tasks>`__ gave evaluation results between 84% and 88%.
-
-**Fast run with apex and 16 bit precision: fine-tuning on MRPC in 27 seconds!**
-First install apex as indicated `here <https://github.com/NVIDIA/apex>`__.
-Then run
-
-.. code-block:: shell
-
-   export GLUE_DIR=/path/to/glue
-
-   python run_bert_classifier.py \
-     --task_name MRPC \
-     --do_train \
-     --do_eval \
-     --do_lower_case \
-     --data_dir $GLUE_DIR/MRPC/ \
-     --bert_model bert-base-uncased \
-     --max_seq_length 128 \
-     --train_batch_size 32 \
-     --learning_rate 2e-5 \
-     --num_train_epochs 3.0 \
-     --output_dir /tmp/mrpc_output/ \
-     --fp16
-
-**Distributed training**
-Here is an example using distributed training on 8 V100 GPUs and Bert Whole Word Masking model to reach a F1 > 92 on MRPC:
-
-.. code-block:: bash
-
-    python -m torch.distributed.launch \
-        --nproc_per_node 8 run_bert_classifier.py \
-        --bert_model bert-large-uncased-whole-word-masking \
-        --task_name MRPC \
-        --do_train \
-        --do_eval \
-        --do_lower_case \
-        --data_dir $GLUE_DIR/MRPC/ \
-        --max_seq_length 128 \
-        --train_batch_size 8 \
-        --learning_rate 2e-5 \
-        --num_train_epochs 3.0 \
-         --output_dir /tmp/mrpc_output/
-
-Training with these hyper-parameters gave us the following results:
-
-.. code-block:: bash
-
-     acc = 0.8823529411764706
-     acc_and_f1 = 0.901702786377709
-     eval_loss = 0.3418912578906332
-     f1 = 0.9210526315789473
-     global_step = 174
-     loss = 0.07231863956341798
-
-Here is an example on MNLI:
-
-.. code-block:: bash
-
-    python -m torch.distributed.launch \
-        --nproc_per_node 8 run_bert_classifier.py \
-        --bert_model bert-large-uncased-whole-word-masking \
-        --task_name mnli \
-        --do_train \
-        --do_eval \
-        --do_lower_case \
-        --data_dir /datadrive/bert_data/glue_data//MNLI/ \
-        --max_seq_length 128 \
-        --train_batch_size 8 \
-        --learning_rate 2e-5 \
-        --num_train_epochs 3.0 \
-        --output_dir ../models/wwm-uncased-finetuned-mnli/ \
-        --overwrite_output_dir
-
-.. code-block:: bash
-
-   ***** Eval results *****
-     acc = 0.8679706601466992
-     eval_loss = 0.4911287787382479
-     global_step = 18408
-     loss = 0.04755385363816904
-
-   ***** Eval results *****
-     acc = 0.8747965825874695
-     eval_loss = 0.45516540421714036
-     global_step = 18408
-     loss = 0.04755385363816904
-
-This is the example of the ``bert-large-uncased-whole-word-masking-finetuned-mnli`` model
-
-SQuAD
-~~~~~
-
-This example code fine-tunes BERT on the SQuAD dataset. It runs in 24 min (with BERT-base) or 68 min (with BERT-large) on a single tesla V100 16GB.
-
-The data for SQuAD can be downloaded with the following links and should be saved in a ``$SQUAD_DIR`` directory.
-
-
-* `train-v1.1.json <https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json>`_
-* `dev-v1.1.json <https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json>`_
-* `evaluate-v1.1.py <https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py>`_
-
-.. code-block:: shell
-
-   export SQUAD_DIR=/path/to/SQUAD
-
-   python run_bert_squad.py \
-     --bert_model bert-base-uncased \
-     --do_train \
-     --do_predict \
-     --do_lower_case \
-     --train_file $SQUAD_DIR/train-v1.1.json \
-     --predict_file $SQUAD_DIR/dev-v1.1.json \
-     --train_batch_size 12 \
-     --learning_rate 3e-5 \
-     --num_train_epochs 2.0 \
-     --max_seq_length 384 \
-     --doc_stride 128 \
-     --output_dir /tmp/debug_squad/
-
-Training with the previous hyper-parameters gave us the following results:
-
-.. code-block:: bash
-
-   python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json /tmp/debug_squad/predictions.json
-   {"f1": 88.52381567990474, "exact_match": 81.22043519394512}
-
-**distributed training**
-
-Here is an example using distributed training on 8 V100 GPUs and Bert Whole Word Masking uncased model to reach a F1 > 93 on SQuAD:
-
-.. code-block:: bash
-
-   python -m torch.distributed.launch --nproc_per_node=8 \
-    run_bert_squad.py \
-    --bert_model bert-large-uncased-whole-word-masking  \
-    --do_train \
-    --do_predict \
-    --do_lower_case \
-    --train_file $SQUAD_DIR/train-v1.1.json \
-    --predict_file $SQUAD_DIR/dev-v1.1.json \
-    --learning_rate 3e-5 \
-    --num_train_epochs 2 \
-    --max_seq_length 384 \
-    --doc_stride 128 \
-    --output_dir ../models/wwm_uncased_finetuned_squad/ \
-    --train_batch_size 24 \
-    --gradient_accumulation_steps 12
-
-Training with these hyper-parameters gave us the following results:
-
-.. code-block:: bash
-
-   python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json ../models/wwm_uncased_finetuned_squad/predictions.json
-   {"exact_match": 86.91579943235573, "f1": 93.1532499015869}
-
-This is the model provided as ``bert-large-uncased-whole-word-masking-finetuned-squad``.
-
-And here is the model provided as ``bert-large-cased-whole-word-masking-finetuned-squad``\ :
-
-.. code-block:: bash
-
-    python -m torch.distributed.launch --nproc_per_node=8  run_bert_squad.py \
-        --bert_model bert-large-cased-whole-word-masking \
-        --do_train \
-        --do_predict \
-        --do_lower_case \
-        --train_file $SQUAD_DIR/train-v1.1.json \
-        --predict_file $SQUAD_DIR/dev-v1.1.json \
-        --learning_rate 3e-5 \
-        --num_train_epochs 2 \
-        --max_seq_length 384 \
-        --doc_stride 128 \
-        --output_dir ../models/wwm_cased_finetuned_squad/ \
-        --train_batch_size 24 \
-        --gradient_accumulation_steps 12
-
-Training with these hyper-parameters gave us the following results:
-
-.. code-block:: bash
-
-   python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json ../models/wwm_uncased_finetuned_squad/predictions.json
-   {"exact_match": 84.18164616840113, "f1": 91.58645594850135}
-
-SWAG
-~~~~
-
-The data for SWAG can be downloaded by cloning the following `repository <https://github.com/rowanz/swagaf>`_
-
-.. code-block:: shell
-
-   export SWAG_DIR=/path/to/SWAG
-
-   python run_bert_swag.py \
-     --bert_model bert-base-uncased \
-     --do_train \
-     --do_lower_case \
-     --do_eval \
-     --data_dir $SWAG_DIR/data \
-     --train_batch_size 16 \
-     --learning_rate 2e-5 \
-     --num_train_epochs 3.0 \
-     --max_seq_length 80 \
-     --output_dir /tmp/swag_output/ \
-     --gradient_accumulation_steps 4
-
-Training with the previous hyper-parameters on a single GPU gave us the following results:
-
-.. code-block::
-
-   eval_accuracy = 0.8062081375587323
-   eval_loss = 0.5966546792367169
-   global_step = 13788
-   loss = 0.06423990014260186
-
-LM Fine-tuning
-~~~~~~~~~~~~~~
-
-The data should be a text file in the same format as `sample_text.txt <./pytorch_transformers/tests/fixtures/sample_text.txt/sample_text.txt>`_  (one sentence per line, docs separated by empty line).
-You can download an `exemplary training corpus <https://ext-bert-sample.obs.eu-de.otc.t-systems.com/small_wiki_sentence_corpus.txt>`_ generated from wikipedia articles and split into ~500k sentences with spaCy.
-Training one epoch on this corpus takes about 1:20h on 4 x NVIDIA Tesla P100 with ``train_batch_size=200`` and ``max_seq_length=128``\ :
-
-Thank to the work of @Rocketknight1 and @tholor there are now **several scripts** that can be used to fine-tune BERT using the pretraining objective (combination of masked-language modeling and next sentence prediction loss). These scripts are detailed in the `README <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/lm_finetuning/README.md>`_ of the `examples/lm_finetuning/ <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/lm_finetuning/>`_ folder.
-
-.. _fine-tuning:
-
-OpenAI GPT, Transformer-XL and GPT-2: running the examples
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-We provide three examples of scripts for OpenAI GPT, Transformer-XL, OpenAI GPT-2, BERT and RoBERTa based on (and extended from) the respective original implementations:
-
-
-* fine-tuning OpenAI GPT on the ROCStories dataset
-* evaluating Transformer-XL on Wikitext 103
-* unconditional and conditional generation from a pre-trained OpenAI GPT-2 model
-* fine-tuning GPT/GPT-2 on a causal language modeling task and BERT/RoBERTa on a masked language modeling task
-
-Fine-tuning OpenAI GPT on the RocStories dataset
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-This example code fine-tunes OpenAI GPT on the RocStories dataset.
-
-Before running this example you should download the
-`RocStories dataset <https://github.com/snigdhac/StoryComprehension_EMNLP/tree/master/Dataset/RoCStories>`_ and unpack it to some directory ``$ROC_STORIES_DIR``.
-
-.. code-block:: shell
-
-   export ROC_STORIES_DIR=/path/to/RocStories
-
-   python run_openai_gpt.py \
-     --model_name openai-gpt \
-     --do_train \
-     --do_eval \
-     --train_dataset $ROC_STORIES_DIR/cloze_test_val__spring2016\ -\ cloze_test_ALL_val.csv \
-     --eval_dataset $ROC_STORIES_DIR/cloze_test_test__spring2016\ -\ cloze_test_ALL_test.csv \
-     --output_dir ../log \
-     --train_batch_size 16 \
-
-This command runs in about 10 min on a single K-80 an gives an evaluation accuracy of about 87.7% (the authors report a median accuracy with the TensorFlow code of 85.8% and the OpenAI GPT paper reports a best single run accuracy of 86.5%).
-
-Evaluating the pre-trained Transformer-XL on the WikiText 103 dataset
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-This example code evaluate the pre-trained Transformer-XL on the WikiText 103 dataset.
-This command will download a pre-processed version of the WikiText 103 dataset in which the vocabulary has been computed.
-
-.. code-block:: shell
-
-   python run_transfo_xl.py --work_dir ../log
-
-This command runs in about 1 min on a V100 and gives an evaluation perplexity of 18.22 on WikiText-103 (the authors report a perplexity of about 18.3 on this dataset with the TensorFlow code).
-
-Unconditional and conditional generation from OpenAI's GPT-2 model
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-This example code is identical to the original unconditional and conditional generation codes.
-
-Conditional generation:
-
-.. code-block:: shell
-
-   python run_gpt2.py
-
-Unconditional generation:
-
-.. code-block:: shell
-
-   python run_gpt2.py --unconditional
-
-The same option as in the original scripts are provided, please refer to the code of the example and the original repository of OpenAI.
-
-
-Causal LM fine-tuning on GPT/GPT-2, Masked LM fine-tuning on BERT/RoBERTa
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Before running the following examples you should download the `WikiText-2 dataset <https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/>`__ and unpack it to some directory `$WIKITEXT_2_DATASET`
-The following results were obtained using the `raw` WikiText-2 (no tokens were replaced before the tokenization).
-
-This example fine-tunes GPT-2 on the WikiText-2 dataset. The loss function is a causal language modeling loss (perplexity).
-
-.. code-block:: bash
-
-
-    export WIKITEXT_2_DATASET=/path/to/wikitext_dataset
-
-    python run_lm_finetuning.py
-        --output_dir=output
-        --model_type=gpt2
-        --model_name_or_path=gpt2
-        --do_train
-        --train_data_file=$WIKITEXT_2_DATASET/wiki.train.raw
-        --do_eval
-        --eval_data_file=$WIKITEXT_2_DATASET/wiki.test.raw
-
-This takes about half an hour to train on a single K80 GPU and about one minute for the evaluation to run.
-It reaches a score of about 20 perplexity once fine-tuned on the dataset.
-
-This example fine-tunes RoBERTa on the WikiText-2 dataset. The loss function is a masked language modeling loss (masked perplexity).
-The `--mlm` flag is necessary to fine-tune BERT/RoBERTa on masked language modeling.
-
-.. code-block:: bash
-
-
-    export WIKITEXT_2_DATASET=/path/to/wikitext_dataset
-
-    python run_lm_finetuning.py
-        --output_dir=output
-        --model_type=roberta
-        --model_name_or_path=roberta-base
-        --do_train
-        --train_data_file=$WIKITEXT_2_DATASET/wiki.train.raw
-        --do_eval
-        --eval_data_file=$WIKITEXT_2_DATASET/wiki.test.raw
-        --mlm
-
-.. _fine-tuning-BERT-large:
-
-Fine-tuning BERT-large on GPUs
------------------------------
-
-The options we list above allow to fine-tune BERT-large rather easily on GPU(s) instead of the TPU used by the original implementation.
-
-For example, fine-tuning BERT-large on SQuAD can be done on a server with 4 k-80 (these are pretty old now) in 18 hours. Our results are similar to the TensorFlow implementation results (actually slightly higher):
-
-.. code-block:: bash
-
-   {"exact_match": 84.56953642384106, "f1": 91.04028647786927}
-
-To get these results we used a combination of:
-
-
-* multi-GPU training (automatically activated on a multi-GPU server),
-* 2 steps of gradient accumulation and
-* perform the optimization step on CPU to store Adam's averages in RAM.
-
-Here is the full list of hyper-parameters for this run:
-
-.. code-block:: bash
-
-   export SQUAD_DIR=/path/to/SQUAD
-
-   python ./run_bert_squad.py \
-     --bert_model bert-large-uncased \
-     --do_train \
-     --do_predict \
-     --do_lower_case \
-     --train_file $SQUAD_DIR/train-v1.1.json \
-     --predict_file $SQUAD_DIR/dev-v1.1.json \
-     --learning_rate 3e-5 \
-     --num_train_epochs 2 \
-     --max_seq_length 384 \
-     --doc_stride 128 \
-     --output_dir /tmp/debug_squad/ \
-     --train_batch_size 24 \
-     --gradient_accumulation_steps 2
-
-If you have a recent GPU (starting from NVIDIA Volta series), you should try **16-bit fine-tuning** (FP16).
-
-Here is an example of hyper-parameters for a FP16 run we tried:
-
-.. code-block:: bash
-
-   export SQUAD_DIR=/path/to/SQUAD
-
-   python ./run_bert_squad.py \
-     --bert_model bert-large-uncased \
-     --do_train \
-     --do_predict \
-     --do_lower_case \
-     --train_file $SQUAD_DIR/train-v1.1.json \
-     --predict_file $SQUAD_DIR/dev-v1.1.json \
-     --learning_rate 3e-5 \
-     --num_train_epochs 2 \
-     --max_seq_length 384 \
-     --doc_stride 128 \
-     --output_dir /tmp/debug_squad/ \
-     --train_batch_size 24 \
-     --fp16 \
-     --loss_scale 128
-
-The results were similar to the above FP32 results (actually slightly higher):
-
-.. code-block:: bash
-
-   {"exact_match": 84.65468306527909, "f1": 91.238669287002}
-
-Here is an example with the recent ``bert-large-uncased-whole-word-masking``\ :
-
-.. code-block:: bash
-
-   python -m torch.distributed.launch --nproc_per_node=8 \
-     run_bert_squad.py \
-     --bert_model bert-large-uncased-whole-word-masking \
-     --do_train \
-     --do_predict \
-     --do_lower_case \
-     --train_file $SQUAD_DIR/train-v1.1.json \
-     --predict_file $SQUAD_DIR/dev-v1.1.json \
-     --learning_rate 3e-5 \
-     --num_train_epochs 2 \
-     --max_seq_length 384 \
-     --doc_stride 128 \
-     --output_dir /tmp/debug_squad/ \
-     --train_batch_size 24 \
-     --gradient_accumulation_steps 2
-
-Fine-tuning XLNet
-----------------
-
-STS-B
-~~~~~
-
-This example code fine-tunes XLNet on the STS-B corpus.
-
-Before running this example you should download the
-`GLUE data <https://gluebenchmark.com/tasks>`_ by running
-`this script <https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e>`_
-and unpack it to some directory ``$GLUE_DIR``.
-
-.. code-block:: shell
-
-   export GLUE_DIR=/path/to/glue
-
-   python run_xlnet_classifier.py \
-    --task_name STS-B \
-    --do_train \
-    --do_eval \
-    --data_dir $GLUE_DIR/STS-B/ \
-    --max_seq_length 128 \
-    --train_batch_size 8 \
-    --gradient_accumulation_steps 1 \
-    --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --output_dir /tmp/mrpc_output/
-
-Our test ran on a few seeds with `the original implementation hyper-parameters <https://github.com/zihangdai/xlnet#1-sts-b-sentence-pair-relevance-regression-with-gpus>`__ gave evaluation results between 84% and 88%.
-
-**Distributed training**
-Here is an example using distributed training on 8 V100 GPUs to reach XXXX:
-
-.. code-block:: bash
-
-   python -m torch.distributed.launch --nproc_per_node 8 \
-    run_xlnet_classifier.py \
-    --task_name STS-B \
-    --do_train \
-    --do_eval \
-    --data_dir $GLUE_DIR/STS-B/ \
-    --max_seq_length 128 \
-    --train_batch_size 8 \
-    --gradient_accumulation_steps 1 \
-    --learning_rate 5e-5 \
-    --num_train_epochs 3.0 \
-    --output_dir /tmp/mrpc_output/
-
-Training with these hyper-parameters gave us the following results:
-
-.. code-block:: bash
-
-     acc = 0.8823529411764706
-     acc_and_f1 = 0.901702786377709
-     eval_loss = 0.3418912578906332
-     f1 = 0.9210526315789473
-     global_step = 174
-     loss = 0.07231863956341798
-
-Here is an example on MNLI:
-
-.. code-block:: bash
-
-    python -m torch.distributed.launch --nproc_per_node 8 run_bert_classifier.py \
-        --bert_model bert-large-uncased-whole-word-masking \
-        --task_name mnli \
-        --do_train \
-        --do_eval \
-        --data_dir /datadrive/bert_data/glue_data//MNLI/ \
-        --max_seq_length 128 \
-        --train_batch_size 8 \
-        --learning_rate 2e-5 \
-        --num_train_epochs 3.0 \
-        --output_dir ../models/wwm-uncased-finetuned-mnli/ \
-        --overwrite_output_dir
-
-.. code-block:: bash
-
-   ***** Eval results *****
-     acc = 0.8679706601466992
-     eval_loss = 0.4911287787382479
-     global_step = 18408
-     loss = 0.04755385363816904
-
-   ***** Eval results *****
-     acc = 0.8747965825874695
-     eval_loss = 0.45516540421714036
-     global_step = 18408
-     loss = 0.04755385363816904
-
-This is the example of the ``bert-large-uncased-whole-word-masking-finetuned-mnli`` model.
--- a/docs/source/imgs/transformers_logo_name.png
+++ b/docs/source/imgs/transformers_logo_name.png
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -1,9 +1,43 @@
-Pytorch-Transformers
+Transformers
 ================================================================================================================================================

-PyTorch-Transformers is a library of state-of-the-art pre-trained models for Natural Language Processing (NLP).
+🤗 Transformers (formerly known as `pytorch-transformers` and `pytorch-pretrained-bert`) provides general-purpose architectures
+(BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet...) for Natural Language Understanding (NLU) and Natural Language Generation
+(NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between TensorFlow 2.0 and PyTorch.

-The library currently contains PyTorch implementations, pre-trained model weights, usage scripts and conversion utilities for the following models:
+This is the documentation of our repository `transformers <https://github.com/huggingface/transformers>`__.
+
+Features
+---------------------------------------------------
+
+- As easy to use as pytorch-transformers
+- As powerful and concise as Keras
+- High performance on NLU and NLG tasks
+- Low barrier to entry for educators and practitioners
+
+State-of-the-art NLP for everyone:
+
+- Deep learning researchers
+- Hands-on practitioners
+- AI/ML/NLP teachers and educators
+
+Lower compute costs, smaller carbon footprint:
+
+- Researchers can share trained models instead of always retraining
+- Practitioners can reduce compute time and production costs
+- 8 architectures with over 30 pretrained models, some in more than 100 languages
+
+Choose the right framework for every part of a model's lifetime:
+
+- Train state-of-the-art models in 3 lines of code
+- Deep interoperability between TensorFlow 2.0 and PyTorch models
+- Move a single model between TF2.0/PyTorch frameworks at will
+- Seamlessly pick the right framework for training, evaluation, production
+
+Contents
+---------------------------------
+
+The library currently contains PyTorch and Tensorflow implementations, pre-trained model weights, usage scripts and conversion utilities for the following models:

 1. `BERT <https://github.com/google-research/bert>`_ (from Google) released with the paper `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding <https://arxiv.org/abs/1810.04805>`_ by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
 2. `GPT <https://github.com/openai/finetune-transformer-lm>`_ (from OpenAI) released with the paper `Improving Language Understanding by Generative Pre-Training <https://blog.openai.com/language-unsupervised>`_ by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
@@ -12,7 +46,7 @@ The library currently contains PyTorch implementations, pre-trained model weight
 5. `XLNet <https://github.com/zihangdai/xlnet>`_ (from Google/CMU) released with the paper `XLNet: Generalized Autoregressive Pretraining for Language Understanding <https://arxiv.org/abs/1906.08237>`_ by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
 6. `XLM <https://github.com/facebookresearch/XLM>`_ (from Facebook) released together with the paper `Cross-lingual Language Model Pretraining <https://arxiv.org/abs/1901.07291>`_ by Guillaume Lample and Alexis Conneau.
 7. `RoBERTa <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`_ (from Facebook), released together with the paper a `Robustly Optimized BERT Pretraining Approach <https://arxiv.org/abs/1907.11692>`_ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
-8. `DistilBERT <https://huggingface.co/pytorch-transformers/model_doc/distilbert.html>`_ (from HuggingFace) released together with the blog post `Smaller, faster, cheaper, lighter: Introducing DistilBERT, a distilled version of BERT <https://medium.com/huggingface/distilbert-8cf3380435b5>`_ by Victor Sanh, Lysandre Debut and Thomas Wolf.
+8. `DistilBERT <https://huggingface.co/transformers/model_doc/distilbert.html>`_ (from HuggingFace) released together with the paper `DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter <https://arxiv.org/abs/1910.01108>`_ by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into `DistilGPT2 <https://github.com/huggingface/transformers/tree/master/examples/distillation>`_.

 .. toctree::
    :maxdepth: 2
@@ -28,6 +62,7 @@ The library currently contains PyTorch implementations, pre-trained model weight
    migration
    bertology
    torchscript
+    multilingual

 .. toctree::
    :maxdepth: 2
@@ -37,6 +72,7 @@ The library currently contains PyTorch implementations, pre-trained model weight
    main_classes/model
    main_classes/tokenizer
    main_classes/optimizer_schedules
+    main_classes/processors

 .. toctree::
    :maxdepth: 2
@@ -51,3 +87,4 @@ The library currently contains PyTorch implementations, pre-trained model weight
    model_doc/xlnet
    model_doc/roberta
    model_doc/distilbert
+    model_doc/ctrl
--- a/docs/source/installation.md
+++ b/docs/source/installation.md
@@ -0,0 +1,58 @@
+# Installation
+
+Transformers is tested on Python 2.7 and 3.5+ (examples are tested only on python 3.5+) and PyTorch 1.1.0
+
+## With pip
+
+PyTorch Transformers can be installed using pip as follows:
+
+``` bash
+pip install transformers
+```
+
+## From source
+
+To install from source, clone the repository and install with:
+
+``` bash
+git clone https://github.com/huggingface/transformers.git
+cd transformers
+pip install [--editable] .
+```
+
+## Tests
+
+An extensive test suite is included to test the library behavior and several examples. Library tests can be found in the [tests folder](https://github.com/huggingface/transformers/tree/master/transformers/tests) and examples tests in the [examples folder](https://github.com/huggingface/transformers/tree/master/examples).
+
+Tests can be run using `pytest` (install pytest if needed with `pip install pytest`).
+
+Run all the tests from the root of the cloned repository with the commands:
+
+``` bash
+python -m pytest -sv ./transformers/tests/
+python -m pytest -sv ./examples/
+```
+
+## OpenAI GPT original tokenization workflow
+
+If you want to reproduce the original tokenization process of the `OpenAI GPT` paper, you will need to install `ftfy` (use version 4.4.3 if you are using Python 2) and `SpaCy`:
+
+``` bash
+pip install spacy ftfy==4.4.3
+python -m spacy download en
+```
+
+If you don't install `ftfy` and `SpaCy`, the `OpenAI GPT` tokenizer will default to tokenize using BERT's `BasicTokenizer` followed by Byte-Pair Encoding (which should be fine for most usage, don't worry).
+
+## Note on model downloads (Continuous Integration or large-scale deployments)
+
+If you expect to be downloading large volumes of models (more than 1,000) from our hosted bucket (for instance through your CI setup, or a large-scale production deployment), please cache the model files on your end. It will be way faster, and cheaper. Feel free to contact us privately if you need any help.
+
+## Do you want to run a Transformer model on a mobile device?
+
+You should check out our [swift-coreml-transformers](https://github.com/huggingface/swift-coreml-transformers) repo.
+
+It contains a set of tools to convert PyTorch or TensorFlow 2.0 trained Transformer models (currently contains `GPT-2`, `DistilGPT-2`, `BERT`, and `DistilBERT`) to CoreML models that run on iOS devices.
+
+At some point in the future, you'll be able to seamlessly move from pre-training or fine-tuning models in PyTorch to productizing them in CoreML,
+or prototype a model or an app in CoreML then research its hyperparameters or architecture from PyTorch. Super exciting!
--- a/docs/source/installation.rst
+++ b/docs/source/installation.rst
@@ -1,71 +0,0 @@
-Installation
-================================================
-
-PyTorch-Transformers is tested on Python 2.7 and 3.5+ (examples are tested only on python 3.5+) and PyTorch 1.1.0
-
-With pip
-^^^^^^^^
-
-PyTorch Transformers can be installed using pip as follows:
-
-.. code-block:: bash
-
-   pip install pytorch-transformers
-
-From source
-^^^^^^^^^^^
-
-To install from source, clone the repository and install with:
-
-.. code-block:: bash
-
-    git clone https://github.com/huggingface/pytorch-transformers.git
-    cd pytorch-transformers
-    pip install [--editable] .
-
-
-Tests
-^^^^^
-
-An extensive test suite is included to test the library behavior and several examples. Library tests can be found in the `tests folder <https://github.com/huggingface/pytorch-transformers/tree/master/pytorch_transformers/tests>`_ and examples tests in the `examples folder <https://github.com/huggingface/pytorch-transformers/tree/master/examples>`_.
-
-Tests can be run using `pytest` (install pytest if needed with `pip install pytest`).
-
-Run all the tests from the root of the cloned repository with the commands:
-
-.. code-block:: bash
-
-    python -m pytest -sv ./pytorch_transformers/tests/
-    python -m pytest -sv ./examples/
-
-
-OpenAI GPT original tokenization workflow
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-If you want to reproduce the original tokenization process of the ``OpenAI GPT`` paper, you will need to install ``ftfy`` (use version 4.4.3 if you are using Python 2) and ``SpaCy`` :
-
-.. code-block:: bash
-
-   pip install spacy ftfy==4.4.3
-   python -m spacy download en
-
-If you don't install ``ftfy`` and ``SpaCy``\ , the ``OpenAI GPT`` tokenizer will default to tokenize using BERT's ``BasicTokenizer`` followed by Byte-Pair Encoding (which should be fine for most usage, don't worry).
-
-
-Note on model downloads (Continuous Integration or large-scale deployments)
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-If you expect to be downloading large volumes of models (more than 1,000) from our hosted bucket (for instance through your CI setup, or a large-scale production deployment), please cache the model files on your end. It will be way faster, and cheaper. Feel free to contact us privately if you need any help.
-
-
-Do you want to run a Transformer model on a mobile device?
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-You should check out our `swift-coreml-transformers <https://github.com/huggingface/swift-coreml-transformers>`_ repo.
-
-It contains an example of a conversion script from a Pytorch trained Transformer model (here, ``GPT-2``) to a CoreML model that runs on iOS devices.
-
-It also contains an implementation of BERT for Question answering.
-
-At some point in the future, you'll be able to seamlessly move from pre-training or fine-tuning models in PyTorch to productizing them in CoreML,
-or prototype a model or an app in CoreML then research its hyperparameters or architecture from PyTorch. Super exciting!
--- a/docs/source/main_classes/configuration.rst
+++ b/docs/source/main_classes/configuration.rst
@@ -6,5 +6,5 @@ The base class ``PretrainedConfig`` implements the common methods for loading/sa
 ``PretrainedConfig``
 ~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: pytorch_transformers.PretrainedConfig
+.. autoclass:: transformers.PretrainedConfig
    :members:
--- a/docs/source/main_classes/model.rst
+++ b/docs/source/main_classes/model.rst
@@ -11,5 +11,11 @@ The base class ``PreTrainedModel`` implements the common methods for loading/sav
 ``PreTrainedModel``
 ~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: pytorch_transformers.PreTrainedModel
+.. autoclass:: transformers.PreTrainedModel
+    :members:
+
+``TFPreTrainedModel``
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFPreTrainedModel
    :members:
--- a/docs/source/main_classes/optimizer_schedules.rst
+++ b/docs/source/main_classes/optimizer_schedules.rst
@@ -9,7 +9,7 @@ The ``.optimization`` module provides:
 ``AdamW``
 ~~~~~~~~~~~~~~~~

-.. autoclass:: pytorch_transformers.AdamW
+.. autoclass:: transformers.AdamW
    :members:

 Schedules
@@ -18,11 +18,11 @@ Schedules
 Learning Rate Schedules
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

-.. autoclass:: pytorch_transformers.ConstantLRSchedule
+.. autoclass:: transformers.ConstantLRSchedule
    :members:


-.. autoclass:: pytorch_transformers.WarmupConstantSchedule
+.. autoclass:: transformers.WarmupConstantSchedule
    :members:

 .. image:: /imgs/warmup_constant_schedule.png
@@ -30,7 +30,7 @@ Learning Rate Schedules
    :alt:


-.. autoclass:: pytorch_transformers.WarmupCosineSchedule
+.. autoclass:: transformers.WarmupCosineSchedule
    :members:

 .. image:: /imgs/warmup_cosine_schedule.png
@@ -38,7 +38,7 @@ Learning Rate Schedules
    :alt:


-.. autoclass:: pytorch_transformers.WarmupCosineWithHardRestartsSchedule
+.. autoclass:: transformers.WarmupCosineWithHardRestartsSchedule
    :members:

 .. image:: /imgs/warmup_cosine_hard_restarts_schedule.png
@@ -47,7 +47,7 @@ Learning Rate Schedules



-.. autoclass:: pytorch_transformers.WarmupLinearSchedule
+.. autoclass:: transformers.WarmupLinearSchedule
    :members:

 .. image:: /imgs/warmup_linear_schedule.png
--- a/docs/source/main_classes/processors.rst
+++ b/docs/source/main_classes/processors.rst
@@ -0,0 +1,58 @@
+Processors
+----------------------------------------------------
+
+This library includes processors for several traditional tasks. These processors can be used to process a dataset into
+examples that can be fed to a model.
+
+Processors
+~~~~~~~~~~~~~~~~~~~~~
+
+All processors follow the same architecture which is that of the
+:class:`~transformers.data.processors.utils.DataProcessor`. The processor returns a list
+of :class:`~transformers.data.processors.utils.InputExample`. These
+:class:`~transformers.data.processors.utils.InputExample` can be converted to
+:class:`~transformers.data.processors.utils.InputFeatures` in order to be fed to the model.
+
+.. autoclass:: transformers.data.processors.utils.DataProcessor
+    :members:
+
+
+.. autoclass:: transformers.data.processors.utils.InputExample
+    :members:
+
+
+.. autoclass:: transformers.data.processors.utils.InputFeatures
+    :members:
+
+
+GLUE
+~~~~~~~~~~~~~~~~~~~~~
+
+`General Language Understanding Evaluation (GLUE) <https://gluebenchmark.com/>`__ is a benchmark that evaluates
+the performance of models across a diverse set of existing NLU tasks. It was released together with the paper
+`GLUE: A multi-task benchmark and analysis platform for natural language understanding <https://openreview.net/pdf?id=rJ4km2R5t7>`__
+
+This library hosts a total of 10 processors for the following tasks: MRPC, MNLI, MNLI (mismatched),
+CoLA, SST2, STSB, QQP, QNLI, RTE and WNLI.
+
+Those processors are:
+    - :class:`~transformers.data.processors.utils.MrpcProcessor`
+    - :class:`~transformers.data.processors.utils.MnliProcessor`
+    - :class:`~transformers.data.processors.utils.MnliMismatchedProcessor`
+    - :class:`~transformers.data.processors.utils.Sst2Processor`
+    - :class:`~transformers.data.processors.utils.StsbProcessor`
+    - :class:`~transformers.data.processors.utils.QqpProcessor`
+    - :class:`~transformers.data.processors.utils.QnliProcessor`
+    - :class:`~transformers.data.processors.utils.RteProcessor`
+    - :class:`~transformers.data.processors.utils.WnliProcessor`
+
+Additionally, the following method  can be used to load values from a data file and convert them to a list of
+:class:`~transformers.data.processors.utils.InputExample`.
+
+.. automethod:: transformers.data.processors.glue.glue_convert_examples_to_features
+
+Example usage
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+An example using these processors is given in the
+`run_glue.py <https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_glue.py>`__ script.
--- a/docs/source/main_classes/tokenizer.rst
+++ b/docs/source/main_classes/tokenizer.rst
@@ -12,5 +12,5 @@ The base class ``PreTrainedTokenizer`` implements the common methods for loading
 ``PreTrainedTokenizer``
 ~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: pytorch_transformers.PreTrainedTokenizer
+.. autoclass:: transformers.PreTrainedTokenizer
    :members:
--- a/docs/source/migration.md
+++ b/docs/source/migration.md
@@ -1,17 +1,17 @@
 # Migrating from pytorch-pretrained-bert


-Here is a quick summary of what you should take care of when migrating from `pytorch-pretrained-bert` to `pytorch-transformers`
+Here is a quick summary of what you should take care of when migrating from `pytorch-pretrained-bert` to `transformers`

 ### Models always output `tuples`

-The main breaking change when migrating from `pytorch-pretrained-bert` to `pytorch-transformers` is that the models forward method always outputs a `tuple` with various elements depending on the model and the configuration parameters.
+The main breaking change when migrating from `pytorch-pretrained-bert` to `transformers` is that the models forward method always outputs a `tuple` with various elements depending on the model and the configuration parameters.

-The exact content of the tuples for each model are detailled in the models' docstrings and the [documentation](https://huggingface.co/pytorch-transformers/).
+The exact content of the tuples for each model are detailled in the models' docstrings and the [documentation](https://huggingface.co/transformers/).

 In pretty much every case, you will be fine by taking the first element of the output as the output you previously used in `pytorch-pretrained-bert`.

-Here is a `pytorch-pretrained-bert` to `pytorch-transformers` conversion example for a `BertForSequenceClassification` classification model:
+Here is a `pytorch-pretrained-bert` to `transformers` conversion example for a `BertForSequenceClassification` classification model:

 ```python
 # Let's load our model
@@ -20,11 +20,11 @@ model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
 # If you used to have this line in pytorch-pretrained-bert:
 loss = model(input_ids, labels=labels)

-# Now just use this line in pytorch-transformers to extract the loss from the output tuple:
+# Now just use this line in transformers to extract the loss from the output tuple:
 outputs = model(input_ids, labels=labels)
 loss = outputs[0]

-# In pytorch-transformers you can also have access to the logits:
+# In transformers you can also have access to the logits:
 loss, logits = outputs[:2]

 # And even the attention weigths if you configure the model to output them (and other outputs too, see the docstrings and documentation)
@@ -96,7 +96,7 @@ for batch in train_data:
    loss.backward()
    optimizer.step()

-### In PyTorch-Transformers, optimizer and schedules are splitted and instantiated like this:
+### In Transformers, optimizer and schedules are splitted and instantiated like this:
 optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False)  # To reproduce BertAdam specific behavior set correct_bias=False
 scheduler = WarmupLinearSchedule(optimizer, warmup_steps=num_warmup_steps, t_total=num_total_steps)  # PyTorch scheduler
 ### and used like this:
--- a/docs/source/model_doc/auto.rst
+++ b/docs/source/model_doc/auto.rst
@@ -11,19 +11,19 @@ Instantiating one of ``AutoModel``, ``AutoConfig`` and ``AutoTokenizer`` will di
 ``AutoConfig``
 ~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: pytorch_transformers.AutoConfig
+.. autoclass:: transformers.AutoConfig
    :members:


 ``AutoModel``
 ~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: pytorch_transformers.AutoModel
+.. autoclass:: transformers.AutoModel
    :members:


 ``AutoTokenizer``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: pytorch_transformers.AutoTokenizer
+.. autoclass:: transformers.AutoTokenizer
    :members:
--- a/docs/source/model_doc/bert.rst
+++ b/docs/source/model_doc/bert.rst
@@ -4,69 +4,125 @@ BERT
 ``BertConfig``
 ~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: pytorch_transformers.BertConfig
+.. autoclass:: transformers.BertConfig
    :members:


 ``BertTokenizer``
 ~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: pytorch_transformers.BertTokenizer
+.. autoclass:: transformers.BertTokenizer
    :members:


 ``BertModel``
 ~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: pytorch_transformers.BertModel
+.. autoclass:: transformers.BertModel
    :members:


 ``BertForPreTraining``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: pytorch_transformers.BertForPreTraining
+.. autoclass:: transformers.BertForPreTraining
    :members:


 ``BertForMaskedLM``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: pytorch_transformers.BertForMaskedLM
+.. autoclass:: transformers.BertForMaskedLM
    :members:


 ``BertForNextSentencePrediction``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: pytorch_transformers.BertForNextSentencePrediction
+.. autoclass:: transformers.BertForNextSentencePrediction
    :members:


 ``BertForSequenceClassification``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: pytorch_transformers.BertForSequenceClassification
+.. autoclass:: transformers.BertForSequenceClassification
    :members:


 ``BertForMultipleChoice``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: pytorch_transformers.BertForMultipleChoice
+.. autoclass:: transformers.BertForMultipleChoice
    :members:


 ``BertForTokenClassification``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: pytorch_transformers.BertForTokenClassification
+.. autoclass:: transformers.BertForTokenClassification
    :members:


 ``BertForQuestionAnswering``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: pytorch_transformers.BertForQuestionAnswering
+.. autoclass:: transformers.BertForQuestionAnswering
+    :members:
+
+
+``TFBertModel``
+~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFBertModel
+    :members:
+
+
+``TFBertForPreTraining``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFBertForPreTraining
+    :members:
+
+
+``TFBertForMaskedLM``
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFBertForMaskedLM
+    :members:
+
+
+``TFBertForNextSentencePrediction``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFBertForNextSentencePrediction
+    :members:
+
+
+``TFBertForSequenceClassification``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFBertForSequenceClassification
+    :members:
+
+
+``TFBertForMultipleChoice``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFBertForMultipleChoice
+    :members:
+
+
+``TFBertForTokenClassification``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFBertForTokenClassification
+    :members:
+
+
+``TFBertForQuestionAnswering``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFBertForQuestionAnswering
    :members:

--- a/docs/source/model_doc/ctrl.rst
+++ b/docs/source/model_doc/ctrl.rst
@@ -0,0 +1,44 @@
+CTRL
+----------------------------------------------------
+
+``CTRLConfig``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.CTRLConfig
+    :members:
+
+
+``CTRLTokenizer``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.CTRLTokenizer
+    :members:
+
+
+``CTRLModel``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.CTRLModel
+    :members:
+
+
+``CTRLLMHeadModel``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.CTRLLMHeadModel
+    :members:
+
+
+``TFCTRLModel``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFCTRLModel
+    :members:
+
+
+``TFCTRLLMHeadModel``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFCTRLLMHeadModel
+    :members:
+
--- a/docs/source/model_doc/distilbert.rst
+++ b/docs/source/model_doc/distilbert.rst
@@ -4,40 +4,67 @@ DistilBERT
 ``DistilBertConfig``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: pytorch_transformers.DistilBertConfig
+.. autoclass:: transformers.DistilBertConfig
    :members:


 ``DistilBertTokenizer``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: pytorch_transformers.DistilBertTokenizer
+.. autoclass:: transformers.DistilBertTokenizer
    :members:


 ``DistilBertModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: pytorch_transformers.DistilBertModel
+.. autoclass:: transformers.DistilBertModel
    :members:


 ``DistilBertForMaskedLM``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: pytorch_transformers.DistilBertForMaskedLM
+.. autoclass:: transformers.DistilBertForMaskedLM
    :members:


 ``DistilBertForSequenceClassification``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: pytorch_transformers.DistilBertForSequenceClassification
+.. autoclass:: transformers.DistilBertForSequenceClassification
    :members:


 ``DistilBertForQuestionAnswering``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: pytorch_transformers.DistilBertForQuestionAnswering
+.. autoclass:: transformers.DistilBertForQuestionAnswering
+    :members:
+
+``TFDistilBertModel``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFDistilBertModel
+    :members:
+
+
+``TFDistilBertForMaskedLM``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFDistilBertForMaskedLM
+    :members:
+
+
+``TFDistilBertForSequenceClassification``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFDistilBertForSequenceClassification
+    :members:
+
+
+``TFDistilBertForQuestionAnswering``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFDistilBertForQuestionAnswering
    :members:
--- a/docs/source/model_doc/gpt.rst
+++ b/docs/source/model_doc/gpt.rst
@@ -4,33 +4,54 @@ OpenAI GPT
 ``OpenAIGPTConfig``
 ~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: pytorch_transformers.OpenAIGPTConfig
+.. autoclass:: transformers.OpenAIGPTConfig
    :members:


 ``OpenAIGPTTokenizer``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: pytorch_transformers.OpenAIGPTTokenizer
+.. autoclass:: transformers.OpenAIGPTTokenizer
    :members:


 ``OpenAIGPTModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: pytorch_transformers.OpenAIGPTModel
+.. autoclass:: transformers.OpenAIGPTModel
    :members:


 ``OpenAIGPTLMHeadModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: pytorch_transformers.OpenAIGPTLMHeadModel
+.. autoclass:: transformers.OpenAIGPTLMHeadModel
    :members:


 ``OpenAIGPTDoubleHeadsModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: pytorch_transformers.OpenAIGPTDoubleHeadsModel
+.. autoclass:: transformers.OpenAIGPTDoubleHeadsModel
+    :members:
+
+
+``TFOpenAIGPTModel``
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFOpenAIGPTModel
+    :members:
+
+
+``TFOpenAIGPTLMHeadModel``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFOpenAIGPTLMHeadModel
+    :members:
+
+
+``TFOpenAIGPTDoubleHeadsModel``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFOpenAIGPTDoubleHeadsModel
    :members:
--- a/docs/source/model_doc/gpt2.rst
+++ b/docs/source/model_doc/gpt2.rst
@@ -4,33 +4,54 @@ OpenAI GPT2
 ``GPT2Config``
 ~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: pytorch_transformers.GPT2Config
+.. autoclass:: transformers.GPT2Config
    :members:


 ``GPT2Tokenizer``
 ~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: pytorch_transformers.GPT2Tokenizer
+.. autoclass:: transformers.GPT2Tokenizer
    :members:


 ``GPT2Model``
 ~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: pytorch_transformers.GPT2Model
+.. autoclass:: transformers.GPT2Model
    :members:


 ``GPT2LMHeadModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: pytorch_transformers.GPT2LMHeadModel
+.. autoclass:: transformers.GPT2LMHeadModel
    :members:


 ``GPT2DoubleHeadsModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: pytorch_transformers.GPT2DoubleHeadsModel
+.. autoclass:: transformers.GPT2DoubleHeadsModel
+    :members:
+
+
+``TFGPT2Model``
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFGPT2Model
+    :members:
+
+
+``TFGPT2LMHeadModel``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFGPT2LMHeadModel
+    :members:
+
+
+``TFGPT2DoubleHeadsModel``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFGPT2DoubleHeadsModel
    :members:
--- a/docs/source/model_doc/roberta.rst
+++ b/docs/source/model_doc/roberta.rst
@@ -4,33 +4,54 @@ RoBERTa
 ``RobertaConfig``
 ~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: pytorch_transformers.RobertaConfig
+.. autoclass:: transformers.RobertaConfig
    :members:


 ``RobertaTokenizer``
 ~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: pytorch_transformers.RobertaTokenizer
+.. autoclass:: transformers.RobertaTokenizer
    :members:


 ``RobertaModel``
 ~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: pytorch_transformers.RobertaModel
+.. autoclass:: transformers.RobertaModel
    :members:


 ``RobertaForMaskedLM``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: pytorch_transformers.RobertaForMaskedLM
+.. autoclass:: transformers.RobertaForMaskedLM
    :members:


 ``RobertaForSequenceClassification``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: pytorch_transformers.RobertaForSequenceClassification
+.. autoclass:: transformers.RobertaForSequenceClassification
+    :members:
+
+
+``TFRobertaModel``
+~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFRobertaModel
+    :members:
+
+
+``TFRobertaForMaskedLM``
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFRobertaForMaskedLM
+    :members:
+
+
+``TFRobertaForSequenceClassification``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFRobertaForSequenceClassification
    :members:
--- a/docs/source/model_doc/transformerxl.rst
+++ b/docs/source/model_doc/transformerxl.rst
@@ -5,26 +5,40 @@ Transformer XL
 ``TransfoXLConfig``
 ~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: pytorch_transformers.TransfoXLConfig
+.. autoclass:: transformers.TransfoXLConfig
    :members:


 ``TransfoXLTokenizer``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: pytorch_transformers.TransfoXLTokenizer
+.. autoclass:: transformers.TransfoXLTokenizer
    :members:


 ``TransfoXLModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: pytorch_transformers.TransfoXLModel
+.. autoclass:: transformers.TransfoXLModel
    :members:


 ``TransfoXLLMHeadModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: pytorch_transformers.TransfoXLLMHeadModel
+.. autoclass:: transformers.TransfoXLLMHeadModel
+    :members:
+
+
+``TFTransfoXLModel``
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFTransfoXLModel
+    :members:
+
+
+``TFTransfoXLLMHeadModel``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFTransfoXLLMHeadModel
    :members:
--- a/docs/source/model_doc/xlm.rst
+++ b/docs/source/model_doc/xlm.rst
@@ -4,38 +4,66 @@ XLM
 ``XLMConfig``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: pytorch_transformers.XLMConfig
+.. autoclass:: transformers.XLMConfig
    :members:

 ``XLMTokenizer``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: pytorch_transformers.XLMTokenizer
+.. autoclass:: transformers.XLMTokenizer
    :members:

 ``XLMModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: pytorch_transformers.XLMModel
+.. autoclass:: transformers.XLMModel
    :members:


 ``XLMWithLMHeadModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: pytorch_transformers.XLMWithLMHeadModel
+.. autoclass:: transformers.XLMWithLMHeadModel
    :members:


 ``XLMForSequenceClassification``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: pytorch_transformers.XLMForSequenceClassification
+.. autoclass:: transformers.XLMForSequenceClassification
    :members:


 ``XLMForQuestionAnswering``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: pytorch_transformers.XLMForQuestionAnswering
+.. autoclass:: transformers.XLMForQuestionAnswering
+    :members:
+
+
+``TFXLMModel``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFXLMModel
+    :members:
+
+
+``TFXLMWithLMHeadModel``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFXLMWithLMHeadModel
+    :members:
+
+
+``TFXLMForSequenceClassification``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFXLMForSequenceClassification
+    :members:
+
+
+``TFXLMForQuestionAnsweringSimple``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFXLMForQuestionAnsweringSimple
    :members:
--- a/docs/source/model_doc/xlnet.rst
+++ b/docs/source/model_doc/xlnet.rst
@@ -4,40 +4,68 @@ XLNet
 ``XLNetConfig``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: pytorch_transformers.XLNetConfig
+.. autoclass:: transformers.XLNetConfig
    :members:


 ``XLNetTokenizer``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: pytorch_transformers.XLNetTokenizer
+.. autoclass:: transformers.XLNetTokenizer
    :members:


 ``XLNetModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: pytorch_transformers.XLNetModel
+.. autoclass:: transformers.XLNetModel
    :members:


 ``XLNetLMHeadModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: pytorch_transformers.XLNetLMHeadModel
+.. autoclass:: transformers.XLNetLMHeadModel
    :members:


 ``XLNetForSequenceClassification``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: pytorch_transformers.XLNetForSequenceClassification
+.. autoclass:: transformers.XLNetForSequenceClassification
    :members:


 ``XLNetForQuestionAnswering``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: pytorch_transformers.XLNetForQuestionAnswering
+.. autoclass:: transformers.XLNetForQuestionAnswering
+    :members:
+
+
+``TFXLNetModel``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFXLNetModel
+    :members:
+
+
+``TFXLNetLMHeadModel``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFXLNetLMHeadModel
+    :members:
+
+
+``TFXLNetForSequenceClassification``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFXLNetForSequenceClassification
+    :members:
+
+
+``TFXLNetForQuestionAnsweringSimple``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFXLNetForQuestionAnsweringSimple
    :members:
--- a/docs/source/multilingual.rst
+++ b/docs/source/multilingual.rst
@@ -0,0 +1,103 @@
+Multi-lingual models
+================================================
+
+Most of the models available in this library are mono-lingual models (English, Chinese and German). A few
+multi-lingual models are available and have a different mechanisms than mono-lingual models.
+This page details the usage of these models.
+
+The two models that currently support multiple languages are BERT and XLM.
+
+XLM
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+XLM has a total of 10 different checkpoints, only one of which is mono-lingual. The 9 remaining model checkpoints can
+be split in two categories: the checkpoints that make use of language embeddings, and those that don't
+
+XLM & Language Embeddings
+------------------------------------------------
+
+This section concerns the following checkpoints:
+
+- ``xlm-mlm-ende-1024`` (Masked language modeling, English-German)
+- ``xlm-mlm-enfr-1024`` (Masked language modeling, English-French)
+- ``xlm-mlm-enro-1024`` (Masked language modeling, English-Romanian)
+- ``xlm-mlm-xnli15-1024`` (Masked language modeling, XNLI languages)
+- ``xlm-mlm-tlm-xnli15-1024`` (Masked language modeling + Translation, XNLI languages)
+- ``xlm-clm-enfr-1024`` (Causal language modeling, English-French)
+- ``xlm-clm-ende-1024`` (Causal language modeling, English-German)
+
+These checkpoints require language embeddings that will specify the language used at inference time. These language
+embeddings are represented as a tensor that is of the same shape as the input ids passed to the model. The values in
+these tensors depend on the language used and are identifiable using the ``lang2id`` and ``id2lang`` attributes
+from the tokenizer.
+
+Here is an example using the ``xlm-clm-enfr-1024`` checkpoint (Causal language modeling, English-French):
+
+
+.. code-block::
+
+    import torch
+    from transformers import XLMTokenizer, XLMWithLMHeadModel
+
+    tokenizer = XLMTokenizer.from_pretrained("xlm-clm-1024-enfr")
+
+
+The different languages this model/tokenizer handles, as well as the ids of these languages are visible using the
+``lang2id`` attribute:
+
+.. code-block::
+
+    print(tokenizer.lang2id)  # {'en': 0, 'fr': 1}
+
+
+These ids should be used when passing a language parameter during a model pass. Let's define our inputs:
+
+.. code-block::
+
+    input_ids = torch.tensor([tokenizer.encode("Wikipedia was used to")]) # batch size of 1
+
+
+We should now define the language embedding by using the previously defined language id. We want to create a tensor
+filled with the appropriate language ids, of the same size as input_ids. For english, the id is 0:
+
+.. code-block::
+
+    language_id = tokenizer.lang2id['en']  # 0
+    langs = torch.tensor([language_id] * input_ids.shape[1])  # torch.tensor([0, 0, 0, ..., 0])
+
+    # We reshape it to be of size (batch_size, sequence_length)
+    langs = langs.view(1, -1) # is now of shape [1, sequence_length] (we have a batch size of 1)
+
+
+You can then feed it all as input to your model:
+
+.. code-block::
+
+    outputs = model(input_ids, langs=langs)
+
+
+The example `run_generation.py <https://github.com/huggingface/transformers/blob/master/examples/run_generation.py>`__
+can generate text using the CLM checkpoints from XLM, using the language embeddings.
+
+XLM without Language Embeddings
+------------------------------------------------
+
+This section concerns the following checkpoints:
+
+- ``xlm-mlm-17-1280`` (Masked language modeling, 17 languages)
+- ``xlm-mlm-100-1280`` (Masked language modeling, 100 languages)
+
+These checkpoints do not require language embeddings at inference time. These models are used to have generic
+sentence representations, differently from previously-mentioned XLM checkpoints.
+
+
+BERT
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+BERT has two checkpoints that can be used for multi-lingual tasks:
+
+- ``bert-base-multilingual-uncased`` (Masked language modeling + Next sentence prediction, 102 languages)
+- ``bert-base-multilingual-cased`` (Masked language modeling + Next sentence prediction, 104 languages)
+
+These checkpoints do not require language embeddings at inference time. They should identify the language
+used in the context and infer accordingly.
--- a/docs/source/notebooks.rst
+++ b/docs/source/notebooks.rst
@@ -1,16 +1,16 @@
 Notebooks
 ================================================

-We include `three Jupyter Notebooks <https://github.com/huggingface/pytorch-transformers/tree/master/notebooks>`_ that can be used to check that the predictions of the PyTorch model are identical to the predictions of the original TensorFlow model.
+We include `three Jupyter Notebooks <https://github.com/huggingface/transformers/tree/master/notebooks>`_ that can be used to check that the predictions of the PyTorch model are identical to the predictions of the original TensorFlow model.


 *
-  The first NoteBook (\ `Comparing-TF-and-PT-models.ipynb <https://github.com/huggingface/pytorch-transformers/blob/master/notebooks/Comparing-TF-and-PT-models.ipynb>`_\ ) extracts the hidden states of a full sequence on each layers of the TensorFlow and the PyTorch models and computes the standard deviation between them. In the given example, we get a standard deviation of 1.5e-7 to 9e-7 on the various hidden state of the models.
+  The first NoteBook (\ `Comparing-TF-and-PT-models.ipynb <https://github.com/huggingface/transformers/blob/master/notebooks/Comparing-TF-and-PT-models.ipynb>`_\ ) extracts the hidden states of a full sequence on each layers of the TensorFlow and the PyTorch models and computes the standard deviation between them. In the given example, we get a standard deviation of 1.5e-7 to 9e-7 on the various hidden state of the models.

 *
-  The second NoteBook (\ `Comparing-TF-and-PT-models-SQuAD.ipynb <https://github.com/huggingface/pytorch-transformers/blob/master/notebooks/Comparing-TF-and-PT-models-SQuAD.ipynb>`_\ ) compares the loss computed by the TensorFlow and the PyTorch models for identical initialization of the fine-tuning layer of the ``BertForQuestionAnswering`` and computes the standard deviation between them. In the given example, we get a standard deviation of 2.5e-7 between the models.
+  The second NoteBook (\ `Comparing-TF-and-PT-models-SQuAD.ipynb <https://github.com/huggingface/transformers/blob/master/notebooks/Comparing-TF-and-PT-models-SQuAD.ipynb>`_\ ) compares the loss computed by the TensorFlow and the PyTorch models for identical initialization of the fine-tuning layer of the ``BertForQuestionAnswering`` and computes the standard deviation between them. In the given example, we get a standard deviation of 2.5e-7 between the models.

 *
-  The third NoteBook (\ `Comparing-TF-and-PT-models-MLM-NSP.ipynb <https://github.com/huggingface/pytorch-transformers/blob/master/notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb>`_\ ) compares the predictions computed by the TensorFlow and the PyTorch models for masked token language modeling using the pre-trained masked language modeling model.
+  The third NoteBook (\ `Comparing-TF-and-PT-models-MLM-NSP.ipynb <https://github.com/huggingface/transformers/blob/master/notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb>`_\ ) compares the predictions computed by the TensorFlow and the PyTorch models for masked token language modeling using the pre-trained masked language modeling model.

 Please follow the instructions given in the notebooks to run and modify them.
--- a/docs/source/pretrained_models.rst
+++ b/docs/source/pretrained_models.rst
@@ -44,15 +44,15 @@ Here is the full list of the currently provided pretrained models together with
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``bert-large-uncased-whole-word-masking-finetuned-squad``  | | 24-layer, 1024-hidden, 16-heads, 340M parameters.                                                                                   |
 |                   |                                                            | | The ``bert-large-uncased-whole-word-masking`` model fine-tuned on SQuAD                                                             |
-|                   |                                                            | (see details of fine-tuning in the `example section <https://github.com/huggingface/pytorch-transformers/tree/master/examples>`__).   |
+|                   |                                                            | (see details of fine-tuning in the `example section <https://github.com/huggingface/transformers/tree/master/examples>`__).           |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``bert-large-cased-whole-word-masking-finetuned-squad``    | | 24-layer, 1024-hidden, 16-heads, 340M parameters                                                                                    |
 |                   |                                                            | | The ``bert-large-cased-whole-word-masking`` model fine-tuned on SQuAD                                                               |
-|                   |                                                            | (see `details of fine-tuning in the example section <https://huggingface.co/pytorch-transformers/examples.html>`__)                   |
+|                   |                                                            | (see `details of fine-tuning in the example section <https://huggingface.co/transformers/examples.html>`__)                           |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``bert-base-cased-finetuned-mrpc``                         | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
 |                   |                                                            | | The ``bert-base-cased`` model fine-tuned on MRPC                                                                                    |
-|                   |                                                            | (see `details of fine-tuning in the example section <https://huggingface.co/pytorch-transformers/examples.html>`__)                   |
+|                   |                                                            | (see `details of fine-tuning in the example section <https://huggingface.co/transformers/examples.html>`__)                           |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 | GPT               | ``openai-gpt``                                             | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
 |                   |                                                            | | OpenAI GPT English model                                                                                                            |
@@ -79,10 +79,10 @@ Here is the full list of the currently provided pretrained models together with
 |                   |                                                            | | XLM English model                                                                                                                   |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``xlm-mlm-ende-1024``                                      | | 6-layer, 1024-hidden, 8-heads                                                                                                       |
-|                   |                                                            | | XLM English-German Multi-language model                                                                                             |
+|                   |                                                            | | XLM English-German model trained on the concatenation of English and German wikipedia                                               |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``xlm-mlm-enfr-1024``                                      | | 6-layer, 1024-hidden, 8-heads                                                                                                       |
-|                   |                                                            | | XLM English-French Multi-language model                                                                                             |
+|                   |                                                            | | XLM English-French model trained on the concatenation of English and French wikipedia                                               |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``xlm-mlm-enro-1024``                                      | | 6-layer, 1024-hidden, 8-heads                                                                                                       |
 |                   |                                                            | | XLM English-Romanian Multi-language model                                                                                           |
@@ -93,11 +93,17 @@ Here is the full list of the currently provided pretrained models together with
 |                   | ``xlm-mlm-tlm-xnli15-1024``                                | | 12-layer, 1024-hidden, 8-heads                                                                                                      |
 |                   |                                                            | | XLM Model pre-trained with MLM + TLM on the `15 XNLI languages <https://github.com/facebookresearch/XNLI>`__.                       |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``xlm-clm-enfr-1024``                                      | | 12-layer, 1024-hidden, 8-heads                                                                                                      |
-|                   |                                                            | | XLM English model trained with CLM (Causal Language Modeling)                                                                       |
+|                   | ``xlm-clm-enfr-1024``                                      | | 6-layer, 1024-hidden, 8-heads                                                                                                       |
+|                   |                                                            | | XLM English-French model trained with CLM (Causal Language Modeling) on the concatenation of English and French wikipedia           |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``xlm-clm-ende-1024``                                      | | 6-layer, 1024-hidden, 8-heads                                                                                                       |
-|                   |                                                            | | XLM English-German Multi-language model trained with CLM (Causal Language Modeling)                                                 |
+|                   |                                                            | | XLM English-German model trained with CLM (Causal Language Modeling) on the concatenation of English and German wikipedia           |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``xlm-mlm-17-1280``                                        | | 16-layer, 1280-hidden, 16-heads                                                                                                     |
+|                   |                                                            | | XLM model trained with MLM (Masked Language Modeling) on 17 languages.                                                              |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``xlm-mlm-100-1280``                                       | | 16-layer, 1280-hidden, 16-heads                                                                                                     |
+|                   |                                                            | | XLM model trained with MLM (Masked Language Modeling) on 100 languages.                                                             |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 | RoBERTa           | ``roberta-base``                                           | | 12-layer, 768-hidden, 12-heads, 125M parameters                                                                                     |
 |                   |                                                            | | RoBERTa using the BERT-base architecture                                                                                            |
@@ -113,11 +119,18 @@ Here is the full list of the currently provided pretrained models together with
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 | DistilBERT        | ``distilbert-base-uncased``                                | | 6-layer, 768-hidden, 12-heads, 66M parameters                                                                                       |
 |                   |                                                            | | The DistilBERT model distilled from the BERT model `bert-base-uncased` checkpoint                                                   |
-|                   |                                                            | (see `details <https://medium.com/huggingface/distilbert-8cf3380435b5>`__)                                                            |
+|                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``distilbert-base-uncased-distilled-squad``                | | 6-layer, 768-hidden, 12-heads, 66M parameters                                                                                       |
 |                   |                                                            | | The DistilBERT model distilled from the BERT model `bert-base-uncased` checkpoint, with an additional linear layer.                 |
-|                   |                                                            | (see `details <https://medium.com/huggingface/distilbert-8cf3380435b5>`__)                                                            |
+|                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``distilgpt2``                                             | | 6-layer, 768-hidden, 12-heads, 82M parameters                                                                                       |
+|                   |                                                            | | The DistilGPT2 model distilled from the GPT2 model `gpt2` checkpoint.                                                               |
+|                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+| CTRL              | ``ctrl``                                                   | | 48-layer, 1280-hidden, 16-heads, 1.6B parameters                                                                                    |
+|                   |                                                            | | Salesforce's Large-sized CTRL English model                                                                                         |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+

-.. <https://huggingface.co/pytorch-transformers/examples.html>`__
+.. <https://huggingface.co/transformers/examples.html>`__
--- a/docs/source/quickstart.md
+++ b/docs/source/quickstart.md
@@ -2,7 +2,7 @@

 ## Philosophy

-PyTorch-Transformers is an opinionated library built for NLP researchers seeking to use/study/extend large-scale transformers models.
+Transformers is an opinionated library built for NLP researchers seeking to use/study/extend large-scale transformers models.

 The library was designed with two strong goals in mind:

@@ -19,12 +19,12 @@ The library was designed with two strong goals in mind:

 A few other goals:

- expose the models internals as consistently as possible:
+- expose the models' internals as consistently as possible:

  - we give access, using a single API to the full hidden-states and attention weights,
  - tokenizer and base model's API are standardized to easily switch between models.

- incorporate a subjective selection of promising tools for fine-tuning/investiguating these models:
+- incorporate a subjective selection of promising tools for fine-tuning/investigating these models:

  - a simple/consistent way to add new tokens to the vocabulary and embeddings for fine-tuning,
  - simple ways to mask and prune transformer heads.
@@ -33,13 +33,13 @@ A few other goals:

 The library is build around three type of classes for each models:

- **model classes** which are PyTorch models (`torch.nn.Modules`) of the 6 models architectures currently provided in the library, e.g. `BertModel`
+- **model classes** which are PyTorch models (`torch.nn.Modules`) of the 8 models architectures currently provided in the library, e.g. `BertModel`
 - **configuration classes** which store all the parameters required to build a model, e.g. `BertConfig`. You don't always need to instantiate these your-self, in particular if you are using a pretrained model without any modification, creating the model will automatically take care of instantiating the configuration (which is part of the model)
 - **tokenizer classes** which store the vocabulary for each model and provide methods for encoding/decoding strings in list of token embeddings indices to be fed to a model, e.g. `BertTokenizer`

 All these classes can be instantiated from pretrained instances and saved locally using two methods:

- `from_pretrained()` let you instantiate a model/configuration/tokenizer from a pretrained version either provided by the library itself (currently 27 models are provided as listed [here](https://huggingface.co/pytorch-transformers/pretrained_models.html)) or stored locally (or on a server) by the user,
+- `from_pretrained()` let you instantiate a model/configuration/tokenizer from a pretrained version either provided by the library itself (currently 27 models are provided as listed [here](https://huggingface.co/transformers/pretrained_models.html)) or stored locally (or on a server) by the user,
 - `save_pretrained()` let you save a model/configuration/tokenizer locally so that it can be reloaded using `from_pretrained()`.

 We'll finish this quickstart tour by going through a few simple quick-start examples to see how we can instantiate and use these classes. The rest of the documentation is organized in two parts:
@@ -51,7 +51,7 @@ We'll finish this quickstart tour by going through a few simple quick-start exam

 Here are two examples showcasing a few `Bert` and `GPT2` classes and pre-trained models.

-See full API reference for examples for each model classe.
+See full API reference for examples for each model class.

 ### BERT example

@@ -59,7 +59,7 @@ Let's start by preparing a tokenized input (a list of token embeddings indices t

 ```python
 import torch
-from pytorch_transformers import BertTokenizer, BertModel, BertForMaskedLM
+from transformers import BertTokenizer, BertModel, BertForMaskedLM

 # OPTIONAL: if you want to have more information on what's happening under the hood, activate the logger as follows
 import logging
@@ -93,8 +93,8 @@ Let's see how we can use `BertModel` to encode our inputs in hidden-states:
 # Load pre-trained model (weights)
 model = BertModel.from_pretrained('bert-base-uncased')

-# Set the model in evaluation mode to desactivate the DropOut modules
-# This is IMPORTANT to have reproductible results during evaluation!
+# Set the model in evaluation mode to deactivate the DropOut modules
+# This is IMPORTANT to have reproducible results during evaluation!
 model.eval()

 # If you have a GPU, put everything on cuda
@@ -106,7 +106,7 @@ model.to('cuda')
 with torch.no_grad():
    # See the models docstrings for the detail of the inputs
    outputs = model(tokens_tensor, token_type_ids=segments_tensors)
-    # PyTorch-Transformers models always output tuples.
+    # Transformers models always output tuples.
    # See the models docstrings for the detail of all the outputs
    # In our case, the first element is the hidden state of the last layer of the Bert model
    encoded_layers = outputs[0]
@@ -145,7 +145,7 @@ First let's prepare a tokenized input from our text string using `GPT2Tokenizer`

 ```python
 import torch
-from pytorch_transformers import GPT2Tokenizer, GPT2LMHeadModel
+from transformers import GPT2Tokenizer, GPT2LMHeadModel

 # OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
 import logging
@@ -168,8 +168,8 @@ Let's see how to use `GPT2LMHeadModel` to generate the next token following our
 # Load pre-trained model (weights)
 model = GPT2LMHeadModel.from_pretrained('gpt2')

-# Set the model in evaluation mode to desactivate the DropOut modules
-# This is IMPORTANT to have reproductible results during evaluation!
+# Set the model in evaluation mode to deactivate the DropOut modules
+# This is IMPORTANT to have reproducible results during evaluation!
 model.eval()

 # If you have a GPU, put everything on cuda
--- a/docs/source/serialization.rst
+++ b/docs/source/serialization.rst
@@ -45,7 +45,7 @@ where
    * ``bert_config.json`` or ``openai_gpt_config.json`` a configuration file for the model, and
    * ``pytorch_model.bin`` a PyTorch dump of a pre-trained instance of ``BertForPreTraining``\ , ``OpenAIGPTModel``\ , ``TransfoXLModel``\ , ``GPT2LMHeadModel`` (saved with the usual ``torch.save()``\ )

-  If ``PRE_TRAINED_MODEL_NAME_OR_PATH`` is a shortcut name, the pre-trained weights will be downloaded from AWS S3 (see the links `here <https://github.com/huggingface/pytorch-transformers/blob/master/pytorch_transformers/modeling_bert.py>`__\ ) and stored in a cache folder to avoid future download (the cache folder can be found at ``~/.pytorch_pretrained_bert/``\ ).
+  If ``PRE_TRAINED_MODEL_NAME_OR_PATH`` is a shortcut name, the pre-trained weights will be downloaded from AWS S3 (see the links `here <https://github.com/huggingface/transformers/blob/master/transformers/modeling_bert.py>`__\ ) and stored in a cache folder to avoid future download (the cache folder can be found at ``~/.pytorch_pretrained_bert/``\ ).

 *
  ``cache_dir`` can be an optional path to a specific directory to download and cache the pre-trained model weights. This option is useful in particular when you are using distributed training: to avoid concurrent access to the same weights you can set for example ``cache_dir='./pretrained_model_{}'.format(args.local_rank)`` (see the section on distributed training for more information).
@@ -122,7 +122,7 @@ Here is the recommended way of saving the model, configuration and vocabulary to

 .. code-block:: python

-   from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
+   from transformers import WEIGHTS_NAME, CONFIG_NAME

   output_dir = "./models/"

--- a/docs/source/torchscript.rst
+++ b/docs/source/torchscript.rst
@@ -12,7 +12,7 @@ According to Pytorch's documentation: "TorchScript is a way to create serializab
 Pytorch's two modules `JIT and TRACE <https://pytorch.org/docs/stable/jit.html>`_ allow the developer to export
 their model to be re-used in other programs, such as efficiency-oriented C++ programs.

-We have provided an interface that allows the export of `pytorch-transformers` models to TorchScript so that they can
+We have provided an interface that allows the export of `transformers` models to TorchScript so that they can
 be reused in a different environment than a Pytorch-based python program. Here we explain how to use our models so that
 they can be exported, and what to be mindful of when using these models with TorchScript.

@@ -74,7 +74,7 @@ according to a ``BertConfig`` class and then saved to disk under the filename ``

 .. code-block:: python

-    from pytorch_transformers import BertModel, BertTokenizer, BertConfig
+    from transformers import BertModel, BertTokenizer, BertConfig
    import torch

    enc = BertTokenizer.from_pretrained("bert-base-uncased")
--- a/examples/README.md
+++ b/examples/README.md
@@ -0,0 +1,392 @@
+# Examples
+
+In this section a few examples are put together. All of these examples work for several models, making use of the very
+similar API between the different models.
+
+| Section                    | Description                                                                                                                                                |
+|----------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| [Language Model fine-tuning](#language-model-fine-tuning) | Fine-tuning the library models for language modeling on a text dataset. Causal language modeling for GPT/GPT-2, masked language modeling for BERT/RoBERTa. |
+| [Language Generation](#language-generation) | Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, Transformer-XL and XLNet.                                         |
+| [GLUE](#glue) | Examples running BERT/XLM/XLNet/RoBERTa on the 9 GLUE tasks. Examples feature distributed training as well as half-precision.                              |
+| [SQuAD](#squad) | Using BERT for question answering, examples with distributed training.                                                                                  |
+| [Multiple Choice](#multiple-choice) | Examples running BERT/XLNet/RoBERTa on the SWAG/RACE/ARC tasks. 
+
+## Language model fine-tuning
+
+Based on the script [`run_lm_finetuning.py`](https://github.com/huggingface/transformers/blob/master/examples/run_lm_finetuning.py).
+
+Fine-tuning the library models for language modeling on a text dataset for GPT, GPT-2, BERT and RoBERTa (DistilBERT 
+to be added soon). GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa 
+are fine-tuned using a masked language modeling (MLM) loss.
+
+Before running the following example, you should get a file that contains text on which the language model will be
+fine-tuned. A good example of such text is the [WikiText-2 dataset](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/).
+
+We will refer to two different files: `$TRAIN_FILE`, which contains text for training, and `$TEST_FILE`, which contains
+text that will be used for evaluation.
+
+### GPT-2/GPT and causal language modeling
+
+The following example fine-tunes GPT-2 on WikiText-2. We're using the raw WikiText-2 (no tokens were replaced before
+the tokenization). The loss here is that of causal language modeling.
+
+```bash
+export TRAIN_FILE=/path/to/dataset/wiki.train.raw
+export TEST_FILE=/path/to/dataset/wiki.test.raw
+
+python run_lm_finetuning.py \
+    --output_dir=output \
+    --model_type=gpt2 \
+    --model_name_or_path=gpt2 \
+    --do_train \
+    --train_data_file=$TRAIN_FILE \
+    --do_eval \
+    --eval_data_file=$TEST_FILE
+```
+
+This takes about half an hour to train on a single K80 GPU and about one minute for the evaluation to run. It reaches
+a score of ~20 perplexity once fine-tuned on the dataset.
+
+### RoBERTa/BERT and masked language modeling
+
+The following example fine-tunes RoBERTa on WikiText-2. Here too, we're using the raw WikiText-2. The loss is different
+as BERT/RoBERTa have a bidirectional mechanism; we're therefore using the same loss that was used during their
+pre-training: masked language modeling. 
+
+In accordance to the RoBERTa paper, we use dynamic masking rather than static masking. The model may, therefore, converge
+slightly slower (over-fitting takes more epochs).
+
+We use the `--mlm` flag so that the script may change its loss function.
+
+```bash
+export TRAIN_FILE=/path/to/dataset/wiki.train.raw
+export TEST_FILE=/path/to/dataset/wiki.test.raw
+
+python run_lm_finetuning.py \
+    --output_dir=output \
+    --model_type=roberta \
+    --model_name_or_path=roberta-base \
+    --do_train \
+    --train_data_file=$TRAIN_FILE \
+    --do_eval \
+    --eval_data_file=$TEST_FILE \
+    --mlm
+```
+
+## Language generation
+
+Based on the script [`run_generation.py`](https://github.com/huggingface/transformers/blob/master/examples/run_generation.py).
+
+Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, Transformer-XL and XLNet.
+A similar script is used for our official demo [Write With Transfomer](https://transformer.huggingface.co), where you
+can try out the different models available in the library.
+
+Example usage:
+
+```bash
+python run_generation.py \
+    --model_type=gpt2 \
+    --model_name_or_path=gpt2
+```
+
+## GLUE
+
+Based on the script [`run_glue.py`](https://github.com/huggingface/transformers/blob/master/examples/run_glue.py).
+
+Fine-tuning the library models for sequence classification on the GLUE benchmark: [General Language Understanding 
+Evaluation](https://gluebenchmark.com/). This script can fine-tune the following models: BERT, XLM, XLNet and RoBERTa. 
+
+GLUE is made up of a total of 9 different tasks. We get the following results on the dev set of the benchmark with an
+uncased  BERT base model (the checkpoint `bert-base-uncased`). All experiments ran on 8 V100 GPUs with a total train
+batch size of 24. Some of these tasks have a small dataset and training can lead to high variance in the results
+between different runs. We report the median on 5 runs (with different seeds) for each of the metrics.
+
+| Task  | Metric                       | Result      |
+|-------|------------------------------|-------------|
+| CoLA  | Matthew's corr               | 48.87       |
+| SST-2 | Accuracy                     | 91.74       |
+| MRPC  | F1/Accuracy                  | 90.70/86.27 |
+| STS-B | Person/Spearman corr.        | 91.39/91.04 |
+| QQP   | Accuracy/F1                  | 90.79/87.66 |
+| MNLI  | Matched acc./Mismatched acc. | 83.70/84.83 |
+| QNLI  | Accuracy                     | 89.31       |
+| RTE   | Accuracy                     | 71.43       |
+| WNLI  | Accuracy                     | 43.66       |
+
+Some of these results are significantly different from the ones reported on the test set
+of GLUE benchmark on the website. For QQP and WNLI, please refer to [FAQ #12](https://gluebenchmark.com/faq) on the webite.
+
+Before running anyone of these GLUE tasks you should download the
+[GLUE data](https://gluebenchmark.com/tasks) by running
+[this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e)
+and unpack it to some directory `$GLUE_DIR`.
+
+```bash
+export GLUE_DIR=/path/to/glue
+export TASK_NAME=MRPC
+
+python run_glue.py \
+  --model_type bert \
+  --model_name_or_path bert-base-cased \
+  --task_name $TASK_NAME \
+  --do_train \
+  --do_eval \
+  --do_lower_case \
+  --data_dir $GLUE_DIR/$TASK_NAME \
+  --max_seq_length 128 \
+  --per_gpu_train_batch_size 32 \
+  --learning_rate 2e-5 \
+  --num_train_epochs 3.0 \
+  --output_dir /tmp/$TASK_NAME/
+```
+
+where task name can be one of CoLA, SST-2, MRPC, STS-B, QQP, MNLI, QNLI, RTE, WNLI.
+
+The dev set results will be present within the text file `eval_results.txt` in the specified output_dir. 
+In case of MNLI, since there are two separate dev sets (matched and mismatched), there will be a separate 
+output folder called `/tmp/MNLI-MM/` in addition to `/tmp/MNLI/`.
+
+The code has not been tested with half-precision training with apex on any GLUE task apart from MRPC, MNLI, 
+CoLA, SST-2. The following section provides details on how to run half-precision training with MRPC. With that being 
+said, there shouldn’t be any issues in running half-precision training with the remaining GLUE tasks as well, 
+since the data processor for each task inherits from the base class DataProcessor.
+
+### MRPC
+
+#### Fine-tuning example
+
+The following examples fine-tune BERT on the Microsoft Research Paraphrase Corpus (MRPC) corpus and runs in less 
+than 10 minutes on a single K-80 and in 27 seconds (!) on single tesla V100 16GB with apex installed.
+
+Before running anyone of these GLUE tasks you should download the
+[GLUE data](https://gluebenchmark.com/tasks) by running
+[this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e)
+and unpack it to some directory `$GLUE_DIR`.
+
+```bash
+export GLUE_DIR=/path/to/glue
+
+python run_glue.py \
+  --model_type bert \
+  --model_name_or_path bert-base-cased \
+  --task_name MRPC \
+  --do_train \
+  --do_eval \
+  --do_lower_case \
+  --data_dir $GLUE_DIR/MRPC/ \
+  --max_seq_length 128 \
+  --per_gpu_train_batch_size 32 \
+  --learning_rate 2e-5 \
+  --num_train_epochs 3.0 \
+  --output_dir /tmp/mrpc_output/
+```
+
+Our test ran on a few seeds with [the original implementation hyper-
+parameters](https://github.com/google-research/bert#sentence-and-sentence-pair-classification-tasks) gave evaluation 
+results between 84% and 88%.
+
+#### Using Apex and mixed-precision
+
+Using Apex and 16 bit precision, the fine-tuning on MRPC only takes 27 seconds. First install 
+[apex](https://github.com/NVIDIA/apex), then run the following example:
+
+```bash
+export GLUE_DIR=/path/to/glue
+
+python run_glue.py \
+  --model_type bert \
+  --model_name_or_path bert-base-cased \
+  --task_name MRPC \
+  --do_train \
+  --do_eval \
+  --do_lower_case \
+  --data_dir $GLUE_DIR/MRPC/ \
+  --max_seq_length 128 \
+  --per_gpu_train_batch_size 32 \
+  --learning_rate 2e-5 \
+  --num_train_epochs 3.0 \
+  --output_dir /tmp/mrpc_output/ \
+  --fp16
+```
+
+#### Distributed training
+
+Here is an example using distributed training on 8 V100 GPUs. The model used is the BERT whole-word-masking and it
+reaches F1 > 92 on MRPC.
+
+```bash
+export GLUE_DIR=/path/to/glue
+
+python -m torch.distributed.launch \
+    --nproc_per_node 8 run_glue.py \
+    --model_type bert \
+    --model_name_or_path bert-base-cased \
+    --task_name MRPC \
+    --do_train \
+    --do_eval \
+    --do_lower_case \
+    --data_dir $GLUE_DIR/MRPC/ \
+    --max_seq_length 128 \
+    --per_gpu_train_batch_size 8 \
+    --learning_rate 2e-5 \
+    --num_train_epochs 3.0 \
+    --output_dir /tmp/mrpc_output/
+```
+
+Training with these hyper-parameters gave us the following results:
+
+```bash
+acc = 0.8823529411764706
+acc_and_f1 = 0.901702786377709
+eval_loss = 0.3418912578906332
+f1 = 0.9210526315789473
+global_step = 174
+loss = 0.07231863956341798
+```
+
+### MNLI
+
+The following example uses the BERT-large, uncased, whole-word-masking model and fine-tunes it on the MNLI task.
+
+```bash
+export GLUE_DIR=/path/to/glue
+
+python -m torch.distributed.launch \
+    --nproc_per_node 8 run_glue.py \
+    --model_type bert \
+    --model_name_or_path bert-base-cased \
+    --task_name mnli \
+    --do_train \
+    --do_eval \
+    --do_lower_case \
+    --data_dir $GLUE_DIR/MNLI/ \
+    --max_seq_length 128 \
+    --per_gpu_train_batch_size 8 \
+    --learning_rate 2e-5 \
+    --num_train_epochs 3.0 \
+    --output_dir output_dir \
+```
+
+The results  are the following:
+
+```bash
+***** Eval results *****
+  acc = 0.8679706601466992
+  eval_loss = 0.4911287787382479
+  global_step = 18408
+  loss = 0.04755385363816904
+
+***** Eval results *****
+  acc = 0.8747965825874695
+  eval_loss = 0.45516540421714036
+  global_step = 18408
+  loss = 0.04755385363816904
+```
+
+## Multiple Choice
+
+Based on the script [`run_multiple_choice.py`]().
+
+#### Fine-tuning on SWAG
+Download [swag](https://github.com/rowanz/swagaf/tree/master/data) data
+
+```bash
+#training on 4 tesla V100(16GB) GPUS
+export SWAG_DIR=/path/to/swag_data_dir
+python ./examples/run_multiple_choice.py \
+--model_type roberta \
+--task_name swag \
+--model_name_or_path roberta-base \
+--do_train \
+--do_eval \
+--do_lower_case \
+--data_dir $SWAG_DIR \
+--learning_rate 5e-5 \
+--num_train_epochs 3 \
+--max_seq_length 80 \
+--output_dir models_bert/swag_base \
+--per_gpu_eval_batch_size=16 \
+--per_gpu_train_batch_size=16 \
+--gradient_accumulation_steps 2 \
+--overwrite_output
+```
+Training with the defined hyper-parameters yields the following results:
+```
+***** Eval results *****
+eval_acc = 0.8338998300509847
+eval_loss = 0.44457291918821606
+```
+
+## SQuAD
+
+Based on the script [`run_squad.py`](https://github.com/huggingface/transformers/blob/master/examples/run_squad.py).
+
+#### Fine-tuning on SQuAD
+
+This example code fine-tunes BERT on the SQuAD dataset. It runs in 24 min (with BERT-base) or 68 min (with BERT-large) 
+on a single tesla V100 16GB. The data for SQuAD can be downloaded with the following links and should be saved in a 
+$SQUAD_DIR directory.
+
+* [train-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json)
+* [dev-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json)
+* [evaluate-v1.1.py](https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py)
+
+```bash
+export SQUAD_DIR=/path/to/SQUAD
+
+python run_squad.py \
+  --model_type bert \
+  --model_name_or_path bert-base-cased \
+  --do_train \
+  --do_eval \
+  --do_lower_case \
+  --train_file $SQUAD_DIR/train-v1.1.json \
+  --predict_file $SQUAD_DIR/dev-v1.1.json \
+  --per_gpu_train_batch_size 12 \
+  --learning_rate 3e-5 \
+  --num_train_epochs 2.0 \
+  --max_seq_length 384 \
+  --doc_stride 128 \
+  --output_dir /tmp/debug_squad/
+```
+
+Training with the previously defined hyper-parameters yields the following results:
+
+```bash
+f1 = 88.52
+exact_match = 81.22
+```
+
+#### Distributed training
+
+
+Here is an example using distributed training on 8 V100 GPUs and Bert Whole Word Masking uncased model to reach a F1 > 93 on SQuAD:
+
+```bash
+python -m torch.distributed.launch --nproc_per_node=8 run_squad.py \
+    --model_type bert \
+    --model_name_or_path bert-base-cased \
+    --do_train \
+    --do_eval \
+    --do_lower_case \
+    --train_file $SQUAD_DIR/train-v1.1.json \
+    --predict_file $SQUAD_DIR/dev-v1.1.json \
+    --learning_rate 3e-5 \
+    --num_train_epochs 2 \
+    --max_seq_length 384 \
+    --doc_stride 128 \
+    --output_dir ../models/wwm_uncased_finetuned_squad/ \
+    --per_gpu_train_batch_size 24 \
+    --gradient_accumulation_steps 12
+```
+
+Training with the previously defined hyper-parameters yields the following results:
+
+```bash
+f1 = 93.15
+exact_match = 86.91
+```
+
+This fine-tuneds model is available as a checkpoint under the reference
+`bert-large-uncased-whole-word-masking-finetuned-squad`.
+
--- a/examples/contrib/README.md
+++ b/examples/contrib/README.md
@@ -0,0 +1,5 @@
+# Community contributed examples
+
+This folder contains examples which are not actively maintained (mostly contributed by the community).
+
+Using these examples together with a recent version of the library usually requires to make small (sometimes big) adaptations to get the scripts working.
--- a/examples/single_model_scripts/run_openai_gpt.py
+++ b/examples/single_model_scripts/run_openai_gpt.py
@@ -39,7 +39,7 @@ import torch
 from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)

-from pytorch_transformers import (OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer,
+from transformers import (OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer,
                                     AdamW, cached_path, WEIGHTS_NAME, CONFIG_NAME,
                                     WarmupLinearSchedule)

@@ -153,9 +153,11 @@ def main():
    # This loading functions also add new tokens and embeddings called `special tokens`
    # These new embeddings will be fine-tuned on the RocStories dataset
    special_tokens = ['_start_', '_delimiter_', '_classify_']
-    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name, special_tokens=special_tokens)
-    special_tokens_ids = list(tokenizer.convert_tokens_to_ids(token) for token in special_tokens)
-    model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.model_name, num_special_tokens=len(special_tokens))
+    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name)
+    tokenizer.add_tokens(special_tokens)
+    special_tokens_ids = tokenizer.convert_tokens_to_ids(special_tokens)
+    model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.model_name)
+    model.resize_token_embeddings(len(tokenizer))
    model.to(device)

    # Load and encode the datasets
@@ -221,7 +223,7 @@ def main():
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, mc_token_ids, lm_labels, mc_labels = batch
-                losses = model(input_ids, mc_token_ids, lm_labels, mc_labels)
+                losses = model(input_ids, mc_token_ids=mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels)
                loss = args.lm_coef * losses[0] + losses[1]
                loss.backward()
                scheduler.step()
@@ -258,7 +260,7 @@ def main():
            batch = tuple(t.to(device) for t in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels = batch
            with torch.no_grad():
-               _, mc_loss, _, mc_logits = model(input_ids, mc_token_ids, lm_labels, mc_labels)
+               _, mc_loss, _, mc_logits = model(input_ids, mc_token_ids=mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels)

            mc_logits = mc_logits.detach().cpu().numpy()
            mc_labels = mc_labels.to('cpu').numpy()
--- a/examples/contrib/run_swag.py
+++ b/examples/contrib/run_swag.py
@@ -0,0 +1,673 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""BERT finetuning runner.
+   Finetuning the library models for multiple choice on SWAG (Bert).
+"""
+from __future__ import absolute_import, division, print_function
+
+import argparse
+import logging
+import csv
+import os
+import random
+import sys
+import glob
+
+import numpy as np
+import torch
+from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
+                              TensorDataset)
+from torch.utils.data.distributed import DistributedSampler
+from tqdm import tqdm, trange
+
+from tensorboardX import SummaryWriter
+
+from transformers import (WEIGHTS_NAME, BertConfig,
+                                  BertForMultipleChoice, BertTokenizer)
+
+from transformers import AdamW, WarmupLinearSchedule
+
+logger = logging.getLogger(__name__)
+
+ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) \
+                  for conf in [BertConfig]), ())
+
+MODEL_CLASSES = {
+    'bert': (BertConfig, BertForMultipleChoice, BertTokenizer),
+}
+
+class SwagExample(object):
+    """A single training/test example for the SWAG dataset."""
+    def __init__(self,
+                 swag_id,
+                 context_sentence,
+                 start_ending,
+                 ending_0,
+                 ending_1,
+                 ending_2,
+                 ending_3,
+                 label = None):
+        self.swag_id = swag_id
+        self.context_sentence = context_sentence
+        self.start_ending = start_ending
+        self.endings = [
+            ending_0,
+            ending_1,
+            ending_2,
+            ending_3,
+        ]
+        self.label = label
+
+    def __str__(self):
+        return self.__repr__()
+
+    def __repr__(self):
+        l = [
+            "swag_id: {}".format(self.swag_id),
+            "context_sentence: {}".format(self.context_sentence),
+            "start_ending: {}".format(self.start_ending),
+            "ending_0: {}".format(self.endings[0]),
+            "ending_1: {}".format(self.endings[1]),
+            "ending_2: {}".format(self.endings[2]),
+            "ending_3: {}".format(self.endings[3]),
+        ]
+
+        if self.label is not None:
+            l.append("label: {}".format(self.label))
+
+        return ", ".join(l)
+
+class InputFeatures(object):
+    def __init__(self,
+                 example_id,
+                 choices_features,
+                 label
+
+    ):
+        self.example_id = example_id
+        self.choices_features = [
+            {
+                'input_ids': input_ids,
+                'input_mask': input_mask,
+                'segment_ids': segment_ids
+            }
+            for _, input_ids, input_mask, segment_ids in choices_features
+        ]
+        self.label = label
+
+def read_swag_examples(input_file, is_training=True):
+    with open(input_file, 'r', encoding='utf-8') as f:
+        reader = csv.reader(f)
+        lines = []
+        for line in reader:
+            if sys.version_info[0] == 2:
+                line = list(unicode(cell, 'utf-8') for cell in line)
+            lines.append(line)
+
+    if is_training and lines[0][-1] != 'label':
+        raise ValueError(
+            "For training, the input file must contain a label column."
+        )
+
+    examples = [
+        SwagExample(
+            swag_id = line[2],
+            context_sentence = line[4],
+            start_ending = line[5], # in the swag dataset, the
+                                         # common beginning of each
+                                         # choice is stored in "sent2".
+            ending_0 = line[7],
+            ending_1 = line[8],
+            ending_2 = line[9],
+            ending_3 = line[10],
+            label = int(line[11]) if is_training else None
+        ) for line in lines[1:] # we skip the line with the column names
+    ]
+
+    return examples
+
+def convert_examples_to_features(examples, tokenizer, max_seq_length,
+                                 is_training):
+    """Loads a data file into a list of `InputBatch`s."""
+
+    # Swag is a multiple choice task. To perform this task using Bert,
+    # we will use the formatting proposed in "Improving Language
+    # Understanding by Generative Pre-Training" and suggested by
+    # @jacobdevlin-google in this issue
+    # https://github.com/google-research/bert/issues/38.
+    #
+    # Each choice will correspond to a sample on which we run the
+    # inference. For a given Swag example, we will create the 4
+    # following inputs:
+    # - [CLS] context [SEP] choice_1 [SEP]
+    # - [CLS] context [SEP] choice_2 [SEP]
+    # - [CLS] context [SEP] choice_3 [SEP]
+    # - [CLS] context [SEP] choice_4 [SEP]
+    # The model will output a single value for each input. To get the
+    # final decision of the model, we will run a softmax over these 4
+    # outputs.
+    features = []
+    for example_index, example in tqdm(enumerate(examples)):
+        context_tokens = tokenizer.tokenize(example.context_sentence)
+        start_ending_tokens = tokenizer.tokenize(example.start_ending)
+
+        choices_features = []
+        for ending_index, ending in enumerate(example.endings):
+            # We create a copy of the context tokens in order to be
+            # able to shrink it according to ending_tokens
+            context_tokens_choice = context_tokens[:]
+            ending_tokens = start_ending_tokens + tokenizer.tokenize(ending)
+            # Modifies `context_tokens_choice` and `ending_tokens` in
+            # place so that the total length is less than the
+            # specified length.  Account for [CLS], [SEP], [SEP] with
+            # "- 3"
+            _truncate_seq_pair(context_tokens_choice, ending_tokens, max_seq_length - 3)
+
+            tokens = ["[CLS]"] + context_tokens_choice + ["[SEP]"] + ending_tokens + ["[SEP]"]
+            segment_ids = [0] * (len(context_tokens_choice) + 2) + [1] * (len(ending_tokens) + 1)
+
+            input_ids = tokenizer.convert_tokens_to_ids(tokens)
+            input_mask = [1] * len(input_ids)
+
+            # Zero-pad up to the sequence length.
+            padding = [0] * (max_seq_length - len(input_ids))
+            input_ids += padding
+            input_mask += padding
+            segment_ids += padding
+
+            assert len(input_ids) == max_seq_length
+            assert len(input_mask) == max_seq_length
+            assert len(segment_ids) == max_seq_length
+
+            choices_features.append((tokens, input_ids, input_mask, segment_ids))
+
+        label = example.label
+        if example_index < 5:
+            logger.info("*** Example ***")
+            logger.info("swag_id: {}".format(example.swag_id))
+            for choice_idx, (tokens, input_ids, input_mask, segment_ids) in enumerate(choices_features):
+                logger.info("choice: {}".format(choice_idx))
+                logger.info("tokens: {}".format(' '.join(tokens)))
+                logger.info("input_ids: {}".format(' '.join(map(str, input_ids))))
+                logger.info("input_mask: {}".format(' '.join(map(str, input_mask))))
+                logger.info("segment_ids: {}".format(' '.join(map(str, segment_ids))))
+            if is_training:
+                logger.info("label: {}".format(label))
+
+        features.append(
+            InputFeatures(
+                example_id = example.swag_id,
+                choices_features = choices_features,
+                label = label
+            )
+        )
+
+    return features
+
+def _truncate_seq_pair(tokens_a, tokens_b, max_length):
+    """Truncates a sequence pair in place to the maximum length."""
+
+    # This is a simple heuristic which will always truncate the longer sequence
+    # one token at a time. This makes more sense than truncating an equal percent
+    # of tokens from each, since if one sequence is very short then each token
+    # that's truncated likely contains more information than a longer sequence.
+    while True:
+        total_length = len(tokens_a) + len(tokens_b)
+        if total_length <= max_length:
+            break
+        if len(tokens_a) > len(tokens_b):
+            tokens_a.pop()
+        else:
+            tokens_b.pop()
+
+def accuracy(out, labels):
+    outputs = np.argmax(out, axis=1)
+    return np.sum(outputs == labels)
+
+def select_field(features, field):
+    return [
+        [
+            choice[field]
+            for choice in feature.choices_features
+        ]
+        for feature in features
+    ]
+
+
+def set_seed(args):
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+    if args.n_gpu > 0:
+        torch.cuda.manual_seed_all(args.seed)
+
+def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
+    if args.local_rank not in [-1, 0]:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
+
+    # Load data features from cache or dataset file
+    input_file = args.predict_file if evaluate else args.train_file
+    cached_features_file = os.path.join(os.path.dirname(input_file), 'cached_{}_{}_{}'.format(
+        'dev' if evaluate else 'train',
+        list(filter(None, args.model_name_or_path.split('/'))).pop(),
+        str(args.max_seq_length)))
+    if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples:
+        logger.info("Loading features from cached file %s", cached_features_file)
+        features = torch.load(cached_features_file)
+    else:
+        logger.info("Creating features from dataset file at %s", input_file)
+        examples = read_swag_examples(input_file)
+        features = convert_examples_to_features(
+            examples, tokenizer, args.max_seq_length, not evaluate)
+
+        if args.local_rank in [-1, 0]:
+            logger.info("Saving features into cached file %s", cached_features_file)
+            torch.save(features, cached_features_file)
+
+    if args.local_rank == 0:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
+
+    # Convert to Tensors and build dataset
+    all_input_ids = torch.tensor(select_field(features, 'input_ids'), dtype=torch.long)
+    all_input_mask = torch.tensor(select_field(features, 'input_mask'), dtype=torch.long)
+    all_segment_ids = torch.tensor(select_field(features, 'segment_ids'), dtype=torch.long)
+    all_label = torch.tensor([f.label for f in features], dtype=torch.long)
+
+    if evaluate:
+        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
+                                all_label)
+    else:
+        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
+                                all_label)
+
+    if output_examples:
+        return dataset, examples, features
+    return dataset
+def train(args, train_dataset, model, tokenizer):
+    """ Train the model """
+    if args.local_rank in [-1, 0]:
+        tb_writer = SummaryWriter()
+
+    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
+    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
+    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
+
+    if args.max_steps > 0:
+        t_total = args.max_steps
+        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
+    else:
+        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
+
+    # Prepare optimizer and schedule (linear warmup and decay)
+    no_decay = ['bias', 'LayerNorm.weight']
+    optimizer_grouped_parameters = [
+        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
+        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+        ]
+    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
+    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
+    if args.fp16:
+        try:
+            from apex import amp
+        except ImportError:
+            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
+        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
+
+    # multi-gpu training (should be after apex fp16 initialization)
+    if args.n_gpu > 1:
+        model = torch.nn.DataParallel(model)
+
+    # Distributed training (should be after apex fp16 initialization)
+    if args.local_rank != -1:
+        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
+                                                          output_device=args.local_rank,
+                                                          find_unused_parameters=True)
+
+    # Train!
+    logger.info("***** Running training *****")
+    logger.info("  Num examples = %d", len(train_dataset))
+    logger.info("  Num Epochs = %d", args.num_train_epochs)
+    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
+    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
+                   args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
+    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
+    logger.info("  Total optimization steps = %d", t_total)
+
+    global_step = 0
+    tr_loss, logging_loss = 0.0, 0.0
+    model.zero_grad()
+    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
+    set_seed(args)  # Added here for reproductibility (even between python 2 and 3)
+    for _ in train_iterator:
+        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
+        for step, batch in enumerate(epoch_iterator):
+            model.train()
+            batch = tuple(t.to(args.device) for t in batch)
+            inputs = {'input_ids':       batch[0],
+                      'attention_mask':  batch[1],
+                      #'token_type_ids':  None if args.model_type == 'xlm' else batch[2],
+                      'token_type_ids': batch[2],
+                      'labels':         batch[3]}
+            # if args.model_type in ['xlnet', 'xlm']:
+            #     inputs.update({'cls_index': batch[5],
+            #                    'p_mask':       batch[6]})
+            outputs = model(**inputs)
+            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
+
+            if args.n_gpu > 1:
+                loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training
+            if args.gradient_accumulation_steps > 1:
+                loss = loss / args.gradient_accumulation_steps
+
+            if args.fp16:
+                with amp.scale_loss(loss, optimizer) as scaled_loss:
+                    scaled_loss.backward()
+                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
+            else:
+                loss.backward()
+                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+
+            tr_loss += loss.item()
+            if (step + 1) % args.gradient_accumulation_steps == 0:
+                optimizer.step()
+                scheduler.step()  # Update learning rate schedule
+                model.zero_grad()
+                global_step += 1
+
+                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
+                    # Log metrics
+                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
+                        results = evaluate(args, model, tokenizer)
+                        for key, value in results.items():
+                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
+                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
+                    tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
+                    logging_loss = tr_loss
+
+                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
+                    # Save model checkpoint
+                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
+                    if not os.path.exists(output_dir):
+                        os.makedirs(output_dir)
+                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+                    model_to_save.save_pretrained(output_dir)
+                    tokenizer.save_vocabulary(output_dir)
+                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
+                    logger.info("Saving model checkpoint to %s", output_dir)
+
+            if args.max_steps > 0 and global_step > args.max_steps:
+                epoch_iterator.close()
+                break
+        if args.max_steps > 0 and global_step > args.max_steps:
+            train_iterator.close()
+            break
+
+    if args.local_rank in [-1, 0]:
+        tb_writer.close()
+
+    return global_step, tr_loss / global_step
+
+def evaluate(args, model, tokenizer, prefix=""):
+    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)
+
+    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
+        os.makedirs(args.output_dir)
+
+    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
+    # Note that DistributedSampler samples randomly
+    eval_sampler = SequentialSampler(dataset) if args.local_rank == -1 else DistributedSampler(dataset)
+    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
+
+    # Eval!
+    logger.info("***** Running evaluation {} *****".format(prefix))
+    logger.info("  Num examples = %d", len(dataset))
+    logger.info("  Batch size = %d", args.eval_batch_size)
+
+
+    eval_loss, eval_accuracy = 0, 0
+    nb_eval_steps, nb_eval_examples = 0, 0
+
+    for batch in tqdm(eval_dataloader, desc="Evaluating"):
+        model.eval()
+        batch = tuple(t.to(args.device) for t in batch)
+        with torch.no_grad():
+            inputs = {'input_ids':      batch[0],
+                      'attention_mask': batch[1],
+                      # 'token_type_ids': None if args.model_type == 'xlm' else batch[2]  # XLM don't use segment_ids
+                      'token_type_ids': batch[2],
+                      'labels':         batch[3]}
+
+            # if args.model_type in ['xlnet', 'xlm']:
+            #     inputs.update({'cls_index': batch[4],
+            #                    'p_mask':    batch[5]})
+            outputs = model(**inputs)
+            tmp_eval_loss, logits = outputs[:2]
+            eval_loss += tmp_eval_loss.mean().item()
+
+        logits = logits.detach().cpu().numpy()
+        label_ids = inputs['labels'].to('cpu').numpy()
+        tmp_eval_accuracy = accuracy(logits, label_ids)
+        eval_accuracy += tmp_eval_accuracy
+
+        nb_eval_steps += 1
+        nb_eval_examples += inputs['input_ids'].size(0)
+
+    eval_loss = eval_loss / nb_eval_steps
+    eval_accuracy = eval_accuracy / nb_eval_examples
+    result = {'eval_loss': eval_loss,
+              'eval_accuracy': eval_accuracy}
+
+    output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
+    with open(output_eval_file, "w") as writer:
+        logger.info("***** Eval results *****")
+        for key in sorted(result.keys()):
+            logger.info("%s = %s", key, str(result[key]))
+            writer.write("%s = %s\n" % (key, str(result[key])))
+
+    return result
+
+def main():
+    parser = argparse.ArgumentParser()
+
+    ## Required parameters
+    parser.add_argument("--train_file", default=None, type=str, required=True,
+                        help="SWAG csv for training. E.g., train.csv")
+    parser.add_argument("--predict_file", default=None, type=str, required=True,
+                        help="SWAG csv for predictions. E.g., val.csv or test.csv")
+    parser.add_argument("--model_type", default=None, type=str, required=True,
+                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
+    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
+                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
+    parser.add_argument("--output_dir", default=None, type=str, required=True,
+                        help="The output directory where the model checkpoints and predictions will be written.")
+
+    ## Other parameters
+    parser.add_argument("--config_name", default="", type=str,
+                        help="Pretrained config name or path if not the same as model_name")
+    parser.add_argument("--tokenizer_name", default="", type=str,
+                        help="Pretrained tokenizer name or path if not the same as model_name")
+    parser.add_argument("--max_seq_length", default=384, type=int,
+                        help="The maximum total input sequence length after tokenization. Sequences "
+                             "longer than this will be truncated, and sequences shorter than this will be padded.")
+    parser.add_argument("--do_train", action='store_true',
+                        help="Whether to run training.")
+    parser.add_argument("--do_eval", action='store_true',
+                        help="Whether to run eval on the dev set.")
+    parser.add_argument("--evaluate_during_training", action='store_true',
+                        help="Rul evaluation during training at each logging step.")
+    parser.add_argument("--do_lower_case", action='store_true',
+                        help="Set this flag if you are using an uncased model.")
+
+    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
+                        help="Batch size per GPU/CPU for training.")
+    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
+                        help="Batch size per GPU/CPU for evaluation.")
+    parser.add_argument("--learning_rate", default=5e-5, type=float,
+                        help="The initial learning rate for Adam.")
+    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
+                        help="Number of updates steps to accumulate before performing a backward/update pass.")
+    parser.add_argument("--weight_decay", default=0.0, type=float,
+                        help="Weight deay if we apply some.")
+    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
+                        help="Epsilon for Adam optimizer.")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float,
+                        help="Max gradient norm.")
+    parser.add_argument("--num_train_epochs", default=3.0, type=float,
+                        help="Total number of training epochs to perform.")
+    parser.add_argument("--max_steps", default=-1, type=int,
+                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
+    parser.add_argument("--warmup_steps", default=0, type=int,
+                        help="Linear warmup over warmup_steps.")
+
+    parser.add_argument('--logging_steps', type=int, default=50,
+                        help="Log every X updates steps.")
+    parser.add_argument('--save_steps', type=int, default=50,
+                        help="Save checkpoint every X updates steps.")
+    parser.add_argument("--eval_all_checkpoints", action='store_true',
+                        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
+    parser.add_argument("--no_cuda", action='store_true',
+                        help="Whether not to use CUDA when available")
+    parser.add_argument('--overwrite_output_dir', action='store_true',
+                        help="Overwrite the content of the output directory")
+    parser.add_argument('--overwrite_cache', action='store_true',
+                        help="Overwrite the cached training and evaluation sets")
+    parser.add_argument('--seed', type=int, default=42,
+                        help="random seed for initialization")
+
+    parser.add_argument("--local_rank", type=int, default=-1,
+                        help="local_rank for distributed training on gpus")
+    parser.add_argument('--fp16', action='store_true',
+                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
+    parser.add_argument('--fp16_opt_level', type=str, default='O1',
+                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
+                             "See details at https://nvidia.github.io/apex/amp.html")
+    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
+    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
+    args = parser.parse_args()
+
+    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
+        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
+
+    # Setup distant debugging if needed
+    if args.server_ip and args.server_port:
+        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
+        import ptvsd
+        print("Waiting for debugger attach")
+        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
+        ptvsd.wait_for_attach()
+
+    # Setup CUDA, GPU & distributed training
+    if args.local_rank == -1 or args.no_cuda:
+        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+        args.n_gpu = torch.cuda.device_count()
+    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
+        torch.cuda.set_device(args.local_rank)
+        device = torch.device("cuda", args.local_rank)
+        torch.distributed.init_process_group(backend='nccl')
+        args.n_gpu = 1
+    args.device = device
+
+    # Setup logging
+    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
+                        datefmt = '%m/%d/%Y %H:%M:%S',
+                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
+    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+                    args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
+
+    # Set seed
+    set_seed(args)
+
+    # Load pretrained model and tokenizer
+    if args.local_rank not in [-1, 0]:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
+
+    args.model_type = args.model_type.lower()
+    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
+    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
+    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case)
+    model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config)
+
+    if args.local_rank == 0:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
+
+    model.to(args.device)
+
+    logger.info("Training/evaluation parameters %s", args)
+
+    # Training
+    if args.do_train:
+        train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False)
+        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
+        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
+
+
+    # Save the trained model and the tokenizer
+    if args.local_rank == -1 or torch.distributed.get_rank() == 0:
+        # Create output directory if needed
+        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
+            os.makedirs(args.output_dir)
+
+        logger.info("Saving model checkpoint to %s", args.output_dir)
+        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
+        # They can then be reloaded using `from_pretrained()`
+        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+        model_to_save.save_pretrained(args.output_dir)
+        tokenizer.save_pretrained(args.output_dir)
+
+        # Good practice: save your training arguments together with the trained model
+        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
+
+        # Load a trained model and vocabulary that you have fine-tuned
+        model = model_class.from_pretrained(args.output_dir)
+        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
+        model.to(args.device)
+
+
+    # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
+    results = {}
+    if args.do_eval and args.local_rank in [-1, 0]:
+        if args.do_train:
+            checkpoints = [args.output_dir]
+        else:
+            # if do_train is False and do_eval is true, load model directly from pretrained.
+            checkpoints = [args.model_name_or_path]
+
+        if args.eval_all_checkpoints:
+            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
+            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce model loading logs
+
+        logger.info("Evaluate the following checkpoints: %s", checkpoints)
+
+        for checkpoint in checkpoints:
+            # Reload the model
+            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
+            model = model_class.from_pretrained(checkpoint)
+            tokenizer = tokenizer_class.from_pretrained(checkpoint)
+            model.to(args.device)
+
+            # Evaluate
+            result = evaluate(args, model, tokenizer, prefix=global_step)
+
+            result = dict((k + ('_{}'.format(global_step) if global_step else ''), v) for k, v in result.items())
+            results.update(result)
+
+    logger.info("Results: {}".format(results))
+
+    return results
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/single_model_scripts/run_transfo_xl.py
+++ b/examples/single_model_scripts/run_transfo_xl.py
@@ -28,7 +28,7 @@ import math

 import torch

-from pytorch_transformers import TransfoXLLMHeadModel, TransfoXLCorpus, TransfoXLTokenizer
+from transformers import TransfoXLLMHeadModel, TransfoXLCorpus, TransfoXLTokenizer

 logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
@@ -113,7 +113,7 @@ def main():
        with torch.no_grad():
            mems = None
            for idx, (data, target, seq_len) in enumerate(eval_iter):
-                ret = model(data, target, mems)
+                ret = model(data, lm_labels=target, mems=mems)
                loss, _, mems = ret
                loss = loss.mean()
                total_loss += seq_len * loss.item()
--- a/examples/distillation/README.md
+++ b/examples/distillation/README.md
@@ -1,20 +1,40 @@
-# DistilBERT
+# Distil*

-This folder contains the original code used to train DistilBERT as well as examples showcasing how to use DistilBERT.
+This folder contains the original code used to train Distil* as well as examples showcasing how to use DistilBERT and DistilGPT2.

-## What is DistilBERT
+**2019, October 3rd - Update** We release our [NeurIPS workshop paper](https://arxiv.org/abs/1910.01108) explaining our approach on **DistilBERT**. It includes updated results and further experiments. We applied the same method to GPT2 and release the weights of **DistilGPT2**. DistilGPT2 is two times faster and 33% smaller than GPT2.

-DistilBERT stands for Distillated-BERT. DistilBERT is a small, fast, cheap and light Transformer model based on Bert architecture. It has 40% less parameters than `bert-base-uncased`, runs 60% faster while preserving over 95% of Bert's performances as measured on the GLUE language understanding benchmark. DistilBERT is trained using knowledge distillation, a technique to compress a large model called the teacher into a smaller model called the student. By distillating Bert, we obtain a smaller Transformer model that bears a lot of similarities with the original BERT model while being lighter, smaller and faster to run. DistilBERT is thus an interesting option to put large-scaled trained Transformer model into production.
+**2019, September 19th - Update:** We fixed bugs in the code and released an upadted version of the weights trained with a modification of the distillation loss. DistilBERT now reaches 97% of `BERT-base`'s performance on GLUE, and 86.9 F1 score on SQuAD v1.1 dev set (compared to 88.5 for `BERT-base`). We will publish a formal write-up of our approach in the near future!

-For more information on DistilBERT, please refer to our [detailed blog post](https://medium.com/huggingface/smaller-faster-cheaper-lighter-introducing-distilbert-a-distilled-version-of-bert-8cf3380435b5
-).
+## What is Distil*
+
+Distil* is a class of compressed models that started with DistilBERT. DistilBERT stands for Distillated-BERT. DistilBERT is a small, fast, cheap and light Transformer model based on Bert architecture. It has 40% less parameters than `bert-base-uncased`, runs 60% faster while preserving 97% of BERT's performances as measured on the GLUE language understanding benchmark. DistilBERT is trained using knowledge distillation, a technique to compress a large model called the teacher into a smaller model called the student. By distillating Bert, we obtain a smaller Transformer model that bears a lot of similarities with the original BERT model while being lighter, smaller and faster to run. DistilBERT is thus an interesting option to put large-scaled trained Transformer model into production.
+
+We have applied the same method to GPT2 and release the weights of the compressed model. On the [WikiText-103](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/) benchmark, GPT2 reaches a perplexity on the test set of 15.0 compared to 18.5 for DistilGPT2 (after fine-tuning on the train set).
+
+For more information on DistilBERT, please refer to our [NeurIPS workshop paper](https://arxiv.org/abs/1910.01108). The paper superseeds our [previous blogpost](https://medium.com/huggingface/distilbert-8cf3380435b5) with a different distillation loss and better performances.
+
+Here are the results on the dev sets of GLUE:
+
+| Model      | Macro-score | CoLA | MNLI | MRPC | QNLI | QQP  | RTE  | SST-2| STS-B| WNLI |
+| :---:      |    :---:    | :---:| :---:| :---:| :---:| :---:| :---:| :---:| :---:| :---:|
+| BERT-base  |  **77.6**   | 48.9 | 84.3 | 88.6 | 89.3 | 89.5 | 71.3 | 91.7 | 91.2 | 43.7 |
+| DistilBERT |  **76.8**   | 49.1 | 81.8 | 90.2 | 90.2 | 89.2 | 62.9 | 92.7 | 90.7 | 44.4 |
+
+## Setup
+
+This part of the library has only be tested with Python3.6+. There are few specific dependencies to install before launching a distillation, you can install them with the command `pip install -r requirements.txt`. 
+
+**Important note:** The training scripts have been updated to support PyTorch v1.2.0 (there are breakings changes compared to v1.1.0). It is important to note that there is a small internal bug in the current version of PyTorch available on pip that causes a memory leak in our training/distillation. It has been recently fixed and will likely be integrated into the next release. For the moment, we recommend to [compile PyTorch from source](https://github.com/pytorch/pytorch#from-source). Please refer to [issue 1179](https://github.com/huggingface/transformers/issues/1179) for more details.

 ## How to use DistilBERT

-PyTorch-Transformers includes two pre-trained DistilBERT models, currently only provided for English (we are investigating the possibility to train and release a multilingual version of DistilBERT):
+Transformers includes two pre-trained Distil* models, currently only provided for English (we are investigating the possibility to train and release a multilingual version of DistilBERT):

 - `distilbert-base-uncased`: DistilBERT English language model pretrained on the same data used to pretrain Bert (concatenation of the Toronto Book Corpus and full English Wikipedia) using distillation with the supervision of the `bert-base-uncased` version of Bert. The model has 6 layers, 768 dimension and 12 heads, totalizing 66M parameters.
- `distilbert-base-uncased-distilled-squad`: A finetuned version of `distilbert-base-uncased` finetuned using (a second step of) knwoledge distillation on SQuAD 1.0. This model reaches a F1 score of 86.2 on the dev set (for comparison, Bert `bert-base-uncased` version reaches a 88.5 F1 score).
+- `distilbert-base-uncased-distilled-squad`: A finetuned version of `distilbert-base-uncased` finetuned using (a second step of) knwoledge distillation on SQuAD 1.0. This model reaches a F1 score of 86.9 on the dev set (for comparison, Bert `bert-base-uncased` version reaches a 88.5 F1 score).
+- `distilgpt2`: DistilGPT2 English language model pretrained with the supervision of `gpt2` (the smallest version of GPT2) on [OpenWebTextCorpus](https://skylion007.github.io/OpenWebTextCorpus/), a reproduction of OpenAI's WebText dataset and . The model has 6 layers, 768 dimension and 12 heads, totalizing 82M (compared to 124M parameters for GPT2). On average, DistilGPT2 is two times faster than GPT2.
+- and more to come! 🤗🤗🤗

 Using DistilBERT is very similar to using BERT. DistilBERT share the same tokenizer as BERT's `bert-base-uncased` even though we provide a link to this tokenizer under the `DistilBertTokenizer` name to have a consistent naming between the library models.

@@ -27,9 +47,11 @@ outputs = model(input_ids)
 last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 ```

-## How to train DistilBERT
+Similarly, using DistilGPT2 simply consists in calling the GPT2 classes from a different pretrained checkpoint: `model = GPT2Model.from_pretrained('distilgpt2')`.

-In the following, we will explain how you can train your own compressed model.
+## How to train Distil*
+
+In the following, we will explain how you can train DistilBERT.

 ### A. Preparing the data

@@ -42,7 +64,8 @@ First, we will binarize the data, i.e. tokenize the data and convert each token
 ```bash
 python scripts/binarized_data.py \
    --file_path data/dump.txt \
-    --bert_tokenizer bert-base-uncased \
+    --tokenizer_type bert \
+    --tokenizer_name bert-base-uncased \
    --dump_file data/binarized_text
 ```

@@ -51,7 +74,8 @@ Our implementation of masked language modeling loss follows [XLM](https://github
 ```bash
 python scripts/token_counts.py \
    --data_file data/binarized_text.bert-base-uncased.pickle \
-    --token_counts_dump data/token_counts.bert-base-uncased.pickle
+    --token_counts_dump data/token_counts.bert-base-uncased.pickle \
+    --vocab_size 30522
 ```

 ### B. Training
@@ -60,6 +84,12 @@ Training with distillation is really simple once you have pre-processed the data

 ```bash
 python train.py \
+    --student_type distilbert \
+    --student_config training_configs/distilbert-base-uncased.json \
+    --teacher_type bert \
+    --teacher_name bert-base-uncased \
+    --alpha_ce 5.0 --alpha_mlm 2.0 --alpha_cos 1.0 --mlm \
+    --freeze_pos_embs \
    --dump_path serialization_dir/my_first_training \
    --data_file data/binarized_text.bert-base-uncased.pickle \
    --token_counts data/token_counts.bert-base-uncased.pickle \
@@ -68,7 +98,7 @@ python train.py \

 By default, this will launch a training on a single GPU (even if more are available on the cluster). Other parameters are available in the command line, please look in `train.py` or run `python train.py --help` to list them.

-We highly encourage you to use distributed training for training DistilBert as the training corpus is quite large. Here's an example that runs a distributed training on a single node having 4 GPUs:
+We highly encourage you to use distributed training for training DistilBERT as the training corpus is quite large. Here's an example that runs a distributed training on a single node having 4 GPUs:

 ```bash
 export NODE_RANK=0
@@ -90,11 +120,17 @@ python -m torch.distributed.launch \
    train.py \
        --force \
        --n_gpu $WORLD_SIZE \
+        --student_type distilbert \
+        --student_config training_configs/distilbert-base-uncased.json \
+        --teacher_type bert \
+        --teacher_name bert-base-uncased \
+        --alpha_ce 0.33 --alpha_mlm 0.33 --alpha_cos 0.33 --mlm \
+        --freeze_pos_embs \
+        --dump_path serialization_dir/my_first_training \
        --data_file data/binarized_text.bert-base-uncased.pickle \
-        --token_counts data/token_counts.bert-base-uncased.pickle \
-        --dump_path serialization_dir/my_first_distillation
+        --token_counts data/token_counts.bert-base-uncased.pickle
 ```

-**Tips:** Starting distillated training with good initialization of the model weights is crucial to reach decent performance. In our experiments, we initialized our model from a few layers of the teacher (Bert) itself! Please refer to `scripts/extract_for_distil.py` to create a valid initialization checkpoint and use `--from_pretrained_weights` and `--from_pretrained_config` arguments to use this initialization for the distilled training!
+**Tips:** Starting distillated training with good initialization of the model weights is crucial to reach decent performance. In our experiments, we initialized our model from a few layers of the teacher (Bert) itself! Please refer to `scripts/extract.py` and `scripts/extract_distilbert.py` to create a valid initialization checkpoint and use `--student_pretrained_weights` argument to use this initialization for the distilled training!

 Happy distillation!
--- a/examples/distillation/distiller.py
+++ b/examples/distillation/distiller.py
@@ -12,28 +12,35 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" The distiller to distil DistilBERT
-    adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM)
+""" The distiller to distil the student.
+    Adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM)
 """
 import os
 import math
+import psutil
+import time
 from tensorboardX import SummaryWriter
 from tqdm import trange, tqdm
 import numpy as np
+import psutil

 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+from torch.optim import AdamW
+from torch.utils.data.distributed import DistributedSampler
+from torch.utils.data import RandomSampler, BatchSampler, DataLoader

-from pytorch_transformers import AdamW, WarmupLinearSchedule
+from transformers import WarmupLinearSchedule

 from utils import logger
-from dataset import Dataset
+from lm_seqs_dataset import LmSeqsDataset
+from grouped_batch_sampler import GroupedBatchSampler, create_lengths_groups

 class Distiller:
    def __init__(self,
                 params: dict,
-                 dataloader: Dataset,
+                 dataset: LmSeqsDataset,
                 token_probs: torch.tensor,
                 student: nn.Module,
                 teacher: nn.Module):
@@ -46,31 +53,47 @@ class Distiller:
        self.student = student
        self.teacher = teacher

-        self.dataloader = dataloader
-        if self.params.n_gpu > 1:
-            self.dataloader.split()
-        self.get_iterator(seed=params.seed)
+        self.student_config = student.config
+        self.vocab_size = student.config.vocab_size
+
+        if params.n_gpu <= 1:
+            sampler = RandomSampler(dataset)
+        else:
+            sampler = DistributedSampler(dataset)
+
+        if params.group_by_size:
+            groups = create_lengths_groups(lengths=dataset.lengths, k=params.max_model_input_size)
+            sampler = GroupedBatchSampler(sampler=sampler, group_ids=groups, batch_size=params.batch_size)
+        else:
+            sampler = BatchSampler(sampler=sampler, batch_size=params.batch_size, drop_last=False)
+
+        self.dataloader = DataLoader(dataset=dataset,
+                                     batch_sampler=sampler,
+                                     collate_fn=dataset.batch_sequences)

        self.temperature = params.temperature
        assert self.temperature > 0.

        self.alpha_ce = params.alpha_ce
        self.alpha_mlm = params.alpha_mlm
+        self.alpha_clm = params.alpha_clm
        self.alpha_mse = params.alpha_mse
-        assert self.alpha_ce >= 0.
-        assert self.alpha_mlm >= 0.
-        assert self.alpha_mse >= 0.
-        assert self.alpha_ce + self.alpha_mlm + self.alpha_mse > 0.
+        self.alpha_cos = params.alpha_cos

-        self.mlm_mask_prop = params.mlm_mask_prop
-        assert 0.0 <= self.mlm_mask_prop <= 1.0
-        assert params.word_mask + params.word_keep + params.word_rand == 1.0
-        self.pred_probs = torch.FloatTensor([params.word_mask, params.word_keep, params.word_rand])
-        self.pred_probs = self.pred_probs.to(f'cuda:{params.local_rank}') if params.n_gpu > 0 else self.pred_probs
-        self.token_probs = token_probs.to(f'cuda:{params.local_rank}') if params.n_gpu > 0 else token_probs
-        if self.fp16:
-            self.pred_probs = self.pred_probs.half()
-            self.token_probs = self.token_probs.half()
+        self.mlm = params.mlm
+        if self.mlm:
+            logger.info(f'Using MLM loss for LM step.')
+            self.mlm_mask_prop = params.mlm_mask_prop
+            assert 0.0 <= self.mlm_mask_prop <= 1.0
+            assert params.word_mask + params.word_keep + params.word_rand == 1.0
+            self.pred_probs = torch.FloatTensor([params.word_mask, params.word_keep, params.word_rand])
+            self.pred_probs = self.pred_probs.to(f'cuda:{params.local_rank}') if params.n_gpu > 0 else self.pred_probs
+            self.token_probs = token_probs.to(f'cuda:{params.local_rank}') if params.n_gpu > 0 else token_probs
+            if self.fp16:
+                self.pred_probs = self.pred_probs.half()
+                self.token_probs = self.token_probs.half()
+        else:
+            logger.info(f'Using CLM loss for LM step.')

        self.epoch = 0
        self.n_iter = 0
@@ -80,17 +103,22 @@ class Distiller:
        self.last_loss = 0
        self.last_loss_ce = 0
        self.last_loss_mlm = 0
-        self.last_loss_mse = 0
+        self.last_loss_clm = 0
+        if self.alpha_mse > 0.: self.last_loss_mse = 0
+        if self.alpha_cos > 0.: self.last_loss_cos = 0
+        self.last_log = 0

        self.ce_loss_fct = nn.KLDivLoss(reduction='batchmean')
-        self.mlm_loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
-        self.mse_loss_fct = nn.MSELoss(reduction='sum')
+        self.lm_loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
+        if self.alpha_mse > 0.:
+            self.mse_loss_fct = nn.MSELoss(reduction='sum')
+        if self.alpha_cos > 0.:
+            self.cosine_loss_fct = nn.CosineEmbeddingLoss(reduction='mean')

        logger.info('--- Initializing model optimizer')
        assert params.gradient_accumulation_steps >= 1
-        self.num_steps_epoch = int(len(self.dataloader) / params.batch_size) + 1
+        self.num_steps_epoch = len(self.dataloader)
        num_train_optimization_steps = int(self.num_steps_epoch / params.gradient_accumulation_steps * params.n_epoch) + 1
-        warmup_steps = math.ceil(num_train_optimization_steps * params.warmup_prop)

        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
@@ -103,9 +131,11 @@ class Distiller:
                               lr=params.learning_rate,
                               eps=params.adam_epsilon,
                               betas=(0.9, 0.98))
+
+        warmup_steps = math.ceil(num_train_optimization_steps * params.warmup_prop)
        self.scheduler = WarmupLinearSchedule(self.optimizer,
-                                              warmup_steps=warmup_steps,
-                                              t_total=num_train_optimization_steps)
+                                                warmup_steps=warmup_steps,
+                                                t_total=num_train_optimization_steps)

        if self.fp16:
            try:
@@ -128,43 +158,18 @@ class Distiller:
                logger.info("Using nn.parallel.DistributedDataParallel for distributed training.")
                self.student = DistributedDataParallel(self.student,
                                                       device_ids=[params.local_rank],
-                                                       output_device=params.local_rank)
+                                                       output_device=params.local_rank,
+                                                       find_unused_parameters=True)

        self.is_master = params.is_master
        if self.is_master:
            logger.info('--- Initializing Tensorboard')
            self.tensorboard = SummaryWriter(log_dir=os.path.join(self.dump_path, 'log', 'train'))
-            self.tensorboard.add_text(tag='config', text_string=str(self.params), global_step=0)
+            self.tensorboard.add_text(tag='config/training', text_string=str(self.params), global_step=0)
+            self.tensorboard.add_text(tag='config/student', text_string=str(self.student_config), global_step=0)

-    def get_iterator(self,
-                     seed: int = None):
-        """
-        Initialize the data iterator.
-        Each process has its own data iterator (iterating on his own random portion of the dataset).
-
-        Input:
-        ------
-            seed: `int` - The random seed.
-        """
-        logger.info('--- Initializing Data Iterator')
-        self.data_iterator = self.dataloader.get_iterator(seed=seed)
-
-    def get_batch(self):
-        """
-        Call the data iterator to output a new batch.
-        If the data iterator went through the whole dataset, create a new iterator.
-        """
-        assert hasattr(self, 'data_iterator')
-        try:
-            x = next(self.data_iterator)
-        except StopIteration:
-            logger.warning('--- Went through the whole dataset. Creating new data iterator.')
-            self.data_iterator = self.dataloader.get_iterator()
-            x = next(self.data_iterator)
-        return x
-
-    def prepare_batch(self,
-                      batch):
+    def prepare_batch_mlm(self,
+                          batch):
        """
        Prepare the batch: from the token_ids and the lenghts, compute the attention mask and the masked label for MLM.

@@ -192,7 +197,7 @@ class Distiller:
        x_prob = self.token_probs[token_ids.flatten()]
        n_tgt = math.ceil(self.mlm_mask_prop * lengths.sum().item())
        tgt_ids = torch.multinomial(x_prob / x_prob.sum(), n_tgt, replacement=False)
-        pred_mask = torch.zeros(bs * max_seq_len, dtype=torch.uint8, device=token_ids.device)
+        pred_mask = torch.zeros(bs * max_seq_len, dtype=torch.bool, device=token_ids.device) # previously `dtype=torch.uint8`, cf pytorch 1.2.0 compatibility
        pred_mask[tgt_ids] = 1
        pred_mask = pred_mask.view(bs, max_seq_len)

@@ -210,16 +215,49 @@ class Distiller:
                assert pred_mask.sum().item() % 8 == 0, pred_mask.sum().item()

        _token_ids_real = token_ids[pred_mask]
-        _token_ids_rand = _token_ids_real.clone().random_(self.params.vocab_size)
+        _token_ids_rand = _token_ids_real.clone().random_(self.vocab_size)
        _token_ids_mask = _token_ids_real.clone().fill_(self.params.special_tok_ids['mask_token'])
        probs = torch.multinomial(self.pred_probs, len(_token_ids_real), replacement=True)
        _token_ids = _token_ids_mask * (probs == 0).long() + _token_ids_real * (probs == 1).long() + _token_ids_rand * (probs == 2).long()
        token_ids = token_ids.masked_scatter(pred_mask, _token_ids)

-        mlm_labels[1-pred_mask] = -1
+        mlm_labels[~pred_mask] = -1 # previously `mlm_labels[1-pred_mask] = -1`, cf pytorch 1.2.0 compatibility
+
+        # sanity checks
+        assert 0 <= token_ids.min() <= token_ids.max() < self.vocab_size

        return token_ids, attn_mask, mlm_labels

+    def prepare_batch_clm(self,
+                          batch):
+        """
+        Prepare the batch: from the token_ids and the lenghts, compute the attention mask and the labels for CLM.
+
+        Input:
+        ------
+            batch: `Tuple`
+                token_ids: `torch.tensor(bs, seq_length)` - The token ids for each of the sequence. It is padded.
+                lengths: `torch.tensor(bs)` - The lengths of each of the sequences in the batch.
+
+        Output:
+        -------
+            token_ids: `torch.tensor(bs, seq_length)` - The token ids after the modifications for MLM.
+            attn_mask: `torch.tensor(bs, seq_length)` - The attention mask for the self-attention.
+            clm_labels: `torch.tensor(bs, seq_length)` - The causal languge modeling labels. There is a -1 where there is nothing to predict.
+        """
+        token_ids, lengths = batch
+        token_ids, lengths = self.round_batch(x=token_ids, lengths=lengths)
+        assert token_ids.size(0) == lengths.size(0)
+
+        attn_mask = (torch.arange(token_ids.size(1), dtype=torch.long, device=lengths.device) < lengths[:, None])
+        clm_labels = token_ids.new(token_ids.size()).copy_(token_ids)
+        clm_labels[~attn_mask] = -1 # previously `clm_labels[1-attn_mask] = -1`, cf pytorch 1.2.0 compatibility
+
+        # sanity checks
+        assert 0 <= token_ids.min() <= token_ids.max() < self.vocab_size
+
+        return token_ids, attn_mask, clm_labels
+
    def round_batch(self,
                    x: torch.tensor,
                    lengths: torch.tensor):
@@ -257,7 +295,10 @@ class Distiller:
        if ml1 % 8 != 0:
            pad = 8 - (ml1 % 8)
            ml2 = ml1 + pad
-            pad_id = self.params.special_tok_ids['pad_token']
+            if self.mlm:
+                pad_id = self.params.special_tok_ids['pad_token']
+            else:
+                pad_id = self.params.special_tok_ids['unk_token']
            padding_tensor = torch.zeros(bs2, pad, dtype=torch.long, device=x.device).fill_(pad_id)
            x = torch.cat([x, padding_tensor], 1)
            assert x.size() == (bs2, ml2)
@@ -271,20 +312,25 @@ class Distiller:
        The real training loop.
        """
        if self.is_master: logger.info('Starting training')
+        self.last_log = time.time()
        self.student.train()
        self.teacher.eval()

        for _ in range(self.params.n_epoch):
            if self.is_master: logger.info(f'--- Starting epoch {self.epoch}/{self.params.n_epoch-1}')
+            if self.multi_gpu:
+                torch.distributed.barrier()

-            iter_bar = trange(self.num_steps_epoch, desc="-Iter", disable=self.params.local_rank not in [-1, 0])
-            for __ in range(self.num_steps_epoch):
-                batch = self.get_batch()
+            iter_bar = tqdm(self.dataloader, desc="-Iter", disable=self.params.local_rank not in [-1, 0])
+            for batch in iter_bar:
                if self.params.n_gpu > 0:
                    batch = tuple(t.to(f'cuda:{self.params.local_rank}') for t in batch)
-                token_ids, attn_mask, mlm_labels = self.prepare_batch(batch=batch)

-                self.step(input_ids=token_ids, attention_mask=attn_mask, mlm_labels=mlm_labels)
+                if self.mlm:
+                    token_ids, attn_mask, lm_labels = self.prepare_batch_mlm(batch=batch)
+                else:
+                    token_ids, attn_mask, lm_labels = self.prepare_batch_clm(batch=batch)
+                self.step(input_ids=token_ids, attention_mask=attn_mask, lm_labels=lm_labels)

                iter_bar.update()
                iter_bar.set_postfix({'Last_loss': f'{self.last_loss:.2f}',
@@ -294,12 +340,15 @@ class Distiller:
            if self.is_master: logger.info(f'--- Ending epoch {self.epoch}/{self.params.n_epoch-1}')
            self.end_epoch()

-        if self.is_master: logger.info('Training is finished')
+        if self.is_master:
+            logger.info(f'Save very last checkpoint as `pytorch_model.bin`.')
+            self.save_checkpoint(checkpoint_name=f'pytorch_model.bin')
+            logger.info('Training is finished')

    def step(self,
             input_ids: torch.tensor,
             attention_mask: torch.tensor,
-             mlm_labels: torch.tensor):
+             lm_labels: torch.tensor):
        """
        One optimization step: forward of student AND teacher, backward on the loss (for gradient accumulation),
        and possibly a parameter update (depending on the gradient accumulation).
@@ -308,17 +357,22 @@ class Distiller:
        ------
        input_ids: `torch.tensor(bs, seq_length)` - The token ids.
        attention_mask: `torch.tensor(bs, seq_length)` - The attention mask for self attention.
-        mlm_labels: `torch.tensor(bs, seq_length)` - The masked language modeling labels.
+        lm_labels: `torch.tensor(bs, seq_length)` - The language modeling labels (mlm labels for MLM and clm labels for CLM).
        """
-        s_logits = self.student(input_ids=input_ids, attention_mask=attention_mask)[0]     # (bs, seq_length, voc_size)
-        with torch.no_grad():
-            t_logits = self.teacher(input_ids=input_ids, attention_mask=attention_mask)[0] # (bs, seq_length, voc_size)
+        if self.mlm:
+            s_logits, s_hidden_states = self.student(input_ids=input_ids, attention_mask=attention_mask)     # (bs, seq_length, voc_size)
+            with torch.no_grad():
+                t_logits, t_hidden_states = self.teacher(input_ids=input_ids, attention_mask=attention_mask) # (bs, seq_length, voc_size)
+        else:
+            s_logits, _, s_hidden_states = self.student(input_ids=input_ids, attention_mask=None)            # (bs, seq_length, voc_size)
+            with torch.no_grad():
+                t_logits, _, t_hidden_states = self.teacher(input_ids=input_ids, attention_mask=None)           # (bs, seq_length, voc_size)
        assert s_logits.size() == t_logits.size()

        #https://github.com/peterliht/knowledge-distillation-pytorch/blob/master/model/net.py#L100
        #https://github.com/peterliht/knowledge-distillation-pytorch/issues/2
        if self.params.restrict_ce_to_mask:
-            mask = (mlm_labels>-1).unsqueeze(-1).expand_as(s_logits)   # (bs, seq_lenth, voc_size)
+            mask = (lm_labels>-1).unsqueeze(-1).expand_as(s_logits)    # (bs, seq_lenth, voc_size)
        else:
            mask = attention_mask.unsqueeze(-1).expand_as(s_logits)    # (bs, seq_lenth, voc_size)
        s_logits_slct = torch.masked_select(s_logits, mask)            # (bs * seq_length * voc_size) modulo the 1s in mask
@@ -330,20 +384,47 @@ class Distiller:
        loss_ce = self.ce_loss_fct(F.log_softmax(s_logits_slct/self.temperature, dim=-1),
                                   F.softmax(t_logits_slct/self.temperature, dim=-1)) * (self.temperature)**2
        loss = self.alpha_ce*loss_ce
+
        if self.alpha_mlm > 0.:
-            loss_mlm = self.mlm_loss_fct(s_logits.view(-1, s_logits.size(-1)), mlm_labels.view(-1))
+            loss_mlm = self.lm_loss_fct(s_logits.view(-1, s_logits.size(-1)), lm_labels.view(-1))
            loss += self.alpha_mlm * loss_mlm
+        if self.alpha_clm > 0.:
+            shift_logits = s_logits[..., :-1, :].contiguous()
+            shift_labels = lm_labels[..., 1:].contiguous()
+            loss_clm = self.lm_loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
+                                        shift_labels.view(-1))
+            loss += self.alpha_clm * loss_clm
+
        if self.alpha_mse > 0.:
            loss_mse = self.mse_loss_fct(s_logits_slct, t_logits_slct)/s_logits_slct.size(0) # Reproducing batchmean reduction
            loss += self.alpha_mse * loss_mse
+        if self.alpha_cos > 0.:
+            s_hidden_states = s_hidden_states[-1]                              # (bs, seq_length, dim)
+            t_hidden_states = t_hidden_states[-1]                              # (bs, seq_length, dim)
+            mask = attention_mask.unsqueeze(-1).expand_as(s_hidden_states)     # (bs, seq_length, dim)
+            assert s_hidden_states.size() == t_hidden_states.size()
+            dim = s_hidden_states.size(-1)
+            
+            s_hidden_states_slct = torch.masked_select(s_hidden_states, mask)        # (bs * seq_length * dim)
+            s_hidden_states_slct = s_hidden_states_slct.view(-1, dim)                # (bs * seq_length, dim)
+            t_hidden_states_slct = torch.masked_select(t_hidden_states, mask)        # (bs * seq_length * dim)
+            t_hidden_states_slct = t_hidden_states_slct.view(-1, dim)                # (bs * seq_length, dim)
+        
+            target = s_hidden_states_slct.new(s_hidden_states_slct.size(0)).fill_(1) # (bs * seq_length,)
+            loss_cos = self.cosine_loss_fct(s_hidden_states_slct, t_hidden_states_slct, target)
+            loss += self.alpha_cos * loss_cos

        self.total_loss_epoch += loss.item()
        self.last_loss = loss.item()
        self.last_loss_ce = loss_ce.item()
        if self.alpha_mlm > 0.:
            self.last_loss_mlm = loss_mlm.item()
+        if self.alpha_clm > 0.:
+            self.last_loss_clm = loss_clm.item()
        if self.alpha_mse > 0.:
            self.last_loss_mse = loss_mse.item()
+        if self.alpha_cos > 0.:
+            self.last_loss_cos = loss_cos.item()

        self.optimize(loss)

@@ -379,9 +460,9 @@ class Distiller:
                torch.nn.utils.clip_grad_norm_(amp.master_params(self.optimizer), self.params.max_grad_norm)
            else:
                torch.nn.utils.clip_grad_norm_(self.student.parameters(), self.params.max_grad_norm)
-            self.scheduler.step()
            self.optimizer.step()
            self.optimizer.zero_grad()
+            self.scheduler.step()

    def iter(self):
        """
@@ -392,6 +473,7 @@ class Distiller:

        if self.n_total_iter % self.params.log_interval == 0:
            self.log_tensorboard()
+            self.last_log = time.time()
        if self.n_total_iter % self.params.checkpoint_interval == 0:
            self.save_checkpoint()

@@ -415,9 +497,16 @@ class Distiller:
        self.tensorboard.add_scalar(tag="losses/loss_ce", scalar_value=self.last_loss_ce, global_step=self.n_total_iter)
        if self.alpha_mlm > 0.:
            self.tensorboard.add_scalar(tag="losses/loss_mlm", scalar_value=self.last_loss_mlm, global_step=self.n_total_iter)
+        if self.alpha_clm > 0.:
+            self.tensorboard.add_scalar(tag="losses/loss_clm", scalar_value=self.last_loss_clm, global_step=self.n_total_iter)
        if self.alpha_mse > 0.:
            self.tensorboard.add_scalar(tag="losses/loss_mse", scalar_value=self.last_loss_mse, global_step=self.n_total_iter)
+        if self.alpha_cos > 0.:
+            self.tensorboard.add_scalar(tag="losses/loss_cos", scalar_value=self.last_loss_cos, global_step=self.n_total_iter)
        self.tensorboard.add_scalar(tag="learning_rate/lr", scalar_value=self.scheduler.get_lr()[0], global_step=self.n_total_iter)
+        
+        self.tensorboard.add_scalar(tag="global/memory_usage", scalar_value=psutil.virtual_memory()._asdict()['used']/1_000_000, global_step=self.n_total_iter)
+        self.tensorboard.add_scalar(tag="global/speed", scalar_value=time.time()-self.last_log, global_step=self.n_total_iter)

    def end_epoch(self):
        """
--- a/examples/distillation/grouped_batch_sampler.py
+++ b/examples/distillation/grouped_batch_sampler.py
@@ -0,0 +1,105 @@
+# coding=utf-8
+# Copyright 2019-present, the HuggingFace Inc. team and Facebook, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Adapted from PyTorch Vision (https://github.com/pytorch/vision/blob/master/references/detection/group_by_aspect_ratio.py)
+"""
+import bisect
+import copy
+from collections import defaultdict
+import numpy as np
+
+from torch.utils.data.sampler import BatchSampler, Sampler
+
+from utils import logger
+
+def _quantize(x, bins):
+    bins = copy.deepcopy(bins)
+    bins = sorted(bins)
+    quantized = list(map(lambda y: bisect.bisect_right(bins, y), x))
+    return quantized
+
+def create_lengths_groups(lengths, k=0):
+    bins = np.arange(start=3, stop=k, step=4).tolist() if k > 0 else [10]
+    groups = _quantize(lengths, bins)
+    # count number of elements per group
+    counts = np.unique(groups, return_counts=True)[1]
+    fbins = [0] + bins + [np.inf]
+    logger.info("Using {} as bins for aspect lengths quantization".format(fbins))
+    logger.info("Count of instances per bin: {}".format(counts))
+    return groups
+
+class GroupedBatchSampler(BatchSampler):
+    """
+    Wraps another sampler to yield a mini-batch of indices.
+    It enforces that the batch only contain elements from the same group.
+    It also tries to provide mini-batches which follows an ordering which is
+    as close as possible to the ordering from the original sampler.
+    Arguments:
+        sampler (Sampler): Base sampler.
+        group_ids (list[int]): If the sampler produces indices in range [0, N),
+            `group_ids` must be a list of `N` ints which contains the group id of each sample.
+            The group ids must be a continuous set of integers starting from
+            0, i.e. they must be in the range [0, num_groups).
+        batch_size (int): Size of mini-batch.
+    """
+    def __init__(self, sampler, group_ids, batch_size):
+        if not isinstance(sampler, Sampler):
+            raise ValueError(
+                "sampler should be an instance of "
+                "torch.utils.data.Sampler, but got sampler={}".format(sampler)
+            )
+        self.sampler = sampler
+        self.group_ids = group_ids
+        self.batch_size = batch_size
+
+    def __iter__(self):
+        buffer_per_group = defaultdict(list)
+        samples_per_group = defaultdict(list)
+
+        num_batches = 0
+        for idx in self.sampler:
+            group_id = self.group_ids[idx]
+            buffer_per_group[group_id].append(idx)
+            samples_per_group[group_id].append(idx)
+            if len(buffer_per_group[group_id]) == self.batch_size:
+                yield buffer_per_group[group_id] #TODO
+                num_batches += 1
+                del buffer_per_group[group_id]
+            assert len(buffer_per_group[group_id]) < self.batch_size
+
+        # now we have run out of elements that satisfy
+        # the group criteria, let's return the remaining
+        # elements so that the size of the sampler is
+        # deterministic
+        expected_num_batches = len(self)
+        num_remaining = expected_num_batches - num_batches
+        if num_remaining > 0:
+            # for the remaining batches, group the batches by similar lengths
+            batch_idx = []
+            for group_id, idxs in sorted(buffer_per_group.items(), key=lambda x: x[0]):
+                batch_idx.extend(idxs)
+                if len(batch_idx) >= self.batch_size:
+                    yield batch_idx[:self.batch_size]
+                    batch_idx = batch_idx[self.batch_size:]
+                    num_remaining -= 1
+            if len(batch_idx) > 0:
+                yield batch_idx
+                num_remaining -= 1
+        assert num_remaining == 0
+
+    def __len__(self):
+        """
+        Return the number of mini-batches rather than the number of samples.
+        """
+        return (len(self.sampler) + self.batch_size - 1) // self.batch_size
--- a/examples/distillation/lm_seqs_dataset.py
+++ b/examples/distillation/lm_seqs_dataset.py
@@ -12,30 +12,33 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" Dataloaders to train DistilBERT
+""" Dataset to distilled models
    adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM)
 """
-from typing import List
-import math
-from itertools import chain
-from collections import Counter
-import numpy as np
 import torch
+from torch.utils.data import Dataset

+import numpy as np
 from utils import logger

-class Dataset:
+class LmSeqsDataset(Dataset):
+    """Custom Dataset wrapping language modeling sequences.
+
+    Each sample will be retrieved by indexing the list of token_ids and their corresponding lengths.
+
+    Input:
+    ------
+        params: `NameSpace` parameters
+        data: `List[np.array[int]]
+    """
+
    def __init__(self,
                 params,
                 data):
        self.params = params
-        self.tokens_per_batch = params.tokens_per_batch
-        self.batch_size = params.batch_size
-        self.shuffle = params.shuffle
-        self.group_by_size = params.group_by_size

        self.token_ids = np.array(data)
-        self.lengths = np.uint16([len(t) for t in data])
+        self.lengths = np.array([len(t) for t in data])

        self.check()
        self.remove_long_sequences()
@@ -43,6 +46,9 @@ class Dataset:
        self.check()
        self.print_statistics()

+    def __getitem__(self, index):
+        return (self.token_ids[index], self.lengths[index])
+
    def __len__(self):
        return len(self.lengths)

@@ -51,12 +57,14 @@ class Dataset:
        Some sanity checks
        """
        assert len(self.token_ids) == len(self.lengths)
+        assert all(self.lengths[i] == len(self.token_ids[i]) for i in range(len(self.lengths))) 

    def remove_long_sequences(self):
        """
-        Sequences that are too long are splitted by chunk of max_position_embeddings.
+        Sequences that are too long are splitted by chunk of max_model_input_size.
        """
-        indices = self.lengths >= self.params.max_position_embeddings
+        max_len = self.params.max_model_input_size
+        indices = self.lengths > max_len
        logger.info(f'Splitting {sum(indices)} too long sequences.')

        def divide_chunks(l, n):
@@ -64,10 +72,13 @@ class Dataset:

        new_tok_ids = []
        new_lengths = []
-        cls_id, sep_id = self.params.special_tok_ids['cls_token'], self.params.special_tok_ids['sep_token']
-        max_len = self.params.max_position_embeddings
+        if self.params.mlm:
+            cls_id, sep_id = self.params.special_tok_ids['cls_token'], self.params.special_tok_ids['sep_token']
+        else:
+            cls_id, sep_id = self.params.special_tok_ids['bos_token'], self.params.special_tok_ids['eos_token']

        for seq_, len_ in zip(self.token_ids, self.lengths):
+            assert (seq_[0] == cls_id) and (seq_[-1] == sep_id), seq_
            if len_ <= max_len:
                new_tok_ids.append(seq_)
                new_lengths.append(len_)
@@ -77,8 +88,9 @@ class Dataset:
                    if sub_s[0] != cls_id:
                        sub_s = np.insert(sub_s, 0, cls_id)
                    if sub_s[-1] != sep_id:
-                        sub_s = np.insert(sub_s, len(sub_s), cls_id)
+                        sub_s = np.insert(sub_s, len(sub_s), sep_id)
                    assert len(sub_s) <= max_len
+                    assert (sub_s[0] == cls_id) and (sub_s[-1] == sep_id), sub_s
                    sub_seqs.append(sub_s)

                new_tok_ids.extend(sub_seqs)
@@ -92,11 +104,11 @@ class Dataset:
        Too short sequences are simply removed. This could be tunedd.
        """
        init_size = len(self)
-        indices = self.lengths > 5
+        indices = self.lengths > 11
        self.token_ids = self.token_ids[indices]
        self.lengths = self.lengths[indices]
        new_size = len(self)
-        logger.info(f'Remove {init_size - new_size} too short (<=5 tokens) sequences.')
+        logger.info(f'Remove {init_size - new_size} too short (<=11 tokens) sequences.')

    def print_statistics(self):
        """
@@ -113,89 +125,27 @@ class Dataset:
        # nb_unkown = sum([(t==unk_idx).sum() for t in self.token_ids])
        # logger.info(f'{nb_unkown} unknown tokens (covering {100*nb_unkown/data_len:.2f}% of the data)')

-    def select_data(self, a: int, b: int):
-        """
-        Select a subportion of the data.
-        """
-        n_sequences = len(self)
-        assert 0 <= a < b <= n_sequences, ValueError(f'`0 <= a < b <= n_sequences` is not met with a={a} and b={b}')
-
-        logger.info(f'Selecting sequences from {a} to {b} (excluded).')
-        self.token_ids = self.token_ids[a:b]
-        self.lengths = self.lengths[a:b]
-
-        self.check()
-
-    def split(self):
-        """
-        Distributed training: split the data accross the processes.
-        """
-        assert self.params.n_gpu > 1
-        logger.info('Splitting the data accross the processuses.')
-        n_seq = len(self)
-        n_seq_per_procesus = n_seq // self.params.world_size
-        a = n_seq_per_procesus * self.params.global_rank
-        b = a + n_seq_per_procesus
-        self.select_data(a=a, b=b)
-
    def batch_sequences(self,
-                        token_ids: List[List[int]],
-                        lengths: List[int]):
+                        batch):
        """
        Do the padding and transform into torch.tensor.
        """
+        token_ids = [t[0] for t in batch]
+        lengths = [t[1] for t in batch]
        assert len(token_ids) == len(lengths)

        # Max for paddings
        max_seq_len_ = max(lengths)

        # Pad token ids
-        pad_idx = self.params.special_tok_ids['pad_token']
+        if self.params.mlm:
+            pad_idx = self.params.special_tok_ids['pad_token']
+        else:
+            pad_idx = self.params.special_tok_ids['unk_token']
        tk_ = [list(t.astype(int)) + [pad_idx]*(max_seq_len_-len(t)) for t in token_ids]
        assert len(tk_) == len(token_ids)
        assert all(len(t) == max_seq_len_ for t in tk_)

-        tk_t = torch.tensor(tk_)                  # (bs, max_seq_len_)
-        lg_t = torch.tensor(lengths.astype(int))  # (bs)
+        tk_t = torch.tensor(tk_)      # (bs, max_seq_len_)
+        lg_t = torch.tensor(lengths)  # (bs)
        return tk_t, lg_t
-
-    def get_batches_iterator(self,
-                             batches):
-        """
-        Return an iterator over batches.
-        """
-        for sequences_ids in batches:
-            token_ids, lengths = self.batch_sequences(self.token_ids[sequences_ids],
-                                                    self.lengths[sequences_ids])
-            yield (token_ids, lengths)
-
-    def get_iterator(self,
-                     seed: int = None):
-        """
-        Return a data iterator.
-        """
-        rng = np.random.RandomState(seed)
-
-        n_sequences = len(self)
-        indices = np.arange(n_sequences)
-
-        if self.group_by_size:
-            indices = indices[np.argsort(self.lengths[indices], kind='mergesort')]
-
-        if self.tokens_per_batch == -1:
-            batches = np.array_split(indices, math.ceil(len(indices) * 1. / self.batch_size))
-        else:
-            assert self.tokens_per_batch > 0
-            batch_ids = np.cumsum(self.lengths[indices]) // self.tokens_per_batch
-            _, bounds = np.unique(batch_ids, return_index=True)
-            batches = [indices[bounds[i]:bounds[i + 1]] for i in range(len(bounds) - 1)]
-            if bounds[-1] < len(indices):
-                batches.append(indices[bounds[-1]:])
-
-        if self.shuffle:
-            rng.shuffle(batches)
-
-        assert n_sequences == sum([len(x) for x in batches])
-        assert self.lengths[indices].sum() == sum([self.lengths[x].sum() for x in batches])
-
-        return self.get_batches_iterator(batches=batches)
--- a/examples/distillation/requirements.txt
+++ b/examples/distillation/requirements.txt
@@ -1 +1,6 @@
 gitpython==3.0.2
+tensorboard>=1.14.0
+tensorboardX==1.8
+psutil==5.6.3
+scipy==1.3.1
+transformers==2.0.0
--- a/examples/distillation/run_squad_w_distillation.py
+++ b/examples/distillation/run_squad_w_distillation.py
@@ -0,0 +1,585 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" This is the exact same script as `examples/run_squad.py` (as of 2019, October 4th) with an additional and optional step of distillation."""
+
+from __future__ import absolute_import, division, print_function
+
+import argparse
+import logging
+import os
+import random
+import glob
+
+import numpy as np
+import torch
+from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
+                              TensorDataset)
+from torch.utils.data.distributed import DistributedSampler
+import torch.nn.functional as F
+import torch.nn as nn
+from tqdm import tqdm, trange
+
+from tensorboardX import SummaryWriter
+
+from transformers import (WEIGHTS_NAME, BertConfig,
+                                  BertForQuestionAnswering, BertTokenizer,
+                                  XLMConfig, XLMForQuestionAnswering,
+                                  XLMTokenizer, XLNetConfig,
+                                  XLNetForQuestionAnswering,
+                                  XLNetTokenizer,
+                                  DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)
+
+from transformers import AdamW, WarmupLinearSchedule
+
+from ..utils_squad import (read_squad_examples, convert_examples_to_features,
+                         RawResult, write_predictions,
+                         RawResultExtended, write_predictions_extended)
+
+# The follwing import is the official SQuAD evaluation script (2.0).
+# You can remove it from the dependencies if you are using this script outside of the library
+# We've added it here for automated tests (see examples/test_examples.py file)
+from ..utils_squad_evaluate import EVAL_OPTS, main as evaluate_on_squad
+
+logger = logging.getLogger(__name__)
+
+ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) \
+                  for conf in (BertConfig, XLNetConfig, XLMConfig)), ())
+
+MODEL_CLASSES = {
+    'bert': (BertConfig, BertForQuestionAnswering, BertTokenizer),
+    'xlnet': (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer),
+    'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer),
+    'distilbert': (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)
+}
+
+def set_seed(args):
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+    if args.n_gpu > 0:
+        torch.cuda.manual_seed_all(args.seed)
+
+def to_list(tensor):
+    return tensor.detach().cpu().tolist()
+
+def train(args, train_dataset, model, tokenizer, teacher=None):
+    """ Train the model """
+    if args.local_rank in [-1, 0]:
+        tb_writer = SummaryWriter()
+
+    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
+    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
+    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
+
+    if args.max_steps > 0:
+        t_total = args.max_steps
+        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
+    else:
+        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
+
+    # Prepare optimizer and schedule (linear warmup and decay)
+    no_decay = ['bias', 'LayerNorm.weight']
+    optimizer_grouped_parameters = [
+        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
+        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+        ]
+    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
+    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
+    if args.fp16:
+        try:
+            from apex import amp
+        except ImportError:
+            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
+        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
+
+    # multi-gpu training (should be after apex fp16 initialization)
+    if args.n_gpu > 1:
+        model = torch.nn.DataParallel(model)
+
+    # Distributed training (should be after apex fp16 initialization)
+    if args.local_rank != -1:
+        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
+                                                          output_device=args.local_rank,
+                                                          find_unused_parameters=True)
+
+    # Train!
+    logger.info("***** Running training *****")
+    logger.info("  Num examples = %d", len(train_dataset))
+    logger.info("  Num Epochs = %d", args.num_train_epochs)
+    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
+    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
+                   args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
+    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
+    logger.info("  Total optimization steps = %d", t_total)
+
+    global_step = 0
+    tr_loss, logging_loss = 0.0, 0.0
+    model.zero_grad()
+    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
+    set_seed(args)  # Added here for reproductibility (even between python 2 and 3)
+    for _ in train_iterator:
+        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
+        for step, batch in enumerate(epoch_iterator):
+            model.train()
+            if teacher is not None:
+                teacher.eval()
+            batch = tuple(t.to(args.device) for t in batch)
+            inputs = {'input_ids':       batch[0],
+                      'attention_mask':  batch[1], 
+                      'start_positions': batch[3], 
+                      'end_positions':   batch[4]}
+            if args.model_type != 'distilbert':
+                inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2]
+            if args.model_type in ['xlnet', 'xlm']:
+                inputs.update({'cls_index': batch[5],
+                               'p_mask':       batch[6]})
+            outputs = model(**inputs)
+            loss, start_logits_stu, end_logits_stu = outputs
+
+            # Distillation loss
+            if teacher is not None:
+                if 'token_type_ids' not in inputs:
+                    inputs['token_type_ids'] = None if args.teacher_type == 'xlm' else batch[2]
+                with torch.no_grad():
+                    start_logits_tea, end_logits_tea = teacher(input_ids=inputs['input_ids'],
+                                                               token_type_ids=inputs['token_type_ids'],
+                                                               attention_mask=inputs['attention_mask'])
+                assert start_logits_tea.size() == start_logits_stu.size()
+                assert end_logits_tea.size() == end_logits_stu.size()
+                
+                loss_fct = nn.KLDivLoss(reduction='batchmean')
+                loss_start = loss_fct(F.log_softmax(start_logits_stu/args.temperature, dim=-1),
+                                      F.softmax(start_logits_tea/args.temperature, dim=-1)) * (args.temperature**2)
+                loss_end = loss_fct(F.log_softmax(end_logits_stu/args.temperature, dim=-1),
+                                    F.softmax(end_logits_tea/args.temperature, dim=-1)) * (args.temperature**2)
+                loss_ce = (loss_start + loss_end)/2.
+
+                loss = args.alpha_ce*loss_ce + args.alpha_squad*loss
+
+            if args.n_gpu > 1:
+                loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training
+            if args.gradient_accumulation_steps > 1:
+                loss = loss / args.gradient_accumulation_steps
+
+            if args.fp16:
+                with amp.scale_loss(loss, optimizer) as scaled_loss:
+                    scaled_loss.backward()
+                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
+            else:
+                loss.backward()
+                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+
+            tr_loss += loss.item()
+            if (step + 1) % args.gradient_accumulation_steps == 0:
+                optimizer.step()
+                scheduler.step()  # Update learning rate schedule
+                model.zero_grad()
+                global_step += 1
+
+                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
+                    # Log metrics
+                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
+                        results = evaluate(args, model, tokenizer)
+                        for key, value in results.items():
+                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
+                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
+                    tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
+                    logging_loss = tr_loss
+
+                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
+                    # Save model checkpoint
+                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
+                    if not os.path.exists(output_dir):
+                        os.makedirs(output_dir)
+                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+                    model_to_save.save_pretrained(output_dir)
+                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
+                    logger.info("Saving model checkpoint to %s", output_dir)
+
+            if args.max_steps > 0 and global_step > args.max_steps:
+                epoch_iterator.close()
+                break
+        if args.max_steps > 0 and global_step > args.max_steps:
+            train_iterator.close()
+            break
+
+    if args.local_rank in [-1, 0]:
+        tb_writer.close()
+
+    return global_step, tr_loss / global_step
+
+
+def evaluate(args, model, tokenizer, prefix=""):
+    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)
+
+    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
+        os.makedirs(args.output_dir)
+
+    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
+    # Note that DistributedSampler samples randomly
+    eval_sampler = SequentialSampler(dataset) if args.local_rank == -1 else DistributedSampler(dataset)
+    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
+
+    # Eval!
+    logger.info("***** Running evaluation {} *****".format(prefix))
+    logger.info("  Num examples = %d", len(dataset))
+    logger.info("  Batch size = %d", args.eval_batch_size)
+    all_results = []
+    for batch in tqdm(eval_dataloader, desc="Evaluating"):
+        model.eval()
+        batch = tuple(t.to(args.device) for t in batch)
+        with torch.no_grad():
+            inputs = {'input_ids':      batch[0],
+                      'attention_mask': batch[1]
+                      }
+            if args.model_type != 'distilbert':
+                inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2]  # XLM don't use segment_ids
+            example_indices = batch[3]
+            if args.model_type in ['xlnet', 'xlm']:
+                inputs.update({'cls_index': batch[4],
+                               'p_mask':    batch[5]})
+            outputs = model(**inputs)
+
+        for i, example_index in enumerate(example_indices):
+            eval_feature = features[example_index.item()]
+            unique_id = int(eval_feature.unique_id)
+            if args.model_type in ['xlnet', 'xlm']:
+                # XLNet uses a more complex post-processing procedure
+                result = RawResultExtended(unique_id            = unique_id,
+                                           start_top_log_probs  = to_list(outputs[0][i]),
+                                           start_top_index      = to_list(outputs[1][i]),
+                                           end_top_log_probs    = to_list(outputs[2][i]),
+                                           end_top_index        = to_list(outputs[3][i]),
+                                           cls_logits           = to_list(outputs[4][i]))
+            else:
+                result = RawResult(unique_id    = unique_id,
+                                   start_logits = to_list(outputs[0][i]),
+                                   end_logits   = to_list(outputs[1][i]))
+            all_results.append(result)
+
+    # Compute predictions
+    output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
+    output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))
+    if args.version_2_with_negative:
+        output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
+    else:
+        output_null_log_odds_file = None
+
+    if args.model_type in ['xlnet', 'xlm']:
+        # XLNet uses a more complex post-processing procedure
+        write_predictions_extended(examples, features, all_results, args.n_best_size,
+                        args.max_answer_length, output_prediction_file,
+                        output_nbest_file, output_null_log_odds_file, args.predict_file,
+                        model.config.start_n_top, model.config.end_n_top,
+                        args.version_2_with_negative, tokenizer, args.verbose_logging)
+    else:
+        write_predictions(examples, features, all_results, args.n_best_size,
+                        args.max_answer_length, args.do_lower_case, output_prediction_file,
+                        output_nbest_file, output_null_log_odds_file, args.verbose_logging,
+                        args.version_2_with_negative, args.null_score_diff_threshold)
+
+    # Evaluate with the official SQuAD script
+    evaluate_options = EVAL_OPTS(data_file=args.predict_file,
+                                 pred_file=output_prediction_file,
+                                 na_prob_file=output_null_log_odds_file)
+    results = evaluate_on_squad(evaluate_options)
+    return results
+
+
+def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
+    if args.local_rank not in [-1, 0] and not evaluate:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
+
+    # Load data features from cache or dataset file
+    input_file = args.predict_file if evaluate else args.train_file
+    cached_features_file = os.path.join(os.path.dirname(input_file), 'cached_{}_{}_{}'.format(
+        'dev' if evaluate else 'train',
+        list(filter(None, args.model_name_or_path.split('/'))).pop(),
+        str(args.max_seq_length)))
+    if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples:
+        logger.info("Loading features from cached file %s", cached_features_file)
+        features = torch.load(cached_features_file)
+    else:
+        logger.info("Creating features from dataset file at %s", input_file)
+        examples = read_squad_examples(input_file=input_file,
+                                                is_training=not evaluate,
+                                                version_2_with_negative=args.version_2_with_negative)
+        features = convert_examples_to_features(examples=examples,
+                                                tokenizer=tokenizer,
+                                                max_seq_length=args.max_seq_length,
+                                                doc_stride=args.doc_stride,
+                                                max_query_length=args.max_query_length,
+                                                is_training=not evaluate)
+        if args.local_rank in [-1, 0]:
+            logger.info("Saving features into cached file %s", cached_features_file)
+            torch.save(features, cached_features_file)
+
+    if args.local_rank == 0 and not evaluate:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
+
+    # Convert to Tensors and build dataset
+    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
+    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
+    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
+    all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
+    all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
+    if evaluate:
+        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
+        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
+                                all_example_index, all_cls_index, all_p_mask)
+    else:
+        all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
+        all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
+        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
+                                all_start_positions, all_end_positions,
+                                all_cls_index, all_p_mask)
+
+    if output_examples:
+        return dataset, examples, features
+    return dataset
+
+
+def main():
+    parser = argparse.ArgumentParser()
+
+    ## Required parameters
+    parser.add_argument("--train_file", default=None, type=str, required=True,
+                        help="SQuAD json for training. E.g., train-v1.1.json")
+    parser.add_argument("--predict_file", default=None, type=str, required=True,
+                        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
+    parser.add_argument("--model_type", default=None, type=str, required=True,
+                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
+    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
+                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
+    parser.add_argument("--output_dir", default=None, type=str, required=True,
+                        help="The output directory where the model checkpoints and predictions will be written.")
+
+    # Distillation parameters (optional)
+    parser.add_argument('--teacher_type', default=None, type=str,
+                        help="Teacher type. Teacher tokenizer and student (model) tokenizer must output the same tokenization. Only for distillation.")
+    parser.add_argument('--teacher_name_or_path', default=None, type=str,
+                        help="Path to the already SQuAD fine-tuned teacher model. Only for distillation.")
+    parser.add_argument('--alpha_ce', default=0.5, type=float,
+                        help="Distillation loss linear weight. Only for distillation.")
+    parser.add_argument('--alpha_squad', default=0.5, type=float,
+                        help="True SQuAD loss linear weight. Only for distillation.")
+    parser.add_argument('--temperature', default=2.0, type=float,
+                        help="Distillation temperature. Only for distillation.")
+
+    ## Other parameters
+    parser.add_argument("--config_name", default="", type=str,
+                        help="Pretrained config name or path if not the same as model_name")
+    parser.add_argument("--tokenizer_name", default="", type=str,
+                        help="Pretrained tokenizer name or path if not the same as model_name")
+    parser.add_argument("--cache_dir", default="", type=str,
+                        help="Where do you want to store the pre-trained models downloaded from s3")
+
+    parser.add_argument('--version_2_with_negative', action='store_true',
+                        help='If true, the SQuAD examples contain some that do not have an answer.')
+    parser.add_argument('--null_score_diff_threshold', type=float, default=0.0,
+                        help="If null_score - best_non_null is greater than the threshold predict null.")
+
+    parser.add_argument("--max_seq_length", default=384, type=int,
+                        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
+                             "longer than this will be truncated, and sequences shorter than this will be padded.")
+    parser.add_argument("--doc_stride", default=128, type=int,
+                        help="When splitting up a long document into chunks, how much stride to take between chunks.")
+    parser.add_argument("--max_query_length", default=64, type=int,
+                        help="The maximum number of tokens for the question. Questions longer than this will "
+                             "be truncated to this length.")
+    parser.add_argument("--do_train", action='store_true',
+                        help="Whether to run training.")
+    parser.add_argument("--do_eval", action='store_true',
+                        help="Whether to run eval on the dev set.")
+    parser.add_argument("--evaluate_during_training", action='store_true',
+                        help="Rul evaluation during training at each logging step.")
+    parser.add_argument("--do_lower_case", action='store_true',
+                        help="Set this flag if you are using an uncased model.")
+
+    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
+                        help="Batch size per GPU/CPU for training.")
+    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
+                        help="Batch size per GPU/CPU for evaluation.")
+    parser.add_argument("--learning_rate", default=5e-5, type=float,
+                        help="The initial learning rate for Adam.")
+    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
+                        help="Number of updates steps to accumulate before performing a backward/update pass.")
+    parser.add_argument("--weight_decay", default=0.0, type=float,
+                        help="Weight deay if we apply some.")
+    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
+                        help="Epsilon for Adam optimizer.")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float,
+                        help="Max gradient norm.")
+    parser.add_argument("--num_train_epochs", default=3.0, type=float,
+                        help="Total number of training epochs to perform.")
+    parser.add_argument("--max_steps", default=-1, type=int,
+                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
+    parser.add_argument("--warmup_steps", default=0, type=int,
+                        help="Linear warmup over warmup_steps.")
+    parser.add_argument("--n_best_size", default=20, type=int,
+                        help="The total number of n-best predictions to generate in the nbest_predictions.json output file.")
+    parser.add_argument("--max_answer_length", default=30, type=int,
+                        help="The maximum length of an answer that can be generated. This is needed because the start "
+                             "and end predictions are not conditioned on one another.")
+    parser.add_argument("--verbose_logging", action='store_true',
+                        help="If true, all of the warnings related to data processing will be printed. "
+                             "A number of warnings are expected for a normal SQuAD evaluation.")
+
+    parser.add_argument('--logging_steps', type=int, default=50,
+                        help="Log every X updates steps.")
+    parser.add_argument('--save_steps', type=int, default=50,
+                        help="Save checkpoint every X updates steps.")
+    parser.add_argument("--eval_all_checkpoints", action='store_true',
+                        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
+    parser.add_argument("--no_cuda", action='store_true',
+                        help="Whether not to use CUDA when available")
+    parser.add_argument('--overwrite_output_dir', action='store_true',
+                        help="Overwrite the content of the output directory")
+    parser.add_argument('--overwrite_cache', action='store_true',
+                        help="Overwrite the cached training and evaluation sets")
+    parser.add_argument('--seed', type=int, default=42,
+                        help="random seed for initialization")
+
+    parser.add_argument("--local_rank", type=int, default=-1,
+                        help="local_rank for distributed training on gpus")
+    parser.add_argument('--fp16', action='store_true',
+                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
+    parser.add_argument('--fp16_opt_level', type=str, default='O1',
+                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
+                             "See details at https://nvidia.github.io/apex/amp.html")
+    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
+    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
+    args = parser.parse_args()
+
+    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
+        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
+
+    # Setup distant debugging if needed
+    if args.server_ip and args.server_port:
+        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
+        import ptvsd
+        print("Waiting for debugger attach")
+        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
+        ptvsd.wait_for_attach()
+
+    # Setup CUDA, GPU & distributed training
+    if args.local_rank == -1 or args.no_cuda:
+        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+        args.n_gpu = torch.cuda.device_count()
+    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
+        torch.cuda.set_device(args.local_rank)
+        device = torch.device("cuda", args.local_rank)
+        torch.distributed.init_process_group(backend='nccl')
+        args.n_gpu = 1
+    args.device = device
+
+    # Setup logging
+    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
+                        datefmt = '%m/%d/%Y %H:%M:%S',
+                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
+    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+                    args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
+
+    # Set seed
+    set_seed(args)
+
+    # Load pretrained model and tokenizer
+    if args.local_rank not in [-1, 0]:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
+
+    args.model_type = args.model_type.lower()
+    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
+    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
+    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case)
+    model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config)
+
+    if args.teacher_type is not None:
+        assert args.teacher_name_or_path is not None
+        assert args.alpha_ce > 0.
+        assert args.alpha_ce + args.alpha_squad > 0.
+        assert args.teacher_type != 'distilbert', "We constraint teachers not to be of type DistilBERT."
+        teacher_config_class, teacher_model_class, _ = MODEL_CLASSES[args.teacher_type]
+        teacher_config = teacher_config_class.from_pretrained(args.teacher_name_or_path)
+        teacher = teacher_model_class.from_pretrained(args.teacher_name_or_path, config=teacher_config)
+        teacher.to(args.device)
+    else:
+        teacher = None
+
+    if args.local_rank == 0:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
+
+    model.to(args.device)
+
+    logger.info("Training/evaluation parameters %s", args)
+
+    # Training
+    if args.do_train:
+        train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False)
+        global_step, tr_loss = train(args, train_dataset, model, tokenizer, teacher=teacher)
+        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
+
+
+    # Save the trained model and the tokenizer
+    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
+        # Create output directory if needed
+        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
+            os.makedirs(args.output_dir)
+
+        logger.info("Saving model checkpoint to %s", args.output_dir)
+        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
+        # They can then be reloaded using `from_pretrained()`
+        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+        model_to_save.save_pretrained(args.output_dir)
+        tokenizer.save_pretrained(args.output_dir)
+
+        # Good practice: save your training arguments together with the trained model
+        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
+
+        # Load a trained model and vocabulary that you have fine-tuned
+        model = model_class.from_pretrained(args.output_dir)
+        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
+        model.to(args.device)
+
+
+    # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
+    results = {}
+    if args.do_eval and args.local_rank in [-1, 0]:
+        checkpoints = [args.output_dir]
+        if args.eval_all_checkpoints:
+            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
+            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce model loading logs
+
+        logger.info("Evaluate the following checkpoints: %s", checkpoints)
+
+        for checkpoint in checkpoints:
+            # Reload the model
+            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
+            model = model_class.from_pretrained(checkpoint)
+            model.to(args.device)
+
+            # Evaluate
+            result = evaluate(args, model, tokenizer, prefix=global_step)
+
+            result = dict((k + ('_{}'.format(global_step) if global_step else ''), v) for k, v in result.items())
+            results.update(result)
+
+    logger.info("Results: {}".format(results))
+
+    return results
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/distillation/scripts/binarized_data.py
+++ b/examples/distillation/scripts/binarized_data.py
@@ -13,31 +13,46 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-Preprocessing script before training DistilBERT.
+Preprocessing script before distillation.
 """
 import argparse
 import pickle
 import random
 import time
 import numpy as np
-from pytorch_transformers import BertTokenizer
+from transformers import BertTokenizer, RobertaTokenizer, GPT2Tokenizer
+import logging

-from examples.distillation.utils import logger
+logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
+                    datefmt = '%m/%d/%Y %H:%M:%S',
+                    level = logging.INFO)
+logger = logging.getLogger(__name__)

 def main():
    parser = argparse.ArgumentParser(description="Preprocess the data to avoid re-doing it several times by (tokenization + token_to_ids).")
    parser.add_argument('--file_path', type=str, default='data/dump.txt',
                        help='The path to the data.')
-    parser.add_argument('--bert_tokenizer', type=str, default='bert-base-uncased',
+    parser.add_argument('--tokenizer_type', type=str, default='bert', choices=['bert', 'roberta', 'gpt2'])
+    parser.add_argument('--tokenizer_name', type=str, default='bert-base-uncased',
                        help="The tokenizer to use.")
    parser.add_argument('--dump_file', type=str, default='data/dump',
                        help='The dump file prefix.')
    args = parser.parse_args()


-    logger.info(f'Loading Tokenizer ({args.bert_tokenizer})')
-    bert_tokenizer = BertTokenizer.from_pretrained(args.bert_tokenizer)
-
+    logger.info(f'Loading Tokenizer ({args.tokenizer_name})')
+    if args.tokenizer_type == 'bert':
+        tokenizer = BertTokenizer.from_pretrained(args.tokenizer_name)
+        bos = tokenizer.special_tokens_map['cls_token'] # `[CLS]`
+        sep = tokenizer.special_tokens_map['sep_token'] # `[SEP]`
+    elif args.tokenizer_type == 'roberta':
+        tokenizer = RobertaTokenizer.from_pretrained(args.tokenizer_name)
+        bos = tokenizer.special_tokens_map['cls_token'] # `<s>`
+        sep = tokenizer.special_tokens_map['sep_token'] # `</s>`
+    elif args.tokenizer_type == 'gpt2':
+        tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer_name)
+        bos = tokenizer.special_tokens_map['bos_token'] # `<|endoftext|>`
+        sep = tokenizer.special_tokens_map['eos_token'] # `<|endoftext|>`    

    logger.info(f'Loading text from {args.file_path}')
    with open(args.file_path, 'r', encoding='utf8') as fp:
@@ -52,8 +67,8 @@ def main():
    interval = 10000
    start = time.time()
    for text in data:
-        text = f'[CLS] {text.strip()} [SEP]'
-        token_ids = bert_tokenizer.encode(text)
+        text = f'{bos} {text.strip()} {sep}'
+        token_ids = tokenizer.encode(text)
        rslt.append(token_ids)

        iter += 1
@@ -65,7 +80,7 @@ def main():
    logger.info(f'{len(data)} examples processed.')


-    dp_file = f'{args.dump_file}.{args.bert_tokenizer}.pickle'
+    dp_file = f'{args.dump_file}.{args.tokenizer_name}.pickle'
    rslt_ = [np.uint16(d) for d in rslt]
    random.shuffle(rslt_)
    logger.info(f'Dump to {dp_file}')
@@ -74,4 +89,4 @@ def main():


 if __name__ == "__main__":
-    main()
+    main()
--- a/examples/distillation/scripts/extract.py
+++ b/examples/distillation/scripts/extract.py
@@ -0,0 +1,89 @@
+# coding=utf-8
+# Copyright 2019-present, the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Preprocessing script before training the distilled model.
+Specific to RoBERTa -> DistilRoBERTa and GPT2 -> DistilGPT2.
+"""
+from transformers import BertForMaskedLM, RobertaForMaskedLM, GPT2LMHeadModel
+import torch
+import argparse
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description="Extraction some layers of the full RobertaForMaskedLM or GPT2LMHeadModel for Transfer Learned Distillation")
+    parser.add_argument("--model_type", default="roberta", choices=["roberta", "gpt2"])
+    parser.add_argument("--model_name", default='roberta-large', type=str)
+    parser.add_argument("--dump_checkpoint", default='serialization_dir/tf_roberta_048131723.pth', type=str)
+    parser.add_argument("--vocab_transform", action='store_true')
+    args = parser.parse_args()
+
+
+    if args.model_type == 'roberta':
+        model = RobertaForMaskedLM.from_pretrained(args.model_name)
+        prefix = 'roberta'
+    elif args.model_type == 'gpt2':
+        model = GPT2LMHeadModel.from_pretrained(args.model_name)
+        prefix = 'transformer'
+
+    state_dict = model.state_dict()
+    compressed_sd = {}
+
+    ### Embeddings ###
+    if args.model_type == 'gpt2':
+        for param_name in ['wte.weight', 'wpe.weight']:
+            compressed_sd[f'{prefix}.{param_name}'] = state_dict[f'{prefix}.{param_name}']
+    else:
+        for w in ['word_embeddings', 'position_embeddings', 'token_type_embeddings']:
+            param_name = f'{prefix}.embeddings.{w}.weight'
+            compressed_sd[param_name] = state_dict[param_name]
+        for w in ['weight', 'bias']:
+            param_name = f'{prefix}.embeddings.LayerNorm.{w}'
+            compressed_sd[param_name] = state_dict[param_name]
+
+    ### Transformer Blocks ###
+    std_idx = 0
+    for teacher_idx in [0, 2, 4, 7, 9, 11]:
+        if args.model_type == 'gpt2':
+            for layer in ['ln_1', 'attn.c_attn', 'attn.c_proj', 'ln_2', 'mlp.c_fc', 'mlp.c_proj']:
+                for w in ['weight', 'bias']:
+                    compressed_sd[f'{prefix}.h.{std_idx}.{layer}.{w}'] = \
+                        state_dict[f'{prefix}.h.{teacher_idx}.{layer}.{w}']
+            compressed_sd[f'{prefix}.h.{std_idx}.attn.bias'] = state_dict[f'{prefix}.h.{teacher_idx}.attn.bias']
+        else:
+            for layer in ['attention.self.query', 'attention.self.key', 'attention.self.value',
+                        'attention.output.dense', 'attention.output.LayerNorm',
+                        'intermediate.dense', 'output.dense', 'output.LayerNorm']:
+                for w in ['weight', 'bias']:
+                    compressed_sd[f'{prefix}.encoder.layer.{std_idx}.{layer}.{w}'] = \
+                        state_dict[f'{prefix}.encoder.layer.{teacher_idx}.{layer}.{w}']
+        std_idx += 1
+
+    ### Language Modeling Head ###s
+    if args.model_type == 'roberta':
+        for layer in ['lm_head.decoder.weight', 'lm_head.bias']:
+            compressed_sd[f'{layer}'] = state_dict[f'{layer}']
+        if args.vocab_transform:
+            for w in ['weight', 'bias']:
+                compressed_sd[f'lm_head.dense.{w}'] = state_dict[f'lm_head.dense.{w}']
+                compressed_sd[f'lm_head.layer_norm.{w}'] = state_dict[f'lm_head.layer_norm.{w}']
+    elif args.model_type == 'gpt2':
+        for w in ['weight', 'bias']:
+            compressed_sd[f'{prefix}.ln_f.{w}'] = state_dict[f'{prefix}.ln_f.{w}']
+        compressed_sd[f'lm_head.weight'] = state_dict[f'lm_head.weight']
+
+    print(f'N layers selected for distillation: {std_idx}')
+    print(f'Number of params transfered for distillation: {len(compressed_sd.keys())}')
+
+    print(f'Save transfered checkpoint to {args.dump_checkpoint}.')
+    torch.save(compressed_sd, args.dump_checkpoint)
--- a/examples/distillation/scripts/extract_distilbert.py
+++ b/examples/distillation/scripts/extract_distilbert.py
@@ -14,52 +14,58 @@
 # limitations under the License.
 """
 Preprocessing script before training DistilBERT.
+Specific to BERT -> DistilBERT.
 """
-from pytorch_transformers import BertForPreTraining
+from transformers import BertForMaskedLM, RobertaForMaskedLM
 import torch
 import argparse

 if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description="Extraction some layers of the full BertForPreTraining for Transfer Learned Distillation")
-    parser.add_argument("--bert_model", default='bert-base-uncased', type=str)
-    parser.add_argument("--dump_checkpoint", default='serialization_dir/transfer_learning_checkpoint_0247911.pth', type=str)
+    parser = argparse.ArgumentParser(description="Extraction some layers of the full BertForMaskedLM or RObertaForMaskedLM for Transfer Learned Distillation")
+    parser.add_argument("--model_type", default="bert", choices=["bert"])
+    parser.add_argument("--model_name", default='bert-base-uncased', type=str)
+    parser.add_argument("--dump_checkpoint", default='serialization_dir/tf_bert-base-uncased_0247911.pth', type=str)
    parser.add_argument("--vocab_transform", action='store_true')
    args = parser.parse_args()


-    model = BertForPreTraining.from_pretrained(args.bert_model)
+    if args.model_type == 'bert':
+        model = BertForMaskedLM.from_pretrained(args.model_name)
+        prefix = 'bert'
+    else:
+        raise ValueError(f'args.model_type should be "bert".')

    state_dict = model.state_dict()
    compressed_sd = {}

    for w in ['word_embeddings', 'position_embeddings']:
        compressed_sd[f'distilbert.embeddings.{w}.weight'] = \
-            state_dict[f'bert.embeddings.{w}.weight']
+            state_dict[f'{prefix}.embeddings.{w}.weight']
    for w in ['weight', 'bias']:
        compressed_sd[f'distilbert.embeddings.LayerNorm.{w}'] = \
-            state_dict[f'bert.embeddings.LayerNorm.{w}']
+            state_dict[f'{prefix}.embeddings.LayerNorm.{w}']

    std_idx = 0
    for teacher_idx in [0, 2, 4, 7, 9, 11]:
        for w in ['weight', 'bias']:
            compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.q_lin.{w}'] = \
-                state_dict[f'bert.encoder.layer.{teacher_idx}.attention.self.query.{w}']
+                state_dict[f'{prefix}.encoder.layer.{teacher_idx}.attention.self.query.{w}']
            compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.k_lin.{w}'] = \
-                state_dict[f'bert.encoder.layer.{teacher_idx}.attention.self.key.{w}']
+                state_dict[f'{prefix}.encoder.layer.{teacher_idx}.attention.self.key.{w}']
            compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.v_lin.{w}'] = \
-                state_dict[f'bert.encoder.layer.{teacher_idx}.attention.self.value.{w}']
+                state_dict[f'{prefix}.encoder.layer.{teacher_idx}.attention.self.value.{w}']

            compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.out_lin.{w}'] = \
-                state_dict[f'bert.encoder.layer.{teacher_idx}.attention.output.dense.{w}']
+                state_dict[f'{prefix}.encoder.layer.{teacher_idx}.attention.output.dense.{w}']
            compressed_sd[f'distilbert.transformer.layer.{std_idx}.sa_layer_norm.{w}'] = \
-                state_dict[f'bert.encoder.layer.{teacher_idx}.attention.output.LayerNorm.{w}']
+                state_dict[f'{prefix}.encoder.layer.{teacher_idx}.attention.output.LayerNorm.{w}']

            compressed_sd[f'distilbert.transformer.layer.{std_idx}.ffn.lin1.{w}'] = \
-                state_dict[f'bert.encoder.layer.{teacher_idx}.intermediate.dense.{w}']
+                state_dict[f'{prefix}.encoder.layer.{teacher_idx}.intermediate.dense.{w}']
            compressed_sd[f'distilbert.transformer.layer.{std_idx}.ffn.lin2.{w}'] = \
-                state_dict[f'bert.encoder.layer.{teacher_idx}.output.dense.{w}']
+                state_dict[f'{prefix}.encoder.layer.{teacher_idx}.output.dense.{w}']
            compressed_sd[f'distilbert.transformer.layer.{std_idx}.output_layer_norm.{w}'] = \
-                state_dict[f'bert.encoder.layer.{teacher_idx}.output.LayerNorm.{w}']
+                state_dict[f'{prefix}.encoder.layer.{teacher_idx}.output.LayerNorm.{w}']
        std_idx += 1

    compressed_sd[f'vocab_projector.weight'] = state_dict[f'cls.predictions.decoder.weight']
--- a/examples/distillation/scripts/token_counts.py
+++ b/examples/distillation/scripts/token_counts.py
@@ -13,13 +13,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-Preprocessing script before training DistilBERT.
+Preprocessing script before training the distilled model.
 """
 from collections import Counter
 import argparse
 import pickle
+import logging

-from examples.distillation.utils import logger
+logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
+                    datefmt = '%m/%d/%Y %H:%M:%S',
+                    level = logging.INFO)
+logger = logging.getLogger(__name__)

 if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Token Counts for smoothing the masking probabilities in MLM (cf XLM/word2vec)")
--- a/examples/distillation/train.py
+++ b/examples/distillation/train.py
@@ -13,7 +13,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-Training DistilBERT.
+Training the distilled model.
+Supported architectures include: BERT -> DistilBERT, RoBERTa -> DistilRoBERTa, GPT2 -> DistilGPT2.
 """
 import os
 import argparse
@@ -23,64 +24,96 @@ import shutil
 import numpy as np
 import torch

-from pytorch_transformers import BertTokenizer, BertForMaskedLM
-from pytorch_transformers import DistilBertForMaskedLM, DistilBertConfig
+from transformers import BertConfig, BertForMaskedLM, BertTokenizer
+from transformers import RobertaConfig, RobertaForMaskedLM, RobertaTokenizer
+from transformers import DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer
+from transformers import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer

 from distiller import Distiller
 from utils import git_log, logger, init_gpu_params, set_seed
-from dataset import Dataset
+from lm_seqs_dataset import LmSeqsDataset


+MODEL_CLASSES = {
+    'distilbert': (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer),
+    'roberta': (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer),
+    'bert': (BertConfig, BertForMaskedLM, BertTokenizer),
+    'gpt2': (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer)
+}
+
+def sanity_checks(args):
+    """
+    A bunch of args sanity checks to perform even starting...
+    """
+    assert (args.mlm and args.alpha_mlm > 0.) or (not args.mlm and args.alpha_mlm == 0.)
+    assert (args.alpha_mlm > 0. and args.alpha_clm == 0.) or (args.alpha_mlm == 0. and args.alpha_clm > 0.)
+    if args.mlm:
+        assert os.path.isfile(args.token_counts)
+        assert (args.student_type in ['roberta', 'distilbert']) and (args.teacher_type in ['roberta', 'bert'])
+    else:
+        assert (args.student_type in ['gpt2']) and (args.teacher_type in ['gpt2'])
+
+    assert args.teacher_type == args.student_type or (args.student_type=='distilbert' and args.teacher_type=='bert')
+    assert os.path.isfile(args.student_config)
+    if args.student_pretrained_weights is not None:
+        assert os.path.isfile(args.student_pretrained_weights)
+
+    if args.freeze_token_type_embds: assert args.student_type in ['roberta']
+
+    assert args.alpha_ce >= 0.
+    assert args.alpha_mlm >= 0.
+    assert args.alpha_clm >= 0.
+    assert args.alpha_mse >= 0.
+    assert args.alpha_cos >= 0.
+    assert args.alpha_ce + args.alpha_mlm + args.alpha_clm + args.alpha_mse + args.alpha_cos > 0.
+
+def freeze_pos_embeddings(student, args):
+    if args.student_type == 'roberta':
+        student.roberta.embeddings.position_embeddings.weight.requires_grad = False
+    elif args.student_type == 'gpt2':
+        student.transformer.wpe.weight.requires_grad = False
+
+def freeze_token_type_embeddings(student, args):
+    if args.student_type == 'roberta':
+        student.roberta.embeddings.token_type_embeddings.weight.requires_grad = False
+
 def main():
    parser = argparse.ArgumentParser(description="Training")
+    parser.add_argument("--force", action='store_true',
+                        help="Overwrite dump_path if it already exists.")

    parser.add_argument("--dump_path", type=str, required=True,
                        help="The output directory (log, checkpoints, parameters, etc.)")
    parser.add_argument("--data_file", type=str, required=True,
                        help="The binarized file (tokenized + tokens_to_ids) and grouped by sequence.")
-    parser.add_argument("--token_counts", type=str, required=True,
-                        help="The token counts in the data_file for MLM.")
-    parser.add_argument("--force", action='store_true',
-                        help="Overwrite dump_path if it already exists.")

-    parser.add_argument("--vocab_size", default=30522, type=int,
-                        help="The vocabulary size.")
-    parser.add_argument("--max_position_embeddings", default=512, type=int,
-                        help="Maximum sequence length we can model (including [CLS] and [SEP]).")
-    parser.add_argument("--sinusoidal_pos_embds", action='store_false',
-                        help="If true, the position embeddings are simply fixed with sinusoidal embeddings.")
-    parser.add_argument("--n_layers", default=6, type=int,
-                        help="Number of Transformer blocks.")
-    parser.add_argument("--n_heads", default=12, type=int,
-                        help="Number of heads in the self-attention module.")
-    parser.add_argument("--dim", default=768, type=int,
-                        help="Dimension through the network. Must be divisible by n_heads")
-    parser.add_argument("--hidden_dim", default=3072, type=int,
-                        help="Intermediate dimension in the FFN.")
-    parser.add_argument("--dropout", default=0.1, type=float,
-                        help="Dropout.")
-    parser.add_argument("--attention_dropout", default=0.1, type=float,
-                        help="Dropout in self-attention.")
-    parser.add_argument("--activation", default='gelu', type=str,
-                        help="Activation to use in self-attention")
-    parser.add_argument("--tie_weights_", action='store_false',
-                        help="If true, we tie the embeddings matrix with the projection over the vocabulary matrix. Default is true.")
-
-    parser.add_argument("--from_pretrained_weights", default=None, type=str,
+    parser.add_argument("--student_type", type=str, choices=["distilbert", "roberta", "gpt2"], required=True,
+                        help="The student type (DistilBERT, RoBERTa).")
+    parser.add_argument("--student_config", type=str, required=True,
+                        help="Path to the student configuration.")
+    parser.add_argument("--student_pretrained_weights", default=None, type=str,
                        help="Load student initialization checkpoint.")
-    parser.add_argument("--from_pretrained_config", default=None, type=str,
-                        help="Load student initialization architecture config.")
-    parser.add_argument("--bert_model", default='bert-base-uncased', type=str,
-                        help="The teacher BERT model.")
+
+    parser.add_argument("--teacher_type", choices=["bert", "roberta", "gpt2"], required=True,
+                        help="Teacher type (BERT, RoBERTa).")
+    parser.add_argument("--teacher_name", type=str, required=True,
+                        help="The teacher model.")

    parser.add_argument("--temperature", default=2., type=float,
                        help="Temperature for the softmax temperature.")
    parser.add_argument("--alpha_ce", default=0.5, type=float,
                        help="Linear weight for the distillation loss. Must be >=0.")
-    parser.add_argument("--alpha_mlm", default=0.5, type=float,
-                        help="Linear weight for the MLM loss. Must be >=0.")
+    parser.add_argument("--alpha_mlm", default=0.0, type=float,
+                        help="Linear weight for the MLM loss. Must be >=0. Should be used in coonjunction with `mlm` flag.")
+    parser.add_argument("--alpha_clm", default=0.5, type=float,
+                        help="Linear weight for the CLM loss. Must be >=0.")
    parser.add_argument("--alpha_mse", default=0.0, type=float,
                        help="Linear weight of the MSE loss. Must be >=0.")
+    parser.add_argument("--alpha_cos", default=0.0, type=float,
+                        help="Linear weight of the cosine embedding loss. Must be >=0.")
+
+    parser.add_argument("--mlm", action="store_true",
+                        help="The LM step: MLM or CLM. If `mlm` is True, the MLM is used over CLM.")
    parser.add_argument("--mlm_mask_prop", default=0.15, type=float,
                        help="Proportion of tokens for which we need to make a prediction.")
    parser.add_argument("--word_mask", default=0.8, type=float,
@@ -91,17 +124,20 @@ def main():
                        help="Proportion of tokens to randomly replace.")
    parser.add_argument("--mlm_smoothing", default=0.7, type=float,
                        help="Smoothing parameter to emphasize more rare tokens (see XLM, similar to word2vec).")
+    parser.add_argument("--token_counts", type=str,
+                        help="The token counts in the data_file for MLM.")
+
    parser.add_argument("--restrict_ce_to_mask", action='store_true',
                        help="If true, compute the distilation loss only the [MLM] prediction distribution.")
+    parser.add_argument("--freeze_pos_embs", action="store_true",
+                        help="Freeze positional embeddings during distillation. For student_type in ['roberta', 'gpt2'] only.")
+    parser.add_argument("--freeze_token_type_embds", action="store_true",
+                        help="Freeze token type embeddings during distillation if existent. For student_type in ['roberta'] only.")

    parser.add_argument("--n_epoch", type=int, default=3,
                        help="Number of pass on the whole dataset.")
    parser.add_argument("--batch_size", type=int, default=5,
                        help="Batch size (for each process).")
-    parser.add_argument("--tokens_per_batch", type=int, default=-1,
-                        help="If specified, modify the batches so that they have approximately this number of tokens.")
-    parser.add_argument("--shuffle", action='store_false',
-                        help="If true, shuffle the sequence order. Default is true.")
    parser.add_argument("--group_by_size", action='store_false',
                        help="If true, group sequences that have similar length into the same batch. Default is true.")

@@ -137,6 +173,7 @@ def main():
    parser.add_argument("--checkpoint_interval", type=int, default=4000,
                        help="Checkpoint interval.")
    args = parser.parse_args()
+    sanity_checks(args)


    ## ARGS ##
@@ -160,18 +197,19 @@ def main():
        with open(os.path.join(args.dump_path, 'parameters.json'), 'w') as f:
            json.dump(vars(args), f, indent=4)
        git_log(args.dump_path)
-    assert (args.from_pretrained_weights is None and args.from_pretrained_config is None) or \
-           (args.from_pretrained_weights is not None and args.from_pretrained_config is not None)

+    student_config_class, student_model_class, _ = MODEL_CLASSES[args.student_type]
+    teacher_config_class, teacher_model_class, teacher_tokenizer_class = MODEL_CLASSES[args.teacher_type]

    ### TOKENIZER ###
-    bert_tokenizer = BertTokenizer.from_pretrained(args.bert_model)
+    tokenizer = teacher_tokenizer_class.from_pretrained(args.teacher_name)
    special_tok_ids = {}
-    for tok_name, tok_symbol in bert_tokenizer.special_tokens_map.items():
-        idx = bert_tokenizer.all_special_tokens.index(tok_symbol)
-        special_tok_ids[tok_name] = bert_tokenizer.all_special_ids[idx]
+    for tok_name, tok_symbol in tokenizer.special_tokens_map.items():
+        idx = tokenizer.all_special_tokens.index(tok_symbol)
+        special_tok_ids[tok_name] = tokenizer.all_special_ids[idx]
    logger.info(f'Special tokens {special_tok_ids}')
    args.special_tok_ids = special_tok_ids
+    args.max_model_input_size = tokenizer.max_model_input_sizes[args.teacher_name]


    ## DATA LOADER ##
@@ -180,34 +218,34 @@ def main():
        data = pickle.load(fp)


-    assert os.path.isfile(args.token_counts)
-    logger.info(f'Loading token counts from {args.token_counts} (already pre-computed)')
-    with open(args.token_counts, 'rb') as fp:
-        counts = pickle.load(fp)
-        assert len(counts) == args.vocab_size
-    token_probs = np.maximum(counts, 1) ** -args.mlm_smoothing
-    for idx in special_tok_ids.values():
-        token_probs[idx] = 0.  # do not predict special tokens
-    token_probs = torch.from_numpy(token_probs)
+    if args.mlm:
+        logger.info(f'Loading token counts from {args.token_counts} (already pre-computed)')
+        with open(args.token_counts, 'rb') as fp:
+            counts = pickle.load(fp)
+        
+        token_probs = np.maximum(counts, 1) ** -args.mlm_smoothing
+        for idx in special_tok_ids.values():
+            token_probs[idx] = 0.  # do not predict special tokens
+        token_probs = torch.from_numpy(token_probs)
+    else:
+        token_probs = None


-    train_dataloader = Dataset(params=args, data=data)
+    train_lm_seq_dataset = LmSeqsDataset(params=args, data=data)
    logger.info(f'Data loader created.')


    ## STUDENT ##
-    if args.from_pretrained_weights is not None:
-        assert os.path.isfile(os.path.join(args.from_pretrained_weights))
-        assert os.path.isfile(os.path.join(args.from_pretrained_config))
-        logger.info(f'Loading pretrained weights from {args.from_pretrained_weights}')
-        logger.info(f'Loading pretrained config from {args.from_pretrained_config}')
-        stu_architecture_config = DistilBertConfig.from_json_file(args.from_pretrained_config)
-        student = DistilBertForMaskedLM.from_pretrained(args.from_pretrained_weights,
-                                                     config=stu_architecture_config)
+    logger.info(f'Loading student config from {args.student_config}')
+    stu_architecture_config = student_config_class.from_pretrained(args.student_config)
+    stu_architecture_config.output_hidden_states = True
+
+    if args.student_pretrained_weights is not None:
+        logger.info(f'Loading pretrained weights from {args.student_pretrained_weights}')
+        student = student_model_class.from_pretrained(args.student_pretrained_weights,
+                                                      config=stu_architecture_config)
    else:
-        args.vocab_size_or_config_json_file = args.vocab_size
-        stu_architecture_config = DistilBertConfig(**vars(args))
-        student = DistilBertForMaskedLM(stu_architecture_config)
+        student = student_model_class(stu_architecture_config)


    if args.n_gpu > 0:
@@ -216,15 +254,31 @@ def main():


    ## TEACHER ##
-    teacher = BertForMaskedLM.from_pretrained(args.bert_model)
+    teacher = teacher_model_class.from_pretrained(args.teacher_name, output_hidden_states=True)
    if args.n_gpu > 0:
        teacher.to(f'cuda:{args.local_rank}')
-    logger.info(f'Teacher loaded from {args.bert_model}.')
+    logger.info(f'Teacher loaded from {args.teacher_name}.')
+
+
+    ## FREEZING ##
+    if args.freeze_pos_embs:
+        freeze_pos_embeddings(student, args)
+    if args.freeze_token_type_embds:
+        freeze_token_type_embeddings(student, args)
+
+
+    ## SANITY CHECKS ##
+    assert student.config.vocab_size == teacher.config.vocab_size
+    assert student.config.hidden_size == teacher.config.hidden_size
+    assert student.config.max_position_embeddings == teacher.config.max_position_embeddings
+    if args.mlm:
+        assert token_probs.size(0) == stu_architecture_config.vocab_size
+

    ## DISTILLER ##
    torch.cuda.empty_cache()
    distiller = Distiller(params=args,
-                          dataloader=train_dataloader,
+                          dataset=train_lm_seq_dataset,
                          token_probs=token_probs,
                          student=student,
                          teacher=teacher)
--- a/examples/distillation/training_configs/distilbert-base-uncased.json
+++ b/examples/distillation/training_configs/distilbert-base-uncased.json
@@ -0,0 +1,15 @@
+{
+	"activation": "gelu",
+	"attention_dropout": 0.1,
+	"dim": 768,
+	"dropout": 0.1,
+	"hidden_dim": 3072,
+	"initializer_range": 0.02,
+	"max_position_embeddings": 512,
+	"n_heads": 12,
+	"n_layers": 6,
+	"sinusoidal_pos_embds": true,
+	"tie_weights_": true,
+	"vocab_size": 30522
+  }
+  
--- a/examples/distillation/training_configs/distilgpt2.json
+++ b/examples/distillation/training_configs/distilgpt2.json
@@ -0,0 +1,10 @@
+{
+	"initializer_range": 0.02,
+	"layer_norm_epsilon": 0.00001,
+	"n_ctx": 1024,
+	"n_embd": 768,
+	"n_head": 12,
+	"n_layer": 6,
+	"n_positions": 1024,
+	"vocab_size": 50257
+}
--- a/examples/lm_finetuning/README.md
+++ b/examples/lm_finetuning/README.md
@@ -1,64 +0,0 @@
-# BERT Model Finetuning using Masked Language Modeling objective
-
-## Introduction
-
-The three example scripts in this folder can be used to **fine-tune** a pre-trained BERT model using the pretraining objective (combination of masked language modeling and next sentence prediction loss). In general, pretrained models like BERT are first trained with a pretraining objective (masked language modeling and next sentence prediction for BERT) on a large and general natural language corpus. A classifier head is then added on top of the pre-trained architecture and the model is quickly fine-tuned on a target task, while still (hopefully) retaining its general language understanding. This greatly reduces overfitting and yields state-of-the-art results, especially when training data for the target task are limited.
-
-The [ULMFiT paper](https://arxiv.org/abs/1801.06146) took a slightly different approach, however, and added an intermediate step in which the model is fine-tuned on text **from the same domain as the target task and using the pretraining objective** before the final stage in which the classifier head is added and the model is trained on the target task itself. This paper reported significantly improved results from this step, and found that they could get high-quality classifications even with only tiny numbers (<1000) of labelled training examples, as long as they had a lot of unlabelled data from the target domain.
-
-Although this wasn't covered in the original BERT paper, domain-specific fine-tuning of Transformer models has [recently been reported by other authors](https://arxiv.org/pdf/1905.05583.pdf), and they report performance improvements as well.
-
-## Input format
-
-The scripts in this folder expect a single file as input, consisting of untokenized text, with one **sentence** per line, and one blank line between documents. The reason for the sentence splitting is that part of BERT's training involves a _next sentence_ objective in which the model must predict whether two sequences of text are contiguous text from the same document or not, and to avoid making the task _too easy_, the split point between the sequences is always at the end of a sentence. The linebreaks in the file are therefore necessary to mark the points where the text can be split.
-
-## Usage
-
-There are two ways to fine-tune a language model using these scripts. The first _quick_ approach is to use [`simple_lm_finetuning.py`](./simple_lm_finetuning.py). This script does everything in a single script, but generates training instances that consist of just two sentences. This is quite different from the BERT paper, where (confusingly) the NextSentence task concatenated sentences together from each document to form two long multi-sentences, which the paper just referred to as _sentences_. The difference between this simple approach and the original paper approach can have a significant effect for long sequences since two sentences will be much shorter than the max sequence length. In this case, most of each training example will just consist of blank padding characters, which wastes a lot of computation and results in a model that isn't really training on long sequences.
-
-As such, the preferred approach (assuming you have documents containing multiple contiguous sentences from your target domain) is to use [`pregenerate_training_data.py`](./pregenerate_training_data.py) to pre-process your data into training examples following the methodology used for LM training in the original BERT paper and repository. Since there is a significant random component to training data generation for BERT, this script includes an option to generate multiple _epochs_ of pre-processed data, to avoid training on the same random splits each epoch. Generating an epoch of data for each training epoch should result a better final model, and so we recommend doing so.
-
-You can then train on the pregenerated data using [`finetune_on_pregenerated.py`](./finetune_on_pregenerated.py), and pointing it to the folder created by [`pregenerate_training_data.py`](./pregenerate_training_data.py). Note that you should use the same `bert_model` and case options for both! Also note that `max_seq_len` does not need to be specified for the [`finetune_on_pregenerated.py`](./finetune_on_pregenerated.py) script, as it is inferred from the training examples.
-
-There are various options that can be tweaked, but they are mostly set to the values from the BERT paper/repository and default values should make sense. The most relevant ones are:
-
- `--max_seq_len`: Controls the length of training examples (in wordpiece tokens) seen by the model. Defaults to 128 but can be set as high as 512. Higher values may yield stronger language models at the cost of slower and more memory-intensive training.
- `--fp16`: Enables fast half-precision training on recent GPUs.
-
-In addition, if memory usage is an issue, especially when training on a single GPU, reducing `--train_batch_size` from the default 32 to a lower number (4-16) can be helpful, or leaving `--train_batch_size` at the default and increasing `--gradient_accumulation_steps` to 2-8. Changing `--gradient_accumulation_steps` may be preferable as alterations to the batch size may require corresponding changes in the learning rate to compensate. There is also a `--reduce_memory` option for both the `pregenerate_training_data.py` and `finetune_on_pregenerated.py` scripts that spills data to disc in shelf objects or numpy memmaps rather than retaining it in memory, which significantly reduces memory usage with little performance impact.
-
-## Examples
-
-### Simple fine-tuning
-
-```
-python3 simple_lm_finetuning.py 
--train_corpus my_corpus.txt 
--bert_model bert-base-uncased 
--do_lower_case 
--output_dir finetuned_lm/
--do_train
-```
-
-### Pregenerating training data
-
-```
-python3 pregenerate_training_data.py
--train_corpus my_corpus.txt
--bert_model bert-base-uncased
--do_lower_case
--output_dir training/
--epochs_to_generate 3
--max_seq_len 256
-```
-
-### Training on pregenerated data
-
-```
-python3 finetune_on_pregenerated.py
--pregenerated_data training/
--bert_model bert-base-uncased
--do_lower_case
--output_dir finetuned_lm/
--epochs 3
-```
--- a/examples/lm_finetuning/finetune_on_pregenerated.py
+++ b/examples/lm_finetuning/finetune_on_pregenerated.py
@@ -1,346 +0,0 @@
-from argparse import ArgumentParser
-from pathlib import Path
-import os
-import torch
-import logging
-import json
-import random
-import numpy as np
-from collections import namedtuple
-from tempfile import TemporaryDirectory
-
-from torch.utils.data import DataLoader, Dataset, RandomSampler
-from torch.utils.data.distributed import DistributedSampler
-from tqdm import tqdm
-
-from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
-from pytorch_transformers.modeling_bert import BertForPreTraining
-from pytorch_transformers.tokenization_bert import BertTokenizer
-from pytorch_transformers.optimization import AdamW, WarmupLinearSchedule
-
-InputFeatures = namedtuple("InputFeatures", "input_ids input_mask segment_ids lm_label_ids is_next")
-
-log_format = '%(asctime)-10s: %(message)s'
-logging.basicConfig(level=logging.INFO, format=log_format)
-
-
-def convert_example_to_features(example, tokenizer, max_seq_length):
-    tokens = example["tokens"]
-    segment_ids = example["segment_ids"]
-    is_random_next = example["is_random_next"]
-    masked_lm_positions = example["masked_lm_positions"]
-    masked_lm_labels = example["masked_lm_labels"]
-
-    assert len(tokens) == len(segment_ids) <= max_seq_length  # The preprocessed data should be already truncated
-    input_ids = tokenizer.convert_tokens_to_ids(tokens)
-    masked_label_ids = tokenizer.convert_tokens_to_ids(masked_lm_labels)
-
-    input_array = np.zeros(max_seq_length, dtype=np.int)
-    input_array[:len(input_ids)] = input_ids
-
-    mask_array = np.zeros(max_seq_length, dtype=np.bool)
-    mask_array[:len(input_ids)] = 1
-
-    segment_array = np.zeros(max_seq_length, dtype=np.bool)
-    segment_array[:len(segment_ids)] = segment_ids
-
-    lm_label_array = np.full(max_seq_length, dtype=np.int, fill_value=-1)
-    lm_label_array[masked_lm_positions] = masked_label_ids
-
-    features = InputFeatures(input_ids=input_array,
-                             input_mask=mask_array,
-                             segment_ids=segment_array,
-                             lm_label_ids=lm_label_array,
-                             is_next=is_random_next)
-    return features
-
-
-class PregeneratedDataset(Dataset):
-    def __init__(self, training_path, epoch, tokenizer, num_data_epochs, reduce_memory=False):
-        self.vocab = tokenizer.vocab
-        self.tokenizer = tokenizer
-        self.epoch = epoch
-        self.data_epoch = epoch % num_data_epochs
-        data_file = training_path / f"epoch_{self.data_epoch}.json"
-        metrics_file = training_path / f"epoch_{self.data_epoch}_metrics.json"
-        assert data_file.is_file() and metrics_file.is_file()
-        metrics = json.loads(metrics_file.read_text())
-        num_samples = metrics['num_training_examples']
-        seq_len = metrics['max_seq_len']
-        self.temp_dir = None
-        self.working_dir = None
-        if reduce_memory:
-            self.temp_dir = TemporaryDirectory()
-            self.working_dir = Path(self.temp_dir.name)
-            input_ids = np.memmap(filename=self.working_dir/'input_ids.memmap',
-                                  mode='w+', dtype=np.int32, shape=(num_samples, seq_len))
-            input_masks = np.memmap(filename=self.working_dir/'input_masks.memmap',
-                                    shape=(num_samples, seq_len), mode='w+', dtype=np.bool)
-            segment_ids = np.memmap(filename=self.working_dir/'segment_ids.memmap',
-                                    shape=(num_samples, seq_len), mode='w+', dtype=np.bool)
-            lm_label_ids = np.memmap(filename=self.working_dir/'lm_label_ids.memmap',
-                                     shape=(num_samples, seq_len), mode='w+', dtype=np.int32)
-            lm_label_ids[:] = -1
-            is_nexts = np.memmap(filename=self.working_dir/'is_nexts.memmap',
-                                 shape=(num_samples,), mode='w+', dtype=np.bool)
-        else:
-            input_ids = np.zeros(shape=(num_samples, seq_len), dtype=np.int32)
-            input_masks = np.zeros(shape=(num_samples, seq_len), dtype=np.bool)
-            segment_ids = np.zeros(shape=(num_samples, seq_len), dtype=np.bool)
-            lm_label_ids = np.full(shape=(num_samples, seq_len), dtype=np.int32, fill_value=-1)
-            is_nexts = np.zeros(shape=(num_samples,), dtype=np.bool)
-        logging.info(f"Loading training examples for epoch {epoch}")
-        with data_file.open() as f:
-            for i, line in enumerate(tqdm(f, total=num_samples, desc="Training examples")):
-                line = line.strip()
-                example = json.loads(line)
-                features = convert_example_to_features(example, tokenizer, seq_len)
-                input_ids[i] = features.input_ids
-                segment_ids[i] = features.segment_ids
-                input_masks[i] = features.input_mask
-                lm_label_ids[i] = features.lm_label_ids
-                is_nexts[i] = features.is_next
-        assert i == num_samples - 1  # Assert that the sample count metric was true
-        logging.info("Loading complete!")
-        self.num_samples = num_samples
-        self.seq_len = seq_len
-        self.input_ids = input_ids
-        self.input_masks = input_masks
-        self.segment_ids = segment_ids
-        self.lm_label_ids = lm_label_ids
-        self.is_nexts = is_nexts
-
-    def __len__(self):
-        return self.num_samples
-
-    def __getitem__(self, item):
-        return (torch.tensor(self.input_ids[item].astype(np.int64)),
-                torch.tensor(self.input_masks[item].astype(np.int64)),
-                torch.tensor(self.segment_ids[item].astype(np.int64)),
-                torch.tensor(self.lm_label_ids[item].astype(np.int64)),
-                torch.tensor(self.is_nexts[item].astype(np.int64)))
-
-
-def main():
-    parser = ArgumentParser()
-    parser.add_argument('--pregenerated_data', type=Path, required=True)
-    parser.add_argument('--output_dir', type=Path, required=True)
-    parser.add_argument("--bert_model", type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, "
-                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
-    parser.add_argument("--do_lower_case", action="store_true")
-    parser.add_argument("--reduce_memory", action="store_true",
-                        help="Store training data as on-disc memmaps to massively reduce memory usage")
-
-    parser.add_argument("--epochs", type=int, default=3, help="Number of epochs to train for")
-    parser.add_argument("--local_rank",
-                        type=int,
-                        default=-1,
-                        help="local_rank for distributed training on gpus")
-    parser.add_argument("--no_cuda",
-                        action='store_true',
-                        help="Whether not to use CUDA when available")
-    parser.add_argument('--gradient_accumulation_steps',
-                        type=int,
-                        default=1,
-                        help="Number of updates steps to accumulate before performing a backward/update pass.")
-    parser.add_argument("--train_batch_size",
-                        default=32,
-                        type=int,
-                        help="Total batch size for training.")
-    parser.add_argument('--fp16',
-                        action='store_true',
-                        help="Whether to use 16-bit float precision instead of 32-bit")
-    parser.add_argument('--loss_scale',
-                        type=float, default=0,
-                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
-                        "0 (default value): dynamic loss scaling.\n"
-                        "Positive power of 2: static loss scaling value.\n")
-    parser.add_argument("--warmup_steps", 
-                        default=0, 
-                        type=int,
-                        help="Linear warmup over warmup_steps.")
-    parser.add_argument("--adam_epsilon", 
-                        default=1e-8, 
-                        type=float,
-                        help="Epsilon for Adam optimizer.")
-    parser.add_argument("--learning_rate",
-                        default=3e-5,
-                        type=float,
-                        help="The initial learning rate for Adam.")
-    parser.add_argument('--seed',
-                        type=int,
-                        default=42,
-                        help="random seed for initialization")
-    args = parser.parse_args()
-
-    assert args.pregenerated_data.is_dir(), \
-        "--pregenerated_data should point to the folder of files made by pregenerate_training_data.py!"
-
-    samples_per_epoch = []
-    for i in range(args.epochs):
-        epoch_file = args.pregenerated_data / f"epoch_{i}.json"
-        metrics_file = args.pregenerated_data / f"epoch_{i}_metrics.json"
-        if epoch_file.is_file() and metrics_file.is_file():
-            metrics = json.loads(metrics_file.read_text())
-            samples_per_epoch.append(metrics['num_training_examples'])
-        else:
-            if i == 0:
-                exit("No training data was found!")
-            print(f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs}).")
-            print("This script will loop over the available data, but training diversity may be negatively impacted.")
-            num_data_epochs = i
-            break
-    else:
-        num_data_epochs = args.epochs
-
-    if args.local_rank == -1 or args.no_cuda:
-        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        n_gpu = torch.cuda.device_count()
-    else:
-        torch.cuda.set_device(args.local_rank)
-        device = torch.device("cuda", args.local_rank)
-        n_gpu = 1
-        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
-        torch.distributed.init_process_group(backend='nccl')
-    logging.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
-        device, n_gpu, bool(args.local_rank != -1), args.fp16))
-
-    if args.gradient_accumulation_steps < 1:
-        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
-                            args.gradient_accumulation_steps))
-
-    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
-
-    random.seed(args.seed)
-    np.random.seed(args.seed)
-    torch.manual_seed(args.seed)
-    if n_gpu > 0:
-        torch.cuda.manual_seed_all(args.seed)
-
-    if args.output_dir.is_dir() and list(args.output_dir.iterdir()):
-        logging.warning(f"Output directory ({args.output_dir}) already exists and is not empty!")
-    args.output_dir.mkdir(parents=True, exist_ok=True)
-
-    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
-
-    total_train_examples = 0
-    for i in range(args.epochs):
-        # The modulo takes into account the fact that we may loop over limited epochs of data
-        total_train_examples += samples_per_epoch[i % len(samples_per_epoch)]
-
-    num_train_optimization_steps = int(
-        total_train_examples / args.train_batch_size / args.gradient_accumulation_steps)
-    if args.local_rank != -1:
-        num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
-
-    # Prepare model
-    model = BertForPreTraining.from_pretrained(args.bert_model)
-    # We don't need to manually call model.half() following Apex's recommend
-    # if args.fp16:
-    #     model.half()
-    model.to(device)
-    if args.local_rank != -1:
-        try:
-            from apex.parallel import DistributedDataParallel as DDP
-        except ImportError:
-            raise ImportError(
-                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
-        model = DDP(model)
-    elif n_gpu > 1:
-        model = torch.nn.DataParallel(model)
-
-    # Prepare optimizer
-    param_optimizer = list(model.named_parameters())
-    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
-    optimizer_grouped_parameters = [
-        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
-         'weight_decay': 0.01},
-        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-    ]
-
-    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps,
-                                     t_total=num_train_optimization_steps)
-
-    if args.fp16:
-        try:
-            # from apex.optimizers import FP16_Optimizer
-            # from apex.optimizers import FusedAdam
-            from apex import amp
-        except ImportError:
-            raise ImportError(
-                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
-
-        # This below line of code is the main upgrade of Apex Fp16 implementation. I chose opt_leve="01"
-        # because it's recommended for typical use by Apex. We can make it configured
-        model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
-
-    # We don't need to use FP16_Optimizer wrapping over FusedAdam as well. Now Apex supports all Pytorch Optimizer
-
-    #     optimizer = FusedAdam(optimizer_grouped_parameters,
-    #                           lr=args.learning_rate,
-    #                           bias_correction=False,
-    #                           max_grad_norm=1.0)
-    #     if args.loss_scale == 0:
-    #         optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
-    #     else:
-    #         optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
-    # else:
-    #     optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-    # scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=num_train_optimization_steps)
-
-    global_step = 0
-    logging.info("***** Running training *****")
-    logging.info(f"  Num examples = {total_train_examples}")
-    logging.info("  Batch size = %d", args.train_batch_size)
-    logging.info("  Num steps = %d", num_train_optimization_steps)
-    model.train()
-    for epoch in range(args.epochs):
-        epoch_dataset = PregeneratedDataset(epoch=epoch, training_path=args.pregenerated_data, tokenizer=tokenizer,
-                                            num_data_epochs=num_data_epochs, reduce_memory=args.reduce_memory)
-        if args.local_rank == -1:
-            train_sampler = RandomSampler(epoch_dataset)
-        else:
-            train_sampler = DistributedSampler(epoch_dataset)
-        train_dataloader = DataLoader(epoch_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
-        tr_loss = 0
-        nb_tr_examples, nb_tr_steps = 0, 0
-        with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch}") as pbar:
-            for step, batch in enumerate(train_dataloader):
-                batch = tuple(t.to(device) for t in batch)
-                input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch
-                outputs = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next)
-                loss = outputs[0]
-                if n_gpu > 1:
-                    loss = loss.mean() # mean() to average on multi-gpu.
-                if args.gradient_accumulation_steps > 1:
-                    loss = loss / args.gradient_accumulation_steps
-                if args.fp16:
-                    # I depricate FP16_Optimizer's backward func and replace as Apex document
-                    # optimizer.backward(loss)
-                    with amp.scale_loss(loss, optimizer) as scaled_loss:
-                        scaled_loss.backward()
-                else:
-                    loss.backward()
-                tr_loss += loss.item()
-                nb_tr_examples += input_ids.size(0)
-                nb_tr_steps += 1
-                pbar.update(1)
-                mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps
-                pbar.set_postfix_str(f"Loss: {mean_loss:.5f}")
-                if (step + 1) % args.gradient_accumulation_steps == 0:
-                    optimizer.step()
-                    scheduler.step()  # Update learning rate schedule
-                    optimizer.zero_grad()
-                    global_step += 1
-
-    # Save a trained model
-    if args.local_rank == -1 or torch.distributed.get_rank() == 0:
-        logging.info("** ** * Saving fine-tuned model ** ** * ")
-        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
-        model_to_save.save_pretrained(args.output_dir)
-        tokenizer.save_pretrained(args.output_dir)
-
-
-if __name__ == '__main__':
-    main()
--- a/examples/lm_finetuning/pregenerate_training_data.py
+++ b/examples/lm_finetuning/pregenerate_training_data.py
@@ -1,354 +0,0 @@
-from argparse import ArgumentParser
-from pathlib import Path
-from tqdm import tqdm, trange
-from tempfile import TemporaryDirectory
-import shelve
-from multiprocessing import Pool
-
-from random import random, randrange, randint, shuffle, choice
-from pytorch_transformers.tokenization_bert import BertTokenizer
-import numpy as np
-import json
-import collections
-
-class DocumentDatabase:
-    def __init__(self, reduce_memory=False):
-        if reduce_memory:
-            self.temp_dir = TemporaryDirectory()
-            self.working_dir = Path(self.temp_dir.name)
-            self.document_shelf_filepath = self.working_dir / 'shelf.db'
-            self.document_shelf = shelve.open(str(self.document_shelf_filepath),
-                                              flag='n', protocol=-1)
-            self.documents = None
-        else:
-            self.documents = []
-            self.document_shelf = None
-            self.document_shelf_filepath = None
-            self.temp_dir = None
-        self.doc_lengths = []
-        self.doc_cumsum = None
-        self.cumsum_max = None
-        self.reduce_memory = reduce_memory
-
-    def add_document(self, document):
-        if not document:
-            return
-        if self.reduce_memory:
-            current_idx = len(self.doc_lengths)
-            self.document_shelf[str(current_idx)] = document
-        else:
-            self.documents.append(document)
-        self.doc_lengths.append(len(document))
-
-    def _precalculate_doc_weights(self):
-        self.doc_cumsum = np.cumsum(self.doc_lengths)
-        self.cumsum_max = self.doc_cumsum[-1]
-
-    def sample_doc(self, current_idx, sentence_weighted=True):
-        # Uses the current iteration counter to ensure we don't sample the same doc twice
-        if sentence_weighted:
-            # With sentence weighting, we sample docs proportionally to their sentence length
-            if self.doc_cumsum is None or len(self.doc_cumsum) != len(self.doc_lengths):
-                self._precalculate_doc_weights()
-            rand_start = self.doc_cumsum[current_idx]
-            rand_end = rand_start + self.cumsum_max - self.doc_lengths[current_idx]
-            sentence_index = randrange(rand_start, rand_end) % self.cumsum_max
-            sampled_doc_index = np.searchsorted(self.doc_cumsum, sentence_index, side='right')
-        else:
-            # If we don't use sentence weighting, then every doc has an equal chance to be chosen
-            sampled_doc_index = (current_idx + randrange(1, len(self.doc_lengths))) % len(self.doc_lengths)
-        assert sampled_doc_index != current_idx
-        if self.reduce_memory:
-            return self.document_shelf[str(sampled_doc_index)]
-        else:
-            return self.documents[sampled_doc_index]
-
-    def __len__(self):
-        return len(self.doc_lengths)
-
-    def __getitem__(self, item):
-        if self.reduce_memory:
-            return self.document_shelf[str(item)]
-        else:
-            return self.documents[item]
-
-    def __enter__(self):
-        return self
-
-    def __exit__(self, exc_type, exc_val, traceback):
-        if self.document_shelf is not None:
-            self.document_shelf.close()
-        if self.temp_dir is not None:
-            self.temp_dir.cleanup()
-
-
-def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens):
-    """Truncates a pair of sequences to a maximum sequence length. Lifted from Google's BERT repo."""
-    while True:
-        total_length = len(tokens_a) + len(tokens_b)
-        if total_length <= max_num_tokens:
-            break
-
-        trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
-        assert len(trunc_tokens) >= 1
-
-        # We want to sometimes truncate from the front and sometimes from the
-        # back to add more randomness and avoid biases.
-        if random() < 0.5:
-            del trunc_tokens[0]
-        else:
-            trunc_tokens.pop()
-
-MaskedLmInstance = collections.namedtuple("MaskedLmInstance",
-                                          ["index", "label"])
-
-def create_masked_lm_predictions(tokens, masked_lm_prob, max_predictions_per_seq, whole_word_mask, vocab_list):
-    """Creates the predictions for the masked LM objective. This is mostly copied from the Google BERT repo, but
-    with several refactors to clean it up and remove a lot of unnecessary variables."""
-    cand_indices = []
-    for (i, token) in enumerate(tokens):
-        if token == "[CLS]" or token == "[SEP]":
-            continue
-        # Whole Word Masking means that if we mask all of the wordpieces
-        # corresponding to an original word. When a word has been split into
-        # WordPieces, the first token does not have any marker and any subsequence
-        # tokens are prefixed with ##. So whenever we see the ## token, we
-        # append it to the previous set of word indexes.
-        #
-        # Note that Whole Word Masking does *not* change the training code
-        # at all -- we still predict each WordPiece independently, softmaxed
-        # over the entire vocabulary.
-        if (whole_word_mask and len(cand_indices) >= 1 and token.startswith("##")):
-            cand_indices[-1].append(i)
-        else:
-            cand_indices.append([i])
-
-    num_to_mask = min(max_predictions_per_seq,
-                      max(1, int(round(len(tokens) * masked_lm_prob))))
-    shuffle(cand_indices)
-    masked_lms = []
-    covered_indexes = set()
-    for index_set in cand_indices:
-        if len(masked_lms) >= num_to_mask:
-            break
-        # If adding a whole-word mask would exceed the maximum number of
-        # predictions, then just skip this candidate.
-        if len(masked_lms) + len(index_set) > num_to_mask:
-            continue
-        is_any_index_covered = False
-        for index in index_set:
-            if index in covered_indexes:
-                is_any_index_covered = True
-                break
-        if is_any_index_covered:
-            continue
-        for index in index_set:
-            covered_indexes.add(index)
-
-            masked_token = None
-            # 80% of the time, replace with [MASK]
-            if random() < 0.8:
-                masked_token = "[MASK]"
-            else:
-                # 10% of the time, keep original
-                if random() < 0.5:
-                    masked_token = tokens[index]
-                # 10% of the time, replace with random word
-                else:
-                    masked_token = choice(vocab_list)
-            masked_lms.append(MaskedLmInstance(index=index, label=tokens[index]))
-            tokens[index] = masked_token
-
-    assert len(masked_lms) <= num_to_mask
-    masked_lms = sorted(masked_lms, key=lambda x: x.index)
-    mask_indices = [p.index for p in masked_lms]
-    masked_token_labels = [p.label for p in masked_lms]
-
-    return tokens, mask_indices, masked_token_labels
-
-
-def create_instances_from_document(
-        doc_database, doc_idx, max_seq_length, short_seq_prob,
-        masked_lm_prob, max_predictions_per_seq, whole_word_mask, vocab_list):
-    """This code is mostly a duplicate of the equivalent function from Google BERT's repo.
-    However, we make some changes and improvements. Sampling is improved and no longer requires a loop in this function.
-    Also, documents are sampled proportionally to the number of sentences they contain, which means each sentence
-    (rather than each document) has an equal chance of being sampled as a false example for the NextSentence task."""
-    document = doc_database[doc_idx]
-    # Account for [CLS], [SEP], [SEP]
-    max_num_tokens = max_seq_length - 3
-
-    # We *usually* want to fill up the entire sequence since we are padding
-    # to `max_seq_length` anyways, so short sequences are generally wasted
-    # computation. However, we *sometimes*
-    # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter
-    # sequences to minimize the mismatch between pre-training and fine-tuning.
-    # The `target_seq_length` is just a rough target however, whereas
-    # `max_seq_length` is a hard limit.
-    target_seq_length = max_num_tokens
-    if random() < short_seq_prob:
-        target_seq_length = randint(2, max_num_tokens)
-
-    # We DON'T just concatenate all of the tokens from a document into a long
-    # sequence and choose an arbitrary split point because this would make the
-    # next sentence prediction task too easy. Instead, we split the input into
-    # segments "A" and "B" based on the actual "sentences" provided by the user
-    # input.
-    instances = []
-    current_chunk = []
-    current_length = 0
-    i = 0
-    while i < len(document):
-        segment = document[i]
-        current_chunk.append(segment)
-        current_length += len(segment)
-        if i == len(document) - 1 or current_length >= target_seq_length:
-            if current_chunk:
-                # `a_end` is how many segments from `current_chunk` go into the `A`
-                # (first) sentence.
-                a_end = 1
-                if len(current_chunk) >= 2:
-                    a_end = randrange(1, len(current_chunk))
-
-                tokens_a = []
-                for j in range(a_end):
-                    tokens_a.extend(current_chunk[j])
-
-                tokens_b = []
-
-                # Random next
-                if len(current_chunk) == 1 or random() < 0.5:
-                    is_random_next = True
-                    target_b_length = target_seq_length - len(tokens_a)
-
-                    # Sample a random document, with longer docs being sampled more frequently
-                    random_document = doc_database.sample_doc(current_idx=doc_idx, sentence_weighted=True)
-
-                    random_start = randrange(0, len(random_document))
-                    for j in range(random_start, len(random_document)):
-                        tokens_b.extend(random_document[j])
-                        if len(tokens_b) >= target_b_length:
-                            break
-                    # We didn't actually use these segments so we "put them back" so
-                    # they don't go to waste.
-                    num_unused_segments = len(current_chunk) - a_end
-                    i -= num_unused_segments
-                # Actual next
-                else:
-                    is_random_next = False
-                    for j in range(a_end, len(current_chunk)):
-                        tokens_b.extend(current_chunk[j])
-                truncate_seq_pair(tokens_a, tokens_b, max_num_tokens)
-
-                assert len(tokens_a) >= 1
-                assert len(tokens_b) >= 1
-
-                tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + tokens_b + ["[SEP]"]
-                # The segment IDs are 0 for the [CLS] token, the A tokens and the first [SEP]
-                # They are 1 for the B tokens and the final [SEP]
-                segment_ids = [0 for _ in range(len(tokens_a) + 2)] + [1 for _ in range(len(tokens_b) + 1)]
-
-                tokens, masked_lm_positions, masked_lm_labels = create_masked_lm_predictions(
-                    tokens, masked_lm_prob, max_predictions_per_seq, whole_word_mask, vocab_list)
-
-                instance = {
-                    "tokens": tokens,
-                    "segment_ids": segment_ids,
-                    "is_random_next": is_random_next,
-                    "masked_lm_positions": masked_lm_positions,
-                    "masked_lm_labels": masked_lm_labels}
-                instances.append(instance)
-            current_chunk = []
-            current_length = 0
-        i += 1
-
-    return instances
-
-
-def create_training_file(docs, vocab_list, args, epoch_num):
-    epoch_filename = args.output_dir / "epoch_{}.json".format(epoch_num)
-    num_instances = 0
-    with epoch_filename.open('w') as epoch_file:
-        for doc_idx in trange(len(docs), desc="Document"):
-            doc_instances = create_instances_from_document(
-                docs, doc_idx, max_seq_length=args.max_seq_len, short_seq_prob=args.short_seq_prob,
-                masked_lm_prob=args.masked_lm_prob, max_predictions_per_seq=args.max_predictions_per_seq,
-                whole_word_mask=args.do_whole_word_mask, vocab_list=vocab_list)
-            doc_instances = [json.dumps(instance) for instance in doc_instances]
-            for instance in doc_instances:
-                epoch_file.write(instance + '\n')
-                num_instances += 1
-    metrics_file = args.output_dir / "epoch_{}_metrics.json".format(epoch_num)
-    with metrics_file.open('w') as metrics_file:
-        metrics = {
-            "num_training_examples": num_instances,
-            "max_seq_len": args.max_seq_len
-        }
-        metrics_file.write(json.dumps(metrics))
-
-
-def main():
-    parser = ArgumentParser()
-    parser.add_argument('--train_corpus', type=Path, required=True)
-    parser.add_argument("--output_dir", type=Path, required=True)
-    parser.add_argument("--bert_model", type=str, required=True,
-                        choices=["bert-base-uncased", "bert-large-uncased", "bert-base-cased",
-                                 "bert-base-multilingual-uncased", "bert-base-chinese", "bert-base-multilingual-cased"])
-    parser.add_argument("--do_lower_case", action="store_true")
-    parser.add_argument("--do_whole_word_mask", action="store_true",
-                        help="Whether to use whole word masking rather than per-WordPiece masking.")
-    parser.add_argument("--reduce_memory", action="store_true",
-                        help="Reduce memory usage for large datasets by keeping data on disc rather than in memory")
-
-    parser.add_argument("--num_workers", type=int, default=1,
-                        help="The number of workers to use to write the files")
-    parser.add_argument("--epochs_to_generate", type=int, default=3,
-                        help="Number of epochs of data to pregenerate")
-    parser.add_argument("--max_seq_len", type=int, default=128)
-    parser.add_argument("--short_seq_prob", type=float, default=0.1,
-                        help="Probability of making a short sentence as a training example")
-    parser.add_argument("--masked_lm_prob", type=float, default=0.15,
-                        help="Probability of masking each token for the LM task")
-    parser.add_argument("--max_predictions_per_seq", type=int, default=20,
-                        help="Maximum number of tokens to mask in each sequence")
-
-    args = parser.parse_args()
-
-    if args.num_workers > 1 and args.reduce_memory:
-        raise ValueError("Cannot use multiple workers while reducing memory")
-
-    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
-    vocab_list = list(tokenizer.vocab.keys())
-    with DocumentDatabase(reduce_memory=args.reduce_memory) as docs:
-        with args.train_corpus.open() as f:
-            doc = []
-            for line in tqdm(f, desc="Loading Dataset", unit=" lines"):
-                line = line.strip()
-                if line == "":
-                    docs.add_document(doc)
-                    doc = []
-                else:
-                    tokens = tokenizer.tokenize(line)
-                    doc.append(tokens)
-            if doc:
-                docs.add_document(doc)  # If the last doc didn't end on a newline, make sure it still gets added
-        if len(docs) <= 1:
-            exit("ERROR: No document breaks were found in the input file! These are necessary to allow the script to "
-                 "ensure that random NextSentences are not sampled from the same document. Please add blank lines to "
-                 "indicate breaks between documents in your input file. If your dataset does not contain multiple "
-                 "documents, blank lines can be inserted at any natural boundary, such as the ends of chapters, "
-                 "sections or paragraphs.")
-
-        args.output_dir.mkdir(exist_ok=True)
-
-        if args.num_workers > 1:
-            writer_workers = Pool(min(args.num_workers, args.epochs_to_generate))
-            arguments = [(docs, vocab_list, args, idx) for idx in range(args.epochs_to_generate)]
-            writer_workers.starmap(create_training_file, arguments)
-        else:
-            for epoch in trange(args.epochs_to_generate, desc="Epoch"):
-                create_training_file(docs, vocab_list, args, epoch)
-
-
-if __name__ == '__main__':
-    main()
--- a/examples/lm_finetuning/simple_lm_finetuning.py
+++ b/examples/lm_finetuning/simple_lm_finetuning.py
@@ -1,641 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""BERT finetuning runner."""
-
-from __future__ import absolute_import, division, print_function, unicode_literals
-
-import argparse
-import logging
-import os
-import random
-from io import open
-
-import numpy as np
-import torch
-from torch.utils.data import DataLoader, Dataset, RandomSampler
-from torch.utils.data.distributed import DistributedSampler
-from tqdm import tqdm, trange
-
-from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
-from pytorch_transformers.modeling_bert import BertForPreTraining
-from pytorch_transformers.tokenization_bert import BertTokenizer
-from pytorch_transformers.optimization import AdamW, WarmupLinearSchedule
-
-logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
-                    datefmt='%m/%d/%Y %H:%M:%S',
-                    level=logging.INFO)
-logger = logging.getLogger(__name__)
-
-
-class BERTDataset(Dataset):
-    def __init__(self, corpus_path, tokenizer, seq_len, encoding="utf-8", corpus_lines=None, on_memory=True):
-        self.vocab = tokenizer.vocab
-        self.tokenizer = tokenizer
-        self.seq_len = seq_len
-        self.on_memory = on_memory
-        self.corpus_lines = corpus_lines  # number of non-empty lines in input corpus
-        self.corpus_path = corpus_path
-        self.encoding = encoding
-        self.current_doc = 0  # to avoid random sentence from same doc
-
-        # for loading samples directly from file
-        self.sample_counter = 0  # used to keep track of full epochs on file
-        self.line_buffer = None  # keep second sentence of a pair in memory and use as first sentence in next pair
-
-        # for loading samples in memory
-        self.current_random_doc = 0
-        self.num_docs = 0
-        self.sample_to_doc = [] # map sample index to doc and line
-
-        # load samples into memory
-        if on_memory:
-            self.all_docs = []
-            doc = []
-            self.corpus_lines = 0
-            with open(corpus_path, "r", encoding=encoding) as f:
-                for line in tqdm(f, desc="Loading Dataset", total=corpus_lines):
-                    line = line.strip()
-                    if line == "":
-                        self.all_docs.append(doc)
-                        doc = []
-                        #remove last added sample because there won't be a subsequent line anymore in the doc
-                        self.sample_to_doc.pop()
-                    else:
-                        #store as one sample
-                        sample = {"doc_id": len(self.all_docs),
-                                  "line": len(doc)}
-                        self.sample_to_doc.append(sample)
-                        doc.append(line)
-                        self.corpus_lines = self.corpus_lines + 1
-
-            # if last row in file is not empty
-            if self.all_docs[-1] != doc:
-                self.all_docs.append(doc)
-                self.sample_to_doc.pop()
-
-            self.num_docs = len(self.all_docs)
-
-        # load samples later lazily from disk
-        else:
-            if self.corpus_lines is None:
-                with open(corpus_path, "r", encoding=encoding) as f:
-                    self.corpus_lines = 0
-                    for line in tqdm(f, desc="Loading Dataset", total=corpus_lines):
-                        if line.strip() == "":
-                            self.num_docs += 1
-                        else:
-                            self.corpus_lines += 1
-
-                    # if doc does not end with empty line
-                    if line.strip() != "":
-                        self.num_docs += 1
-
-            self.file = open(corpus_path, "r", encoding=encoding)
-            self.random_file = open(corpus_path, "r", encoding=encoding)
-
-    def __len__(self):
-        # last line of doc won't be used, because there's no "nextSentence". Additionally, we start counting at 0.
-        return self.corpus_lines - self.num_docs - 1
-
-    def __getitem__(self, item):
-        cur_id = self.sample_counter
-        self.sample_counter += 1
-        if not self.on_memory:
-            # after one epoch we start again from beginning of file
-            if cur_id != 0 and (cur_id % len(self) == 0):
-                self.file.close()
-                self.file = open(self.corpus_path, "r", encoding=self.encoding)
-
-        t1, t2, is_next_label = self.random_sent(item)
-
-        # tokenize
-        tokens_a = self.tokenizer.tokenize(t1)
-        tokens_b = self.tokenizer.tokenize(t2)
-
-        # combine to one sample
-        cur_example = InputExample(guid=cur_id, tokens_a=tokens_a, tokens_b=tokens_b, is_next=is_next_label)
-
-        # transform sample to features
-        cur_features = convert_example_to_features(cur_example, self.seq_len, self.tokenizer)
-
-        cur_tensors = (torch.tensor(cur_features.input_ids),
-                       torch.tensor(cur_features.input_mask),
-                       torch.tensor(cur_features.segment_ids),
-                       torch.tensor(cur_features.lm_label_ids),
-                       torch.tensor(cur_features.is_next))
-
-        return cur_tensors
-
-    def random_sent(self, index):
-        """
-        Get one sample from corpus consisting of two sentences. With prob. 50% these are two subsequent sentences
-        from one doc. With 50% the second sentence will be a random one from another doc.
-        :param index: int, index of sample.
-        :return: (str, str, int), sentence 1, sentence 2, isNextSentence Label
-        """
-        t1, t2 = self.get_corpus_line(index)
-        if random.random() > 0.5:
-            label = 0
-        else:
-            t2 = self.get_random_line()
-            label = 1
-
-        assert len(t1) > 0
-        assert len(t2) > 0
-        return t1, t2, label
-
-    def get_corpus_line(self, item):
-        """
-        Get one sample from corpus consisting of a pair of two subsequent lines from the same doc.
-        :param item: int, index of sample.
-        :return: (str, str), two subsequent sentences from corpus
-        """
-        t1 = ""
-        t2 = ""
-        assert item < self.corpus_lines
-        if self.on_memory:
-            sample = self.sample_to_doc[item]
-            t1 = self.all_docs[sample["doc_id"]][sample["line"]]
-            t2 = self.all_docs[sample["doc_id"]][sample["line"]+1]
-            # used later to avoid random nextSentence from same doc
-            self.current_doc = sample["doc_id"]
-            return t1, t2
-        else:
-            if self.line_buffer is None:
-                # read first non-empty line of file
-                while t1 == "" :
-                    t1 = next(self.file).strip()
-                    t2 = next(self.file).strip()
-            else:
-                # use t2 from previous iteration as new t1
-                t1 = self.line_buffer
-                t2 = next(self.file).strip()
-                # skip empty rows that are used for separating documents and keep track of current doc id
-                while t2 == "" or t1 == "":
-                    t1 = next(self.file).strip()
-                    t2 = next(self.file).strip()
-                    self.current_doc = self.current_doc+1
-            self.line_buffer = t2
-
-        assert t1 != ""
-        assert t2 != ""
-        return t1, t2
-
-    def get_random_line(self):
-        """
-        Get random line from another document for nextSentence task.
-        :return: str, content of one line
-        """
-        # Similar to original tf repo: This outer loop should rarely go for more than one iteration for large
-        # corpora. However, just to be careful, we try to make sure that
-        # the random document is not the same as the document we're processing.
-        for _ in range(10):
-            if self.on_memory:
-                rand_doc_idx = random.randint(0, len(self.all_docs)-1)
-                rand_doc = self.all_docs[rand_doc_idx]
-                line = rand_doc[random.randrange(len(rand_doc))]
-            else:
-                rand_index = random.randint(1, self.corpus_lines if self.corpus_lines < 1000 else 1000)
-                #pick random line
-                for _ in range(rand_index):
-                    line = self.get_next_line()
-            #check if our picked random line is really from another doc like we want it to be
-            if self.current_random_doc != self.current_doc:
-                break
-        return line
-
-    def get_next_line(self):
-        """ Gets next line of random_file and starts over when reaching end of file"""
-        try:
-            line = next(self.random_file).strip()
-            #keep track of which document we are currently looking at to later avoid having the same doc as t1
-            if line == "":
-                self.current_random_doc = self.current_random_doc + 1
-                line = next(self.random_file).strip()
-        except StopIteration:
-            self.random_file.close()
-            self.random_file = open(self.corpus_path, "r", encoding=self.encoding)
-            line = next(self.random_file).strip()
-        return line
-
-
-class InputExample(object):
-    """A single training/test example for the language model."""
-
-    def __init__(self, guid, tokens_a, tokens_b=None, is_next=None, lm_labels=None):
-        """Constructs a InputExample.
-
-        Args:
-            guid: Unique id for the example.
-            tokens_a: string. The untokenized text of the first sequence. For single
-            sequence tasks, only this sequence must be specified.
-            tokens_b: (Optional) string. The untokenized text of the second sequence.
-            Only must be specified for sequence pair tasks.
-            label: (Optional) string. The label of the example. This should be
-            specified for train and dev examples, but not for test examples.
-        """
-        self.guid = guid
-        self.tokens_a = tokens_a
-        self.tokens_b = tokens_b
-        self.is_next = is_next  # nextSentence
-        self.lm_labels = lm_labels  # masked words for language model
-
-
-class InputFeatures(object):
-    """A single set of features of data."""
-
-    def __init__(self, input_ids, input_mask, segment_ids, is_next, lm_label_ids):
-        self.input_ids = input_ids
-        self.input_mask = input_mask
-        self.segment_ids = segment_ids
-        self.is_next = is_next
-        self.lm_label_ids = lm_label_ids
-
-
-def random_word(tokens, tokenizer):
-    """
-    Masking some random tokens for Language Model task with probabilities as in the original BERT paper.
-    :param tokens: list of str, tokenized sentence.
-    :param tokenizer: Tokenizer, object used for tokenization (we need it's vocab here)
-    :return: (list of str, list of int), masked tokens and related labels for LM prediction
-    """
-    output_label = []
-
-    for i, token in enumerate(tokens):
-        prob = random.random()
-        # mask token with 15% probability
-        if prob < 0.15:
-            prob /= 0.15
-
-            # 80% randomly change token to mask token
-            if prob < 0.8:
-                tokens[i] = "[MASK]"
-
-            # 10% randomly change token to random token
-            elif prob < 0.9:
-                tokens[i] = random.choice(list(tokenizer.vocab.items()))[0]
-
-            # -> rest 10% randomly keep current token
-
-            # append current token to output (we will predict these later)
-            try:
-                output_label.append(tokenizer.vocab[token])
-            except KeyError:
-                # For unknown words (should not occur with BPE vocab)
-                output_label.append(tokenizer.vocab["[UNK]"])
-                logger.warning("Cannot find token '{}' in vocab. Using [UNK] insetad".format(token))
-        else:
-            # no masking token (will be ignored by loss function later)
-            output_label.append(-1)
-
-    return tokens, output_label
-
-
-def convert_example_to_features(example, max_seq_length, tokenizer):
-    """
-    Convert a raw sample (pair of sentences as tokenized strings) into a proper training sample with
-    IDs, LM labels, input_mask, CLS and SEP tokens etc.
-    :param example: InputExample, containing sentence input as strings and is_next label
-    :param max_seq_length: int, maximum length of sequence.
-    :param tokenizer: Tokenizer
-    :return: InputFeatures, containing all inputs and labels of one sample as IDs (as used for model training)
-    """
-    tokens_a = example.tokens_a
-    tokens_b = example.tokens_b
-    # Modifies `tokens_a` and `tokens_b` in place so that the total
-    # length is less than the specified length.
-    # Account for [CLS], [SEP], [SEP] with "- 3"
-    _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
-
-    tokens_a, t1_label = random_word(tokens_a, tokenizer)
-    tokens_b, t2_label = random_word(tokens_b, tokenizer)
-    # concatenate lm labels and account for CLS, SEP, SEP
-    lm_label_ids = ([-1] + t1_label + [-1] + t2_label + [-1])
-
-    # The convention in BERT is:
-    # (a) For sequence pairs:
-    #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
-    #  type_ids: 0   0  0    0    0     0       0 0    1  1  1  1   1 1
-    # (b) For single sequences:
-    #  tokens:   [CLS] the dog is hairy . [SEP]
-    #  type_ids: 0   0   0   0  0     0 0
-    #
-    # Where "type_ids" are used to indicate whether this is the first
-    # sequence or the second sequence. The embedding vectors for `type=0` and
-    # `type=1` were learned during pre-training and are added to the wordpiece
-    # embedding vector (and position vector). This is not *strictly* necessary
-    # since the [SEP] token unambigiously separates the sequences, but it makes
-    # it easier for the model to learn the concept of sequences.
-    #
-    # For classification tasks, the first vector (corresponding to [CLS]) is
-    # used as as the "sentence vector". Note that this only makes sense because
-    # the entire model is fine-tuned.
-    tokens = []
-    segment_ids = []
-    tokens.append("[CLS]")
-    segment_ids.append(0)
-    for token in tokens_a:
-        tokens.append(token)
-        segment_ids.append(0)
-    tokens.append("[SEP]")
-    segment_ids.append(0)
-
-    assert len(tokens_b) > 0
-    for token in tokens_b:
-        tokens.append(token)
-        segment_ids.append(1)
-    tokens.append("[SEP]")
-    segment_ids.append(1)
-
-    input_ids = tokenizer.convert_tokens_to_ids(tokens)
-
-    # The mask has 1 for real tokens and 0 for padding tokens. Only real
-    # tokens are attended to.
-    input_mask = [1] * len(input_ids)
-
-    # Zero-pad up to the sequence length.
-    while len(input_ids) < max_seq_length:
-        input_ids.append(0)
-        input_mask.append(0)
-        segment_ids.append(0)
-        lm_label_ids.append(-1)
-
-    assert len(input_ids) == max_seq_length
-    assert len(input_mask) == max_seq_length
-    assert len(segment_ids) == max_seq_length
-    assert len(lm_label_ids) == max_seq_length
-
-    if example.guid < 5:
-        logger.info("*** Example ***")
-        logger.info("guid: %s" % (example.guid))
-        logger.info("tokens: %s" % " ".join(
-                [str(x) for x in tokens]))
-        logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
-        logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
-        logger.info(
-                "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
-        logger.info("LM label: %s " % (lm_label_ids))
-        logger.info("Is next sentence label: %s " % (example.is_next))
-
-    features = InputFeatures(input_ids=input_ids,
-                             input_mask=input_mask,
-                             segment_ids=segment_ids,
-                             lm_label_ids=lm_label_ids,
-                             is_next=example.is_next)
-    return features
-
-
-def main():
-    parser = argparse.ArgumentParser()
-
-    ## Required parameters
-    parser.add_argument("--train_corpus",
-                        default=None,
-                        type=str,
-                        required=True,
-                        help="The input train corpus.")
-    parser.add_argument("--bert_model", default=None, type=str, required=True,
-                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
-                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
-    parser.add_argument("--output_dir",
-                        default=None,
-                        type=str,
-                        required=True,
-                        help="The output directory where the model checkpoints will be written.")
-
-    ## Other parameters
-    parser.add_argument("--max_seq_length",
-                        default=128,
-                        type=int,
-                        help="The maximum total input sequence length after WordPiece tokenization. \n"
-                             "Sequences longer than this will be truncated, and sequences shorter \n"
-                             "than this will be padded.")
-    parser.add_argument("--do_train",
-                        action='store_true',
-                        help="Whether to run training.")
-    parser.add_argument("--train_batch_size",
-                        default=32,
-                        type=int,
-                        help="Total batch size for training.")
-    parser.add_argument("--learning_rate",
-                        default=3e-5,
-                        type=float,
-                        help="The initial learning rate for Adam.")
-    parser.add_argument("--adam_epsilon", 
-                        default=1e-8, 
-                        type=float,
-                        help="Epsilon for Adam optimizer.")
-    parser.add_argument("--num_train_epochs",
-                        default=3.0,
-                        type=float,
-                        help="Total number of training epochs to perform.")
-    parser.add_argument("--warmup_steps", 
-                        default=0, 
-                        type=int,
-                        help="Linear warmup over warmup_steps.")
-    parser.add_argument("--no_cuda",
-                        action='store_true',
-                        help="Whether not to use CUDA when available")
-    parser.add_argument("--on_memory",
-                        action='store_true',
-                        help="Whether to load train samples into memory or use disk")
-    parser.add_argument("--do_lower_case",
-                        action='store_true',
-                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
-    parser.add_argument("--local_rank",
-                        type=int,
-                        default=-1,
-                        help="local_rank for distributed training on gpus")
-    parser.add_argument('--seed',
-                        type=int,
-                        default=42,
-                        help="random seed for initialization")
-    parser.add_argument('--gradient_accumulation_steps',
-                        type=int,
-                        default=1,
-                        help="Number of updates steps to accumualte before performing a backward/update pass.")
-    parser.add_argument('--fp16',
-                        action='store_true',
-                        help="Whether to use 16-bit float precision instead of 32-bit")
-    parser.add_argument('--loss_scale',
-                        type = float, default = 0,
-                        help = "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
-                        "0 (default value): dynamic loss scaling.\n"
-                        "Positive power of 2: static loss scaling value.\n")
-
-    args = parser.parse_args()
-
-    if args.local_rank == -1 or args.no_cuda:
-        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        n_gpu = torch.cuda.device_count()
-    else:
-        torch.cuda.set_device(args.local_rank)
-        device = torch.device("cuda", args.local_rank)
-        n_gpu = 1
-        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
-        torch.distributed.init_process_group(backend='nccl')
-    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
-        device, n_gpu, bool(args.local_rank != -1), args.fp16))
-
-    if args.gradient_accumulation_steps < 1:
-        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
-                            args.gradient_accumulation_steps))
-
-    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
-
-    random.seed(args.seed)
-    np.random.seed(args.seed)
-    torch.manual_seed(args.seed)
-    if n_gpu > 0:
-        torch.cuda.manual_seed_all(args.seed)
-
-    if not args.do_train:
-        raise ValueError("Training is currently the only implemented execution option. Please set `do_train`.")
-
-    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
-        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
-    if not os.path.exists(args.output_dir) and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
-        os.makedirs(args.output_dir)
-
-    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
-
-    #train_examples = None
-    num_train_optimization_steps = None
-    if args.do_train:
-        print("Loading Train Dataset", args.train_corpus)
-        train_dataset = BERTDataset(args.train_corpus, tokenizer, seq_len=args.max_seq_length,
-                                    corpus_lines=None, on_memory=args.on_memory)
-        num_train_optimization_steps = int(
-            len(train_dataset) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
-        if args.local_rank != -1:
-            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
-
-    # Prepare model
-    model = BertForPreTraining.from_pretrained(args.bert_model)
-    if args.fp16:
-        model.half()
-    model.to(device)
-    if args.local_rank != -1:
-        try:
-            from apex.parallel import DistributedDataParallel as DDP
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
-        model = DDP(model)
-    elif n_gpu > 1:
-        model = torch.nn.DataParallel(model)
-
-    # Prepare optimizer
-    if args.do_train:
-        param_optimizer = list(model.named_parameters())
-        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
-        optimizer_grouped_parameters = [
-            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
-            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-            ]
-
-        if args.fp16:
-            try:
-                from apex.optimizers import FP16_Optimizer
-                from apex.optimizers import FusedAdam
-            except ImportError:
-                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
-
-            optimizer = FusedAdam(optimizer_grouped_parameters,
-                                  lr=args.learning_rate,
-                                  bias_correction=False,
-                                  max_grad_norm=1.0)
-            if args.loss_scale == 0:
-                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
-            else:
-                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
-
-        else:
-            optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
-        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=num_train_optimization_steps)
-
-    global_step = 0
-    if args.do_train:
-        logger.info("***** Running training *****")
-        logger.info("  Num examples = %d", len(train_dataset))
-        logger.info("  Batch size = %d", args.train_batch_size)
-        logger.info("  Num steps = %d", num_train_optimization_steps)
-
-        if args.local_rank == -1:
-            train_sampler = RandomSampler(train_dataset)
-        else:
-            #TODO: check if this works with current data generator from disk that relies on next(file)
-            # (it doesn't return item back by index)
-            train_sampler = DistributedSampler(train_dataset)
-        train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
-
-        model.train()
-        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
-            tr_loss = 0
-            nb_tr_examples, nb_tr_steps = 0, 0
-            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
-                batch = tuple(t.to(device) for t in batch)
-                input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch
-                outputs = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next)
-                loss = outputs[0]
-                if n_gpu > 1:
-                    loss = loss.mean() # mean() to average on multi-gpu.
-                if args.gradient_accumulation_steps > 1:
-                    loss = loss / args.gradient_accumulation_steps
-                if args.fp16:
-                    optimizer.backward(loss)
-                else:
-                    loss.backward()
-                tr_loss += loss.item()
-                nb_tr_examples += input_ids.size(0)
-                nb_tr_steps += 1
-                if (step + 1) % args.gradient_accumulation_steps == 0:
-                    optimizer.step()
-                    scheduler.step()  # Update learning rate schedule
-                    optimizer.zero_grad()
-                    global_step += 1
-
-        # Save a trained model
-        if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
-            logger.info("** ** * Saving fine - tuned model ** ** * ")
-            model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
-            model_to_save.save_pretrained(args.output_dir)
-            tokenizer.save_pretrained(args.output_dir)
-
-
-def _truncate_seq_pair(tokens_a, tokens_b, max_length):
-    """Truncates a sequence pair in place to the maximum length."""
-
-    # This is a simple heuristic which will always truncate the longer sequence
-    # one token at a time. This makes more sense than truncating an equal percent
-    # of tokens from each, since if one sequence is very short then each token
-    # that's truncated likely contains more information than a longer sequence.
-    while True:
-        total_length = len(tokens_a) + len(tokens_b)
-        if total_length <= max_length:
-            break
-        if len(tokens_a) > len(tokens_b):
-            tokens_a.pop()
-        else:
-            tokens_b.pop()
-
-
-def accuracy(out, labels):
-    outputs = np.argmax(out, axis=1)
-    return np.sum(outputs == labels)
-
-
-if __name__ == "__main__":
-    main()
--- a/examples/run_bertology.py
+++ b/examples/run_bertology.py
@@ -32,7 +32,7 @@ from torch.utils.data import DataLoader, SequentialSampler, TensorDataset, Subse
 from torch.utils.data.distributed import DistributedSampler
 from torch.nn import CrossEntropyLoss, MSELoss

-from pytorch_transformers import (WEIGHTS_NAME,
+from transformers import (WEIGHTS_NAME,
                                  BertConfig, BertForSequenceClassification, BertTokenizer,
                                  XLMConfig, XLMForSequenceClassification, XLMTokenizer,
                                  XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer)
--- a/examples/run_generation.py
+++ b/examples/run_generation.py
@@ -14,7 +14,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" Conditional text generation with the auto-regressive models of the library (GPT/GPT-2/Transformer-XL/XLNet)
+""" Conditional text generation with the auto-regressive models of the library (GPT/GPT-2/CTRL/Transformer-XL/XLNet)
 """
 from __future__ import absolute_import, division, print_function, unicode_literals

@@ -26,12 +26,14 @@ import torch
 import torch.nn.functional as F
 import numpy as np

-from pytorch_transformers import GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig
+from transformers import GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig, XLMConfig, CTRLConfig

-from pytorch_transformers import GPT2LMHeadModel, GPT2Tokenizer
-from pytorch_transformers import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer
-from pytorch_transformers import XLNetLMHeadModel, XLNetTokenizer
-from pytorch_transformers import TransfoXLLMHeadModel, TransfoXLTokenizer
+from transformers import GPT2LMHeadModel, GPT2Tokenizer
+from transformers import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer
+from transformers import XLNetLMHeadModel, XLNetTokenizer
+from transformers import TransfoXLLMHeadModel, TransfoXLTokenizer
+from transformers import CTRLLMHeadModel, CTRLTokenizer
+from transformers import XLMWithLMHeadModel, XLMTokenizer


 logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
@@ -41,13 +43,15 @@ logger = logging.getLogger(__name__)

 MAX_LENGTH = int(10000)  # Hardcoded max length to avoid infinite loop

-ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig)), ())
+ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig, XLMConfig, CTRLConfig)), ())

 MODEL_CLASSES = {
    'gpt2': (GPT2LMHeadModel, GPT2Tokenizer),
+    'ctrl': (CTRLLMHeadModel, CTRLTokenizer),
    'openai-gpt': (OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
    'xlnet': (XLNetLMHeadModel, XLNetTokenizer),
    'transfo-xl': (TransfoXLLMHeadModel, TransfoXLTokenizer),
+    'xlm': (XLMWithLMHeadModel, XLMTokenizer),
 }

 # Padding text to help Transformer-XL and XLNet with short prompts as proposed by Aman Rusia
@@ -103,7 +107,7 @@ def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')
    return logits


-def sample_sequence(model, length, context, num_samples=1, temperature=1, top_k=0, top_p=0.0, is_xlnet=False, device='cpu'):
+def sample_sequence(model, length, context, num_samples=1, temperature=1, top_k=0, top_p=0.0, repetition_penalty=1.0, is_xlnet=False, xlm_lang=None, device='cpu'):
    context = torch.tensor(context, dtype=torch.long, device=device)
    context = context.unsqueeze(0).repeat(num_samples, 1)
    generated = context
@@ -121,10 +125,21 @@ def sample_sequence(model, length, context, num_samples=1, temperature=1, top_k=
                target_mapping[0, 0, -1] = 1.0  # predict last token
                inputs = {'input_ids': input_ids, 'perm_mask': perm_mask, 'target_mapping': target_mapping}

+            if xlm_lang is not None:
+                inputs["langs"] = torch.tensor([xlm_lang] * inputs["input_ids"].shape[1], device=device).view(1, -1)
+
            outputs = model(**inputs)  # Note: we could also use 'past' with GPT-2/Transfo-XL/XLNet (cached hidden-states)
-            next_token_logits = outputs[0][0, -1, :] / temperature
+            next_token_logits = outputs[0][0, -1, :] / (temperature if temperature > 0 else 1.)
+
+            # reptition penalty from CTRL (https://arxiv.org/abs/1909.05858)
+            for _ in set(generated):
+                next_token_logits[_] /= repetition_penalty
+                
            filtered_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p)
-            next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=1)
+            if temperature == 0: #greedy sampling:
+                next_token = torch.argmax(filtered_logits).unsqueeze(0)
+            else:
+                next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=1)
            generated = torch.cat((generated, next_token.unsqueeze(0)), dim=1)
    return generated

@@ -137,16 +152,25 @@ def main():
                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
    parser.add_argument("--prompt", type=str, default="")
    parser.add_argument("--padding_text", type=str, default="")
+    parser.add_argument("--xlm_lang", type=str, default="", help="Optional language when used with the XLM model.")
    parser.add_argument("--length", type=int, default=20)
-    parser.add_argument("--temperature", type=float, default=1.0)
+    parser.add_argument("--temperature", type=float, default=1.0,
+                        help="temperature of 0 implies greedy sampling")
+    parser.add_argument("--repetition_penalty", type=float, default=1.0,
+                        help="primarily useful for CTRL model; in that case, use 1.2")
    parser.add_argument("--top_k", type=int, default=0)
    parser.add_argument("--top_p", type=float, default=0.9)
    parser.add_argument("--no_cuda", action='store_true',
                        help="Avoid using CUDA when available")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
+    parser.add_argument('--stop_token', type=str, default=None,
+                        help="Token at which text generation is stopped")
    args = parser.parse_args()
-
+    if args.model_type in ["ctrl"]:
+        if args.temperature > 0.7 : 
+            print('CTRL typically works better with lower temperatures (and lower top_k).')
+        
    args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count()

@@ -168,6 +192,18 @@ def main():

    print(args)
    while True:
+        xlm_lang = None
+        # XLM Language usage detailed in the issues #1414
+        if args.model_type in ["xlm"] and hasattr(tokenizer, 'lang2id') and hasattr(model.config, 'use_lang_emb') \
+                and model.config.use_lang_emb:
+            if args.xlm_lang:
+                language = args.xlm_lang
+            else:
+                language = None
+                while language not in tokenizer.lang2id.keys():
+                    language = input("Using XLM. Select language in " + str(list(tokenizer.lang2id.keys())) + " >>> ")
+            xlm_lang = tokenizer.lang2id[language]
+
        raw_text = args.prompt if args.prompt else input("Model prompt >>> ")
        if args.model_type in ["transfo-xl", "xlnet"]:
            # Models with memory likes to have a long prompt for short inputs.
@@ -180,11 +216,16 @@ def main():
            temperature=args.temperature,
            top_k=args.top_k,
            top_p=args.top_p,
-            device=args.device,
+            repetition_penalty=args.repetition_penalty,
            is_xlnet=bool(args.model_type == "xlnet"),
+            xlm_lang=xlm_lang,
+            device=args.device,
        )
        out = out[0, len(context_tokens):].tolist()
-        text = tokenizer.decode(out, clean_up_tokenization_spaces=True)
+
+        text = tokenizer.decode(out, clean_up_tokenization_spaces=True, skip_special_tokens=True)
+        text = text[: text.find(args.stop_token) if args.stop_token else None]
+
        print(text)
        if args.prompt:
            break
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -31,7 +31,7 @@ from torch.utils.data.distributed import DistributedSampler
 from tensorboardX import SummaryWriter
 from tqdm import tqdm, trange

-from pytorch_transformers import (WEIGHTS_NAME, BertConfig,
+from transformers import (WEIGHTS_NAME, BertConfig,
                                  BertForSequenceClassification, BertTokenizer,
                                  RobertaConfig,
                                  RobertaForSequenceClassification,
@@ -39,22 +39,29 @@ from pytorch_transformers import (WEIGHTS_NAME, BertConfig,
                                  XLMConfig, XLMForSequenceClassification,
                                  XLMTokenizer, XLNetConfig,
                                  XLNetForSequenceClassification,
-                                  XLNetTokenizer)
+                                  XLNetTokenizer,
+                                  DistilBertConfig,
+                                  DistilBertForSequenceClassification,
+                                  DistilBertTokenizer)

-from pytorch_transformers import AdamW, WarmupLinearSchedule
+from transformers import AdamW, WarmupLinearSchedule

-from utils_glue import (compute_metrics, convert_examples_to_features,
-                        output_modes, processors)
+from transformers import glue_compute_metrics as compute_metrics
+from transformers import glue_output_modes as output_modes
+from transformers import glue_processors as processors
+from transformers import glue_convert_examples_to_features as convert_examples_to_features

 logger = logging.getLogger(__name__)

-ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, XLMConfig, RobertaConfig)), ())
+ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, XLMConfig, 
+                                                                                RobertaConfig, DistilBertConfig)), ())

 MODEL_CLASSES = {
    'bert': (BertConfig, BertForSequenceClassification, BertTokenizer),
    'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
    'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
    'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
+    'distilbert': (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer)
 }


@@ -128,10 +135,11 @@ def train(args, train_dataset, model, tokenizer):
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1],
-                      'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM and RoBERTa don't use segment_ids
                      'labels':         batch[3]}
+            if args.model_type != 'distilbert':
+                inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None  # XLM, DistilBERT and RoBERTa don't use segment_ids
            outputs = model(**inputs)
-            loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)
+            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean() # mean() to average on multi-gpu parallel training
@@ -148,8 +156,8 @@ def train(args, train_dataset, model, tokenizer):

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
-                scheduler.step()  # Update learning rate schedule
                optimizer.step()
+                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

@@ -218,8 +226,9 @@ def evaluate(args, model, tokenizer, prefix=""):
            with torch.no_grad():
                inputs = {'input_ids':      batch[0],
                          'attention_mask': batch[1],
-                          'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM and RoBERTa don't use segment_ids
                          'labels':         batch[3]}
+                if args.model_type != 'distilbert':
+                    inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None  # XLM, DistilBERT and RoBERTa don't use segment_ids
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]

@@ -240,7 +249,7 @@ def evaluate(args, model, tokenizer, prefix=""):
        result = compute_metrics(eval_task, preds, out_label_ids)
        results.update(result)

-        output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
+        output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results {} *****".format(prefix))
            for key in sorted(result.keys()):
@@ -262,7 +271,7 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
        list(filter(None, args.model_name_or_path.split('/'))).pop(),
        str(args.max_seq_length),
        str(task)))
-    if os.path.exists(cached_features_file):
+    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
@@ -272,15 +281,14 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
            # HACK(label indices are swapped in RoBERTa pretrained model)
            label_list[1], label_list[2] = label_list[2], label_list[1] 
        examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
-        features = convert_examples_to_features(examples, label_list, args.max_seq_length, tokenizer, output_mode,
-            cls_token_at_end=bool(args.model_type in ['xlnet']),            # xlnet has a cls token at the end
-            cls_token=tokenizer.cls_token,
-            cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0,
-            sep_token=tokenizer.sep_token,
-            sep_token_extra=bool(args.model_type in ['roberta']),           # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
-            pad_on_left=bool(args.model_type in ['xlnet']),                 # pad on the left for xlnet
-            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
-            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
+        features = convert_examples_to_features(examples,
+                                                tokenizer,
+                                                label_list=label_list,
+                                                max_length=args.max_seq_length,
+                                                output_mode=output_mode,
+                                                pad_on_left=bool(args.model_type in ['xlnet']),                 # pad on the left for xlnet
+                                                pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
+                                                pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
@@ -291,14 +299,14 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
-    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
-    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
+    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
+    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
-        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
+        all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    elif output_mode == "regression":
-        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float)
+        all_labels = torch.tensor([f.label for f in features], dtype=torch.float)

-    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
+    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
    return dataset


@@ -478,13 +486,15 @@ def main():
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
-            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
+            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
+            prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""
+            
            model = model_class.from_pretrained(checkpoint)
            model.to(args.device)
-            result = evaluate(args, model, tokenizer, prefix=global_step)
+            result = evaluate(args, model, tokenizer, prefix=prefix)
            result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
            results.update(result)

--- a/examples/run_lm_finetuning.py
+++ b/examples/run_lm_finetuning.py
@@ -14,7 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-Fine-tuning the library models for language modeling on WikiText-2 (GPT, GPT-2, BERT, RoBERTa).
+Fine-tuning the library models for language modeling on a text file (GPT, GPT-2, BERT, RoBERTa).
 GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa are fine-tuned
 using a masked language modeling (MLM) loss.
 """
@@ -27,6 +27,8 @@ import logging
 import os
 import pickle
 import random
+import re
+import shutil

 import numpy as np
 import torch
@@ -35,11 +37,12 @@ from torch.utils.data.distributed import DistributedSampler
 from tensorboardX import SummaryWriter
 from tqdm import tqdm, trange

-from pytorch_transformers import (WEIGHTS_NAME, AdamW, WarmupLinearSchedule,
+from transformers import (WEIGHTS_NAME, AdamW, WarmupLinearSchedule,
                                  BertConfig, BertForMaskedLM, BertTokenizer,
                                  GPT2Config, GPT2LMHeadModel, GPT2Tokenizer,
                                  OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer,
-                                  RobertaConfig, RobertaForMaskedLM, RobertaTokenizer)
+                                  RobertaConfig, RobertaForMaskedLM, RobertaTokenizer,
+                                  DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer)


 logger = logging.getLogger(__name__)
@@ -49,7 +52,8 @@ MODEL_CLASSES = {
    'gpt2': (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer),
    'openai-gpt': (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
    'bert': (BertConfig, BertForMaskedLM, BertTokenizer),
-    'roberta': (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer)
+    'roberta': (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer),
+    'distilbert': (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer)
 }


@@ -57,7 +61,7 @@ class TextDataset(Dataset):
    def __init__(self, tokenizer, file_path='train', block_size=512):
        assert os.path.isfile(file_path)
        directory, filename = os.path.split(file_path)
-        cached_features_file = os.path.join(directory, f'cached_lm_{block_size}_{filename}')
+        cached_features_file = os.path.join(directory, 'cached_lm_' + block_size + '_' + filename)

        if os.path.exists(cached_features_file):
            logger.info("Loading features from cached file %s", cached_features_file)
@@ -72,9 +76,8 @@ class TextDataset(Dataset):

            tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))

-            while len(tokenized_text) >= block_size:  # Truncate in block of block_size
-                self.examples.append(tokenizer.add_special_tokens_single_sentence(tokenized_text[:block_size]))
-                tokenized_text = tokenized_text[block_size:]
+            for i in range(0, len(tokenized_text)-block_size+1, block_size): # Truncate in block of block_size
+                self.examples.append(tokenizer.build_inputs_with_special_tokens(tokenized_text[i:i+block_size]))
            # Note that we are loosing the last truncated example here for the sake of simplicity (no padding)
            # If your dataset is small, first you should loook for a bigger one :-) and second you
            # can change this behavior by adding (model specific) padding.
@@ -103,11 +106,43 @@ def set_seed(args):
        torch.cuda.manual_seed_all(args.seed)


+def _rotate_checkpoints(args, checkpoint_prefix, use_mtime=False):
+    if not args.save_total_limit:
+        return
+    if args.save_total_limit <= 0:
+        return
+
+    # Check if we should delete older checkpoint(s)
+    glob_checkpoints = glob.glob(os.path.join(args.output_dir, '{}-*'.format(checkpoint_prefix)))
+    if len(glob_checkpoints) <= args.save_total_limit:
+        return
+
+    ordering_and_checkpoint_path = []
+    for path in glob_checkpoints:
+        if use_mtime:
+            ordering_and_checkpoint_path.append((os.path.getmtime(path), path))
+        else:
+            regex_match = re.match('.*{}-([0-9]+)'.format(checkpoint_prefix), path)
+            if regex_match and regex_match.groups():
+                ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))
+
+    checkpoints_sorted = sorted(ordering_and_checkpoint_path)
+    checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]
+    number_of_checkpoints_to_delete = max(0, len(checkpoints_sorted) - args.save_total_limit)
+    checkpoints_to_be_deleted = checkpoints_sorted[:number_of_checkpoints_to_delete]
+    for checkpoint in checkpoints_to_be_deleted:
+        logger.info("Deleting older checkpoint [{}] due to args.save_total_limit".format(checkpoint))
+        shutil.rmtree(checkpoint)
+
+
 def mask_tokens(inputs, tokenizer, args):
    """ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. """
    labels = inputs.clone()
    # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
-    masked_indices = torch.bernoulli(torch.full(labels.shape, args.mlm_probability)).bool()
+    probability_matrix = torch.full(labels.shape, args.mlm_probability)
+    special_tokens_mask = [tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()]
+    probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
+    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -1  # We only compute loss on masked tokens

    # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
@@ -186,7 +221,7 @@ def train(args, train_dataset, model, tokenizer):
            labels = labels.to(args.device)
            model.train()
            outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels)
-            loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)
+            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
@@ -221,8 +256,9 @@ def train(args, train_dataset, model, tokenizer):
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
+                    checkpoint_prefix = 'checkpoint'
                    # Save model checkpoint
-                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
+                    output_dir = os.path.join(args.output_dir, '{}-{}'.format(checkpoint_prefix, global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
@@ -230,6 +266,8 @@ def train(args, train_dataset, model, tokenizer):
                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

+                    _rotate_checkpoints(args, checkpoint_prefix)
+
            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
@@ -247,7 +285,6 @@ def evaluate(args, model, tokenizer, prefix=""):
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args.output_dir

-    results = {}
    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)

    if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
@@ -282,14 +319,14 @@ def evaluate(args, model, tokenizer, prefix=""):
        "perplexity": perplexity
    }

-    output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
+    output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

-    return results
+    return result


 def main():
@@ -359,6 +396,8 @@ def main():
                        help="Log every X updates steps.")
    parser.add_argument('--save_steps', type=int, default=50,
                        help="Save checkpoint every X updates steps.")
+    parser.add_argument('--save_total_limit', type=int, default=None,
+                        help='Limit the total amount of checkpoints, delete the older checkpoints in the output_dir, does not delete by default')
    parser.add_argument("--eval_all_checkpoints", action='store_true',
                        help="Evaluate all checkpoints starting with the same prefix as model_name_or_path ending and ending with step number")
    parser.add_argument("--no_cuda", action='store_true',
@@ -381,7 +420,7 @@ def main():
    parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
    args = parser.parse_args()

-    if args.model_type in ["bert", "roberta"] and not args.mlm:
+    if args.model_type in ["bert", "roberta", "distilbert"] and not args.mlm:
        raise ValueError("BERT and RoBERTa do not have LM heads but masked LM heads. They must be run using the --mlm "
                         "flag (masked language modeling).")
    if args.eval_data_file is None and args.do_eval:
@@ -480,13 +519,15 @@ def main():
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
-            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
+            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
+            prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""
+            
            model = model_class.from_pretrained(checkpoint)
            model.to(args.device)
-            result = evaluate(args, model, tokenizer, prefix=global_step)
+            result = evaluate(args, model, tokenizer, prefix=prefix)
            result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
            results.update(result)

--- a/examples/run_multiple_choice.py
+++ b/examples/run_multiple_choice.py
@@ -0,0 +1,546 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Finetuning the library models for multiple choice (Bert, Roberta, XLNet)."""
+
+from __future__ import absolute_import, division, print_function
+
+import argparse
+import glob
+import logging
+import os
+import random
+
+
+import numpy as np
+import torch
+from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
+                              TensorDataset)
+from torch.utils.data.distributed import DistributedSampler
+from tensorboardX import SummaryWriter
+from tqdm import tqdm, trange
+
+from transformers import (WEIGHTS_NAME, BertConfig,
+                                  BertForMultipleChoice, BertTokenizer,
+                                  XLNetConfig, XLNetForMultipleChoice,
+                                  XLNetTokenizer, RobertaConfig,
+                                  RobertaForMultipleChoice, RobertaTokenizer)
+
+from transformers import AdamW, WarmupLinearSchedule
+
+from utils_multiple_choice import (convert_examples_to_features, processors)
+
+logger = logging.getLogger(__name__)
+
+ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, RobertaConfig)), ())
+
+MODEL_CLASSES = {
+    'bert': (BertConfig, BertForMultipleChoice, BertTokenizer),
+    'xlnet': (XLNetConfig, XLNetForMultipleChoice, XLNetTokenizer),
+    'roberta': (RobertaConfig, RobertaForMultipleChoice, RobertaTokenizer)
+}
+
+def select_field(features, field):
+    return [
+        [
+            choice[field]
+            for choice in feature.choices_features
+        ]
+        for feature in features
+    ]
+
+
+def simple_accuracy(preds, labels):
+    return (preds == labels).mean()
+
+
+def set_seed(args):
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+    if args.n_gpu > 0:
+        torch.cuda.manual_seed_all(args.seed)
+
+
+def train(args, train_dataset, model, tokenizer):
+    """ Train the model """
+    if args.local_rank in [-1, 0]:
+        tb_writer = SummaryWriter()
+
+    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
+    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
+    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
+
+    if args.max_steps > 0:
+        t_total = args.max_steps
+        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
+    else:
+        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
+
+    # Prepare optimizer and schedule (linear warmup and decay)
+    no_decay = ['bias', 'LayerNorm.weight']
+    optimizer_grouped_parameters = [
+        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
+        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+        ]
+    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
+    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
+    if args.fp16:
+        try:
+            from apex import amp
+        except ImportError:
+            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
+        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
+
+    # multi-gpu training (should be after apex fp16 initialization)
+    if args.n_gpu > 1:
+        model = torch.nn.DataParallel(model)
+
+    # Distributed training (should be after apex fp16 initialization)
+    if args.local_rank != -1:
+        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
+                                                          output_device=args.local_rank,
+                                                          find_unused_parameters=True)
+
+    # Train!
+    logger.info("***** Running training *****")
+    logger.info("  Num examples = %d", len(train_dataset))
+    logger.info("  Num Epochs = %d", args.num_train_epochs)
+    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
+    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
+                   args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
+    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
+    logger.info("  Total optimization steps = %d", t_total)
+
+    global_step = 0
+    tr_loss, logging_loss = 0.0, 0.0
+    best_dev_acc, best_dev_loss = 0.0, 99999999999.0
+    best_steps = 0
+    model.zero_grad()
+    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
+    set_seed(args)  # Added here for reproductibility (even between python 2 and 3)
+    for _ in train_iterator:
+        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
+        for step, batch in enumerate(epoch_iterator):
+            model.train()
+            batch = tuple(t.to(args.device) for t in batch)
+            inputs = {'input_ids':      batch[0],
+                      'attention_mask': batch[1],
+                      'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM don't use segment_ids
+                      'labels':         batch[3]}
+            outputs = model(**inputs)
+            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
+
+            if args.n_gpu > 1:
+                loss = loss.mean() # mean() to average on multi-gpu parallel training
+            if args.gradient_accumulation_steps > 1:
+                loss = loss / args.gradient_accumulation_steps
+
+            if args.fp16:
+                with amp.scale_loss(loss, optimizer) as scaled_loss:
+                    scaled_loss.backward()
+                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
+            else:
+                loss.backward()
+                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+
+            tr_loss += loss.item()
+            if (step + 1) % args.gradient_accumulation_steps == 0:
+
+                optimizer.step()
+                scheduler.step()  # Update learning rate schedule
+                model.zero_grad()
+                global_step += 1
+
+                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
+                    # Log metrics
+                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
+                        results = evaluate(args, model, tokenizer)
+                        for key, value in results.items():
+                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
+                        if results["eval_acc"] > best_dev_acc:
+                            best_dev_acc = results["eval_acc"]
+                            best_dev_loss = results["eval_loss"]
+                            best_steps = global_step
+                            if args.do_test:
+                                results_test = evaluate(args, model, tokenizer, test=True)
+                                for key, value in results_test.items():
+                                    tb_writer.add_scalar('test_{}'.format(key), value, global_step)
+                                logger.info("test acc: %s, loss: %s, global steps: %s", str(results_test['eval_acc']), str(results_test['eval_loss']), str(global_step))
+                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
+                    tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
+                    logger.info("Average loss: %s at global step: %s", str((tr_loss - logging_loss)/args.logging_steps), str(global_step))
+                    logging_loss = tr_loss
+
+                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
+                    # Save model checkpoint
+                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
+                    if not os.path.exists(output_dir):
+                        os.makedirs(output_dir)
+                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+                    model_to_save.save_pretrained(output_dir)
+                    tokenizer.save_vocabulary(output_dir)
+                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
+                    logger.info("Saving model checkpoint to %s", output_dir)
+
+            if args.max_steps > 0 and global_step > args.max_steps:
+                epoch_iterator.close()
+                break
+        if args.max_steps > 0 and global_step > args.max_steps:
+            train_iterator.close()
+            break
+
+    if args.local_rank in [-1, 0]:
+        tb_writer.close()
+
+    return global_step, tr_loss / global_step, best_steps
+
+
+def evaluate(args, model, tokenizer, prefix="", test=False):
+    eval_task_names = (args.task_name,)
+    eval_outputs_dirs = (args.output_dir,)
+
+    results = {}
+    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
+        eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=not test, test=test)
+
+        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
+            os.makedirs(eval_output_dir)
+
+        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
+        # Note that DistributedSampler samples randomly
+        eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
+        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
+
+        # Eval!
+        logger.info("***** Running evaluation {} *****".format(prefix))
+        logger.info("  Num examples = %d", len(eval_dataset))
+        logger.info("  Batch size = %d", args.eval_batch_size)
+        eval_loss = 0.0
+        nb_eval_steps = 0
+        preds = None
+        out_label_ids = None
+        for batch in tqdm(eval_dataloader, desc="Evaluating"):
+            model.eval()
+            batch = tuple(t.to(args.device) for t in batch)
+
+            with torch.no_grad():
+                inputs = {'input_ids':      batch[0],
+                          'attention_mask': batch[1],
+                          'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM don't use segment_ids
+                          'labels':         batch[3]}
+                outputs = model(**inputs)
+                tmp_eval_loss, logits = outputs[:2]
+
+                eval_loss += tmp_eval_loss.mean().item()
+            nb_eval_steps += 1
+            if preds is None:
+                preds = logits.detach().cpu().numpy()
+                out_label_ids = inputs['labels'].detach().cpu().numpy()
+            else:
+                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
+                out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)
+
+        eval_loss = eval_loss / nb_eval_steps
+        preds = np.argmax(preds, axis=1)
+        acc = simple_accuracy(preds, out_label_ids)
+        result = {"eval_acc": acc, "eval_loss": eval_loss}
+        results.update(result)
+
+        output_eval_file = os.path.join(eval_output_dir, "is_test_" + str(test).lower() + "_eval_results.txt")
+
+        with open(output_eval_file, "w") as writer:
+            logger.info("***** Eval results {} *****".format(str(prefix) + " is test:" + str(test)))
+            writer.write("model           =%s\n" % str(args.model_name_or_path))
+            writer.write("total batch size=%d\n" % (args.per_gpu_train_batch_size * args.gradient_accumulation_steps *
+                         (torch.distributed.get_world_size() if args.local_rank != -1 else 1)))
+            writer.write("train num epochs=%d\n" % args.num_train_epochs)
+            writer.write("fp16            =%s\n" % args.fp16)
+            writer.write("max seq length  =%d\n" % args.max_seq_length)
+            for key in sorted(result.keys()):
+                logger.info("  %s = %s", key, str(result[key]))
+                writer.write("%s = %s\n" % (key, str(result[key])))
+    return results
+
+
+def load_and_cache_examples(args, task, tokenizer, evaluate=False, test=False):
+    if args.local_rank not in [-1, 0]:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
+
+    processor = processors[task]()
+    # Load data features from cache or dataset file
+    if evaluate:
+        cached_mode = 'dev'
+    elif test:
+        cached_mode = 'test'
+    else:
+        cached_mode = 'train'
+    assert (evaluate == True and test == True) == False
+    cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}'.format(
+        cached_mode,
+        list(filter(None, args.model_name_or_path.split('/'))).pop(),
+        str(args.max_seq_length),
+        str(task)))
+    if os.path.exists(cached_features_file) and not args.overwrite_cache:
+        logger.info("Loading features from cached file %s", cached_features_file)
+        features = torch.load(cached_features_file)
+    else:
+        logger.info("Creating features from dataset file at %s", args.data_dir)
+        label_list = processor.get_labels()
+        if evaluate:
+            examples = processor.get_dev_examples(args.data_dir)
+        elif test:
+            examples = processor.get_test_examples(args.data_dir)
+        else:
+            examples = processor.get_train_examples(args.data_dir)
+        logger.info("Training number: %s", str(len(examples)))
+        features = convert_examples_to_features(
+            examples,
+            label_list,
+            args.max_seq_length,
+            tokenizer,
+            pad_on_left=bool(args.model_type in ['xlnet']),                 # pad on the left for xlnet
+            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0
+        )
+        if args.local_rank in [-1, 0]:
+            logger.info("Saving features into cached file %s", cached_features_file)
+            torch.save(features, cached_features_file)
+
+    if args.local_rank == 0:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
+
+    # Convert to Tensors and build dataset
+    all_input_ids = torch.tensor(select_field(features, 'input_ids'), dtype=torch.long)
+    all_input_mask = torch.tensor(select_field(features, 'input_mask'), dtype=torch.long)
+    all_segment_ids = torch.tensor(select_field(features, 'segment_ids'), dtype=torch.long)
+    all_label_ids = torch.tensor([f.label for f in features], dtype=torch.long)
+
+    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
+    return dataset
+
+
+def main():
+    parser = argparse.ArgumentParser()
+
+    ## Required parameters
+    parser.add_argument("--data_dir", default=None, type=str, required=True,
+                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
+    parser.add_argument("--model_type", default=None, type=str, required=True,
+                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
+    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
+                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
+    parser.add_argument("--task_name", default=None, type=str, required=True,
+                        help="The name of the task to train selected in the list: " + ", ".join(processors.keys()))
+    parser.add_argument("--output_dir", default=None, type=str, required=True,
+                        help="The output directory where the model predictions and checkpoints will be written.")
+
+    ## Other parameters
+    parser.add_argument("--config_name", default="", type=str,
+                        help="Pretrained config name or path if not the same as model_name")
+    parser.add_argument("--tokenizer_name", default="", type=str,
+                        help="Pretrained tokenizer name or path if not the same as model_name")
+    parser.add_argument("--cache_dir", default="", type=str,
+                        help="Where do you want to store the pre-trained models downloaded from s3")
+    parser.add_argument("--max_seq_length", default=128, type=int,
+                        help="The maximum total input sequence length after tokenization. Sequences longer "
+                             "than this will be truncated, sequences shorter will be padded.")
+    parser.add_argument("--do_train", action='store_true',
+                        help="Whether to run training.")
+    parser.add_argument("--do_eval", action='store_true',
+                        help="Whether to run eval on the dev set.")
+    parser.add_argument("--do_test", action='store_true', help='Whether to run test on the test set')
+    parser.add_argument("--evaluate_during_training", action='store_true',
+                        help="Run evaluation during training at each logging step.")
+    parser.add_argument("--do_lower_case", action='store_true',
+                        help="Set this flag if you are using an uncased model.")
+
+    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
+                        help="Batch size per GPU/CPU for training.")
+    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
+                        help="Batch size per GPU/CPU for evaluation.")
+    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
+                        help="Number of updates steps to accumulate before performing a backward/update pass.")
+    parser.add_argument("--learning_rate", default=5e-5, type=float,
+                        help="The initial learning rate for Adam.")
+    parser.add_argument("--weight_decay", default=0.0, type=float,
+                        help="Weight deay if we apply some.")
+    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
+                        help="Epsilon for Adam optimizer.")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float,
+                        help="Max gradient norm.")
+    parser.add_argument("--num_train_epochs", default=3.0, type=float,
+                        help="Total number of training epochs to perform.")
+    parser.add_argument("--max_steps", default=-1, type=int,
+                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
+    parser.add_argument("--warmup_steps", default=0, type=int,
+                        help="Linear warmup over warmup_steps.")
+
+    parser.add_argument('--logging_steps', type=int, default=50,
+                        help="Log every X updates steps.")
+    parser.add_argument('--save_steps', type=int, default=50,
+                        help="Save checkpoint every X updates steps.")
+    parser.add_argument("--eval_all_checkpoints", action='store_true',
+                        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
+    parser.add_argument("--no_cuda", action='store_true',
+                        help="Avoid using CUDA when available")
+    parser.add_argument('--overwrite_output_dir', action='store_true',
+                        help="Overwrite the content of the output directory")
+    parser.add_argument('--overwrite_cache', action='store_true',
+                        help="Overwrite the cached training and evaluation sets")
+    parser.add_argument('--seed', type=int, default=42,
+                        help="random seed for initialization")
+
+    parser.add_argument('--fp16', action='store_true',
+                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
+    parser.add_argument('--fp16_opt_level', type=str, default='O1',
+                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
+                             "See details at https://nvidia.github.io/apex/amp.html")
+    parser.add_argument("--local_rank", type=int, default=-1,
+                        help="For distributed training: local_rank")
+    parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.")
+    parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
+    args = parser.parse_args()
+
+    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
+        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
+
+    # Setup distant debugging if needed
+    if args.server_ip and args.server_port:
+        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
+        import ptvsd
+        print("Waiting for debugger attach")
+        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
+        ptvsd.wait_for_attach()
+
+    # Setup CUDA, GPU & distributed training
+    if args.local_rank == -1 or args.no_cuda:
+        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+        args.n_gpu = torch.cuda.device_count()
+    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
+        torch.cuda.set_device(args.local_rank)
+        device = torch.device("cuda", args.local_rank)
+        torch.distributed.init_process_group(backend='nccl')
+        args.n_gpu = 1
+    args.device = device
+
+    # Setup logging
+    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
+                        datefmt = '%m/%d/%Y %H:%M:%S',
+                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
+    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+                    args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
+
+    # Set seed
+    set_seed(args)
+
+    # Prepare GLUE task
+    args.task_name = args.task_name.lower()
+    if args.task_name not in processors:
+        raise ValueError("Task not found: %s" % (args.task_name))
+    processor = processors[args.task_name]()
+    label_list = processor.get_labels()
+    num_labels = len(label_list)
+
+    # Load pretrained model and tokenizer
+    if args.local_rank not in [-1, 0]:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
+
+    args.model_type = args.model_type.lower()
+    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
+    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, num_labels=num_labels, finetuning_task=args.task_name)
+    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case)
+    model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config)
+
+    if args.local_rank == 0:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
+
+    model.to(args.device)
+
+    logger.info("Training/evaluation parameters %s", args)
+    best_steps = 0
+
+    # Training
+    if args.do_train:
+        train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
+        global_step, tr_loss, best_steps = train(args, train_dataset, model, tokenizer)
+        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
+
+
+    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
+    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
+        # Create output directory if needed
+        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
+            os.makedirs(args.output_dir)
+
+        logger.info("Saving model checkpoint to %s", args.output_dir)
+        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
+        # They can then be reloaded using `from_pretrained()`
+        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+        model_to_save.save_pretrained(args.output_dir)
+        tokenizer.save_pretrained(args.output_dir)
+
+        # Good practice: save your training arguments together with the trained model
+        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
+
+        # Load a trained model and vocabulary that you have fine-tuned
+        model = model_class.from_pretrained(args.output_dir)
+        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
+        model.to(args.device)
+
+
+    # Evaluation
+    results = {}
+    if args.do_eval and args.local_rank in [-1, 0]:
+        if not args.do_train:
+            args.output_dir = args.model_name_or_path
+        checkpoints = [args.output_dir]
+        if args.eval_all_checkpoints:
+            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
+            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
+        logger.info("Evaluate the following checkpoints: %s", checkpoints)
+        for checkpoint in checkpoints:
+            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
+            prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""
+            
+            model = model_class.from_pretrained(checkpoint)
+            model.to(args.device)
+            result = evaluate(args, model, tokenizer, prefix=prefix)
+            result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
+            results.update(result)
+
+    if args.do_test and args.local_rank in [-1, 0]:
+        if not args.do_train:
+            args.output_dir = args.model_name_or_path
+        checkpoints = [args.output_dir]
+        # if args.eval_all_checkpoints: # can not use this to do test!!
+        #     checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
+        #     logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
+        logger.info("Evaluate the following checkpoints: %s", checkpoints)
+        for checkpoint in checkpoints:
+            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
+            prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""
+            
+            model = model_class.from_pretrained(checkpoint)
+            model.to(args.device)
+            result = evaluate(args, model, tokenizer, prefix=prefix, test=True)
+            result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
+            results.update(result)
+    if best_steps:
+        logger.info("best steps of eval acc is the following checkpoints: %s", best_steps)
+    return results
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -13,7 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" Finetuning the library models for question-answering on SQuAD (Bert, XLM, XLNet)."""
+""" Finetuning the library models for question-answering on SQuAD (DistilBERT, Bert, XLM, XLNet)."""

 from __future__ import absolute_import, division, print_function

@@ -32,14 +32,15 @@ from tqdm import tqdm, trange

 from tensorboardX import SummaryWriter

-from pytorch_transformers import (WEIGHTS_NAME, BertConfig,
+from transformers import (WEIGHTS_NAME, BertConfig,
                                  BertForQuestionAnswering, BertTokenizer,
                                  XLMConfig, XLMForQuestionAnswering,
                                  XLMTokenizer, XLNetConfig,
                                  XLNetForQuestionAnswering,
-                                  XLNetTokenizer)
+                                  XLNetTokenizer,
+                                  DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)

-from pytorch_transformers import AdamW, WarmupLinearSchedule
+from transformers import AdamW, WarmupLinearSchedule

 from utils_squad import (read_squad_examples, convert_examples_to_features,
                         RawResult, write_predictions,
@@ -59,6 +60,7 @@ MODEL_CLASSES = {
    'bert': (BertConfig, BertForQuestionAnswering, BertTokenizer),
    'xlnet': (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer),
    'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer),
+    'distilbert': (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)
 }

 def set_seed(args):
@@ -133,14 +135,15 @@ def train(args, train_dataset, model, tokenizer):
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {'input_ids':       batch[0],
                      'attention_mask':  batch[1], 
-                      'token_type_ids':  None if args.model_type == 'xlm' else batch[2],  
                      'start_positions': batch[3], 
                      'end_positions':   batch[4]}
+            if args.model_type != 'distilbert':
+                inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2]
            if args.model_type in ['xlnet', 'xlm']:
                inputs.update({'cls_index': batch[5],
                               'p_mask':       batch[6]})
            outputs = model(**inputs)
-            loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)
+            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training
@@ -216,9 +219,10 @@ def evaluate(args, model, tokenizer, prefix=""):
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {'input_ids':      batch[0],
-                      'attention_mask': batch[1],
-                      'token_type_ids': None if args.model_type == 'xlm' else batch[2]  # XLM don't use segment_ids
+                      'attention_mask': batch[1]
                      }
+            if args.model_type != 'distilbert':
+                inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2]  # XLM don't use segment_ids
            example_indices = batch[3]
            if args.model_type in ['xlnet', 'xlm']:
                inputs.update({'cls_index': batch[4],
@@ -508,7 +512,7 @@ def main():
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
-            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce model loading logs
+            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce model loading logs

        logger.info("Evaluate the following checkpoints: %s", checkpoints)

--- a/examples/run_tf_glue.py
+++ b/examples/run_tf_glue.py
@@ -0,0 +1,40 @@
+import tensorflow as tf
+import tensorflow_datasets
+from transformers import BertTokenizer, TFBertForSequenceClassification, glue_convert_examples_to_features, BertForSequenceClassification
+
+# Load dataset, tokenizer, model from pretrained model/vocabulary
+tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
+model = TFBertForSequenceClassification.from_pretrained('bert-base-cased')
+data = tensorflow_datasets.load('glue/mrpc')
+
+# Prepare dataset for GLUE as a tf.data.Dataset instance
+train_dataset = glue_convert_examples_to_features(data['train'], tokenizer, 128, 'mrpc')
+valid_dataset = glue_convert_examples_to_features(data['validation'], tokenizer, 128, 'mrpc')
+train_dataset = train_dataset.shuffle(100).batch(32).repeat(2)
+valid_dataset = valid_dataset.batch(64)
+
+# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule 
+optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
+loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
+metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
+model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
+
+# Train and evaluate using tf.keras.Model.fit()
+history = model.fit(train_dataset, epochs=2, steps_per_epoch=115,
+                    validation_data=valid_dataset, validation_steps=7)
+
+# Load the TensorFlow model in PyTorch for inspection
+model.save_pretrained('./save/')
+pytorch_model = BertForSequenceClassification.from_pretrained('./save/', from_tf=True)
+
+# Quickly test a few predictions - MRPC is a paraphrasing task, let's see if our model learned the task
+sentence_0 = "This research was consistent with his findings."
+sentence_1 = "His findings were compatible with this research."
+sentence_2 = "His findings were not compatible with this research."
+inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors='pt')
+inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors='pt')
+
+pred_1 = pytorch_model(**inputs_1)[0].argmax().item()
+pred_2 = pytorch_model(**inputs_2)[0].argmax().item()
+print("sentence_1 is", "a paraphrase" if pred_1 else "not a paraphrase", "of sentence_0")
+print("sentence_2 is", "a paraphrase" if pred_2 else "not a paraphrase", "of sentence_0")
--- a/examples/single_model_scripts/run_swag.py
+++ b/examples/single_model_scripts/run_swag.py
@@ -1,555 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""BERT finetuning runner."""
-
-from __future__ import absolute_import
-
-import argparse
-import csv
-import logging
-import os
-import random
-import sys
-from io import open
-
-import numpy as np
-import torch
-from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
-                              TensorDataset)
-from torch.utils.data.distributed import DistributedSampler
-from tqdm import tqdm, trange
-
-from pytorch_transformers.file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME
-from pytorch_transformers.modeling_bert import BertForMultipleChoice, BertConfig
-from pytorch_transformers.optimization import AdamW, WarmupLinearSchedule
-from pytorch_transformers.tokenization_bert import BertTokenizer
-
-logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
-                    datefmt = '%m/%d/%Y %H:%M:%S',
-                    level = logging.INFO)
-logger = logging.getLogger(__name__)
-
-
-class SwagExample(object):
-    """A single training/test example for the SWAG dataset."""
-    def __init__(self,
-                 swag_id,
-                 context_sentence,
-                 start_ending,
-                 ending_0,
-                 ending_1,
-                 ending_2,
-                 ending_3,
-                 label = None):
-        self.swag_id = swag_id
-        self.context_sentence = context_sentence
-        self.start_ending = start_ending
-        self.endings = [
-            ending_0,
-            ending_1,
-            ending_2,
-            ending_3,
-        ]
-        self.label = label
-
-    def __str__(self):
-        return self.__repr__()
-
-    def __repr__(self):
-        l = [
-            "swag_id: {}".format(self.swag_id),
-            "context_sentence: {}".format(self.context_sentence),
-            "start_ending: {}".format(self.start_ending),
-            "ending_0: {}".format(self.endings[0]),
-            "ending_1: {}".format(self.endings[1]),
-            "ending_2: {}".format(self.endings[2]),
-            "ending_3: {}".format(self.endings[3]),
-        ]
-
-        if self.label is not None:
-            l.append("label: {}".format(self.label))
-
-        return ", ".join(l)
-
-
-class InputFeatures(object):
-    def __init__(self,
-                 example_id,
-                 choices_features,
-                 label
-
-    ):
-        self.example_id = example_id
-        self.choices_features = [
-            {
-                'input_ids': input_ids,
-                'input_mask': input_mask,
-                'segment_ids': segment_ids
-            }
-            for _, input_ids, input_mask, segment_ids in choices_features
-        ]
-        self.label = label
-
-
-def read_swag_examples(input_file, is_training):
-    with open(input_file, 'r', encoding='utf-8') as f:
-        reader = csv.reader(f)
-        lines = []
-        for line in reader:
-            if sys.version_info[0] == 2:
-                line = list(unicode(cell, 'utf-8') for cell in line)
-            lines.append(line)
-
-    if is_training and lines[0][-1] != 'label':
-        raise ValueError(
-            "For training, the input file must contain a label column."
-        )
-
-    examples = [
-        SwagExample(
-            swag_id = line[2],
-            context_sentence = line[4],
-            start_ending = line[5], # in the swag dataset, the
-                                         # common beginning of each
-                                         # choice is stored in "sent2".
-            ending_0 = line[7],
-            ending_1 = line[8],
-            ending_2 = line[9],
-            ending_3 = line[10],
-            label = int(line[11]) if is_training else None
-        ) for line in lines[1:] # we skip the line with the column names
-    ]
-
-    return examples
-
-def convert_examples_to_features(examples, tokenizer, max_seq_length,
-                                 is_training):
-    """Loads a data file into a list of `InputBatch`s."""
-
-    # Swag is a multiple choice task. To perform this task using Bert,
-    # we will use the formatting proposed in "Improving Language
-    # Understanding by Generative Pre-Training" and suggested by
-    # @jacobdevlin-google in this issue
-    # https://github.com/google-research/bert/issues/38.
-    #
-    # Each choice will correspond to a sample on which we run the
-    # inference. For a given Swag example, we will create the 4
-    # following inputs:
-    # - [CLS] context [SEP] choice_1 [SEP]
-    # - [CLS] context [SEP] choice_2 [SEP]
-    # - [CLS] context [SEP] choice_3 [SEP]
-    # - [CLS] context [SEP] choice_4 [SEP]
-    # The model will output a single value for each input. To get the
-    # final decision of the model, we will run a softmax over these 4
-    # outputs.
-    features = []
-    for example_index, example in enumerate(examples):
-        context_tokens = tokenizer.tokenize(example.context_sentence)
-        start_ending_tokens = tokenizer.tokenize(example.start_ending)
-
-        choices_features = []
-        for ending_index, ending in enumerate(example.endings):
-            # We create a copy of the context tokens in order to be
-            # able to shrink it according to ending_tokens
-            context_tokens_choice = context_tokens[:]
-            ending_tokens = start_ending_tokens + tokenizer.tokenize(ending)
-            # Modifies `context_tokens_choice` and `ending_tokens` in
-            # place so that the total length is less than the
-            # specified length.  Account for [CLS], [SEP], [SEP] with
-            # "- 3"
-            _truncate_seq_pair(context_tokens_choice, ending_tokens, max_seq_length - 3)
-
-            tokens = ["[CLS]"] + context_tokens_choice + ["[SEP]"] + ending_tokens + ["[SEP]"]
-            segment_ids = [0] * (len(context_tokens_choice) + 2) + [1] * (len(ending_tokens) + 1)
-
-            input_ids = tokenizer.convert_tokens_to_ids(tokens)
-            input_mask = [1] * len(input_ids)
-
-            # Zero-pad up to the sequence length.
-            padding = [0] * (max_seq_length - len(input_ids))
-            input_ids += padding
-            input_mask += padding
-            segment_ids += padding
-
-            assert len(input_ids) == max_seq_length
-            assert len(input_mask) == max_seq_length
-            assert len(segment_ids) == max_seq_length
-
-            choices_features.append((tokens, input_ids, input_mask, segment_ids))
-
-        label = example.label
-        if example_index < 5:
-            logger.info("*** Example ***")
-            logger.info("swag_id: {}".format(example.swag_id))
-            for choice_idx, (tokens, input_ids, input_mask, segment_ids) in enumerate(choices_features):
-                logger.info("choice: {}".format(choice_idx))
-                logger.info("tokens: {}".format(' '.join(tokens)))
-                logger.info("input_ids: {}".format(' '.join(map(str, input_ids))))
-                logger.info("input_mask: {}".format(' '.join(map(str, input_mask))))
-                logger.info("segment_ids: {}".format(' '.join(map(str, segment_ids))))
-            if is_training:
-                logger.info("label: {}".format(label))
-
-        features.append(
-            InputFeatures(
-                example_id = example.swag_id,
-                choices_features = choices_features,
-                label = label
-            )
-        )
-
-    return features
-
-def _truncate_seq_pair(tokens_a, tokens_b, max_length):
-    """Truncates a sequence pair in place to the maximum length."""
-
-    # This is a simple heuristic which will always truncate the longer sequence
-    # one token at a time. This makes more sense than truncating an equal percent
-    # of tokens from each, since if one sequence is very short then each token
-    # that's truncated likely contains more information than a longer sequence.
-    while True:
-        total_length = len(tokens_a) + len(tokens_b)
-        if total_length <= max_length:
-            break
-        if len(tokens_a) > len(tokens_b):
-            tokens_a.pop()
-        else:
-            tokens_b.pop()
-
-def accuracy(out, labels):
-    outputs = np.argmax(out, axis=1)
-    return np.sum(outputs == labels)
-
-def select_field(features, field):
-    return [
-        [
-            choice[field]
-            for choice in feature.choices_features
-        ]
-        for feature in features
-    ]
-
-def main():
-    parser = argparse.ArgumentParser()
-
-    ## Required parameters
-    parser.add_argument("--data_dir",
-                        default=None,
-                        type=str,
-                        required=True,
-                        help="The input data dir. Should contain the .csv files (or other data files) for the task.")
-    parser.add_argument("--bert_model", default=None, type=str, required=True,
-                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
-                        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
-                        "bert-base-multilingual-cased, bert-base-chinese.")
-    parser.add_argument("--output_dir",
-                        default=None,
-                        type=str,
-                        required=True,
-                        help="The output directory where the model checkpoints will be written.")
-
-    ## Other parameters
-    parser.add_argument("--max_seq_length",
-                        default=128,
-                        type=int,
-                        help="The maximum total input sequence length after WordPiece tokenization. \n"
-                             "Sequences longer than this will be truncated, and sequences shorter \n"
-                             "than this will be padded.")
-    parser.add_argument("--do_train",
-                        action='store_true',
-                        help="Whether to run training.")
-    parser.add_argument("--do_eval",
-                        action='store_true',
-                        help="Whether to run eval on the dev set.")
-    parser.add_argument("--do_lower_case",
-                        action='store_true',
-                        help="Set this flag if you are using an uncased model.")
-    parser.add_argument("--train_batch_size",
-                        default=32,
-                        type=int,
-                        help="Total batch size for training.")
-    parser.add_argument("--eval_batch_size",
-                        default=8,
-                        type=int,
-                        help="Total batch size for eval.")
-    parser.add_argument("--learning_rate",
-                        default=5e-5,
-                        type=float,
-                        help="The initial learning rate for Adam.")
-    parser.add_argument("--num_train_epochs",
-                        default=3.0,
-                        type=float,
-                        help="Total number of training epochs to perform.")
-    parser.add_argument("--warmup_proportion",
-                        default=0.1,
-                        type=float,
-                        help="Proportion of training to perform linear learning rate warmup for. "
-                             "E.g., 0.1 = 10%% of training.")
-    parser.add_argument("--no_cuda",
-                        action='store_true',
-                        help="Whether not to use CUDA when available")
-    parser.add_argument("--local_rank",
-                        type=int,
-                        default=-1,
-                        help="local_rank for distributed training on gpus")
-    parser.add_argument('--seed',
-                        type=int,
-                        default=42,
-                        help="random seed for initialization")
-    parser.add_argument('--gradient_accumulation_steps',
-                        type=int,
-                        default=1,
-                        help="Number of updates steps to accumulate before performing a backward/update pass.")
-    parser.add_argument('--fp16',
-                        action='store_true',
-                        help="Whether to use 16-bit float precision instead of 32-bit")
-    parser.add_argument('--loss_scale',
-                        type=float, default=0,
-                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
-                             "0 (default value): dynamic loss scaling.\n"
-                             "Positive power of 2: static loss scaling value.\n")
-
-    args = parser.parse_args()
-
-    if args.local_rank == -1 or args.no_cuda:
-        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        n_gpu = torch.cuda.device_count()
-    else:
-        torch.cuda.set_device(args.local_rank)
-        device = torch.device("cuda", args.local_rank)
-        n_gpu = 1
-        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
-        torch.distributed.init_process_group(backend='nccl')
-    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
-        device, n_gpu, bool(args.local_rank != -1), args.fp16))
-
-    if args.gradient_accumulation_steps < 1:
-        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
-                            args.gradient_accumulation_steps))
-
-    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
-
-    random.seed(args.seed)
-    np.random.seed(args.seed)
-    torch.manual_seed(args.seed)
-    if n_gpu > 0:
-        torch.cuda.manual_seed_all(args.seed)
-
-    if not args.do_train and not args.do_eval:
-        raise ValueError("At least one of `do_train` or `do_eval` must be True.")
-
-    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
-        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
-    if not os.path.exists(args.output_dir):
-        os.makedirs(args.output_dir)
-
-    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
-
-    # Prepare model
-    model = BertForMultipleChoice.from_pretrained(args.bert_model,
-        cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank)),
-        num_choices=4)
-    if args.fp16:
-        model.half()
-    model.to(device)
-    if args.local_rank != -1:
-        try:
-            from apex.parallel import DistributedDataParallel as DDP
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
-
-        model = DDP(model)
-    elif n_gpu > 1:
-        model = torch.nn.DataParallel(model)
-
-    if args.do_train:
-
-        # Prepare data loader
-
-        train_examples = read_swag_examples(os.path.join(args.data_dir, 'train.csv'), is_training = True)
-        train_features = convert_examples_to_features(
-            train_examples, tokenizer, args.max_seq_length, True)
-        all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long)
-        all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long)
-        all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long)
-        all_label = torch.tensor([f.label for f in train_features], dtype=torch.long)
-        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
-        if args.local_rank == -1:
-            train_sampler = RandomSampler(train_data)
-        else:
-            train_sampler = DistributedSampler(train_data)
-        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
-
-        num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
-        if args.local_rank != -1:
-            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
-
-        # Prepare optimizer
-
-        param_optimizer = list(model.named_parameters())
-
-        # hack to remove pooler, which is not used
-        # thus it produce None grad that break apex
-        param_optimizer = [n for n in param_optimizer]
-
-        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
-        optimizer_grouped_parameters = [
-            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
-            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-            ]
-        if args.fp16:
-            try:
-                from apex.optimizers import FP16_Optimizer
-                from apex.optimizers import FusedAdam
-            except ImportError:
-                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
-
-            optimizer = FusedAdam(optimizer_grouped_parameters,
-                                  lr=args.learning_rate,
-                                  bias_correction=False,
-                                  max_grad_norm=1.0)
-            if args.loss_scale == 0:
-                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
-            else:
-                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
-            warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
-                                                 t_total=num_train_optimization_steps)
-        else:
-            optimizer = BertAdam(optimizer_grouped_parameters,
-                                 lr=args.learning_rate,
-                                 warmup=args.warmup_proportion,
-                                 t_total=num_train_optimization_steps)
-
-        global_step = 0
-
-        logger.info("***** Running training *****")
-        logger.info("  Num examples = %d", len(train_examples))
-        logger.info("  Batch size = %d", args.train_batch_size)
-        logger.info("  Num steps = %d", num_train_optimization_steps)
-
-        model.train()
-        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
-            tr_loss = 0
-            nb_tr_examples, nb_tr_steps = 0, 0
-            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
-                batch = tuple(t.to(device) for t in batch)
-                input_ids, input_mask, segment_ids, label_ids = batch
-                loss = model(input_ids, segment_ids, input_mask, label_ids)
-                if n_gpu > 1:
-                    loss = loss.mean() # mean() to average on multi-gpu.
-                if args.fp16 and args.loss_scale != 1.0:
-                    # rescale loss for fp16 training
-                    # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
-                    loss = loss * args.loss_scale
-                if args.gradient_accumulation_steps > 1:
-                    loss = loss / args.gradient_accumulation_steps
-                tr_loss += loss.item()
-                nb_tr_examples += input_ids.size(0)
-                nb_tr_steps += 1
-
-                if args.fp16:
-                    optimizer.backward(loss)
-                else:
-                    loss.backward()
-                if (step + 1) % args.gradient_accumulation_steps == 0:
-                    if args.fp16:
-                        # modify learning rate with special warm up BERT uses
-                        # if args.fp16 is False, BertAdam is used that handles this automatically
-                        lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)
-                        for param_group in optimizer.param_groups:
-                            param_group['lr'] = lr_this_step
-                    optimizer.step()
-                    optimizer.zero_grad()
-                    global_step += 1
-
-
-    if args.do_train:
-        # Save a trained model, configuration and tokenizer
-        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
-
-        # If we save using the predefined names, we can load using `from_pretrained`
-        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
-        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
-
-        torch.save(model_to_save.state_dict(), output_model_file)
-        model_to_save.config.to_json_file(output_config_file)
-        tokenizer.save_vocabulary(args.output_dir)
-
-        # Load a trained model and vocabulary that you have fine-tuned
-        model = BertForMultipleChoice.from_pretrained(args.output_dir, num_choices=4)
-        tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
-    else:
-        model = BertForMultipleChoice.from_pretrained(args.bert_model, num_choices=4)
-    model.to(device)
-
-
-    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
-        eval_examples = read_swag_examples(os.path.join(args.data_dir, 'val.csv'), is_training = True)
-        eval_features = convert_examples_to_features(
-            eval_examples, tokenizer, args.max_seq_length, True)
-        logger.info("***** Running evaluation *****")
-        logger.info("  Num examples = %d", len(eval_examples))
-        logger.info("  Batch size = %d", args.eval_batch_size)
-        all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long)
-        all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long)
-        all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long)
-        all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long)
-        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
-        # Run prediction for full data
-        eval_sampler = SequentialSampler(eval_data)
-        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
-
-        model.eval()
-        eval_loss, eval_accuracy = 0, 0
-        nb_eval_steps, nb_eval_examples = 0, 0
-        for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
-            input_ids = input_ids.to(device)
-            input_mask = input_mask.to(device)
-            segment_ids = segment_ids.to(device)
-            label_ids = label_ids.to(device)
-
-            with torch.no_grad():
-                tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids)
-                logits = model(input_ids, segment_ids, input_mask)
-
-            logits = logits.detach().cpu().numpy()
-            label_ids = label_ids.to('cpu').numpy()
-            tmp_eval_accuracy = accuracy(logits, label_ids)
-
-            eval_loss += tmp_eval_loss.mean().item()
-            eval_accuracy += tmp_eval_accuracy
-
-            nb_eval_examples += input_ids.size(0)
-            nb_eval_steps += 1
-
-        eval_loss = eval_loss / nb_eval_steps
-        eval_accuracy = eval_accuracy / nb_eval_examples
-
-        result = {'eval_loss': eval_loss,
-                  'eval_accuracy': eval_accuracy,
-                  'global_step': global_step,
-                  'loss': tr_loss/global_step}
-
-        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
-        with open(output_eval_file, "w") as writer:
-            logger.info("***** Eval results *****")
-            for key in sorted(result.keys()):
-                logger.info("  %s = %s", key, str(result[key]))
-                writer.write("%s = %s\n" % (key, str(result[key])))
-
-
-if __name__ == "__main__":
-    main()
--- a/examples/utils_multiple_choice.py
+++ b/examples/utils_multiple_choice.py
@@ -0,0 +1,404 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Multiple choice fine-tuning: utilities to work with multiple choice tasks of reading comprehension  """
+
+from __future__ import absolute_import, division, print_function
+
+
+import logging
+import os
+import sys
+from io import open
+import json
+import csv
+import glob
+import tqdm
+from typing import List
+from transformers import PreTrainedTokenizer
+
+
+logger = logging.getLogger(__name__)
+
+
+class InputExample(object):
+    """A single training/test example for multiple choice"""
+
+    def __init__(self, example_id, question, contexts, endings, label=None):
+        """Constructs a InputExample.
+
+        Args:
+            example_id: Unique id for the example.
+            contexts: list of str. The untokenized text of the first sequence (context of corresponding question).
+            question: string. The untokenized text of the second sequence (question).
+            endings: list of str. multiple choice's options. Its length must be equal to contexts' length.
+            label: (Optional) string. The label of the example. This should be
+            specified for train and dev examples, but not for test examples.
+        """
+        self.example_id = example_id
+        self.question = question
+        self.contexts = contexts
+        self.endings = endings
+        self.label = label
+
+
+class InputFeatures(object):
+    def __init__(self,
+                 example_id,
+                 choices_features,
+                 label
+
+    ):
+        self.example_id = example_id
+        self.choices_features = [
+            {
+                'input_ids': input_ids,
+                'input_mask': input_mask,
+                'segment_ids': segment_ids
+            }
+            for input_ids, input_mask, segment_ids in choices_features
+        ]
+        self.label = label
+
+
+class DataProcessor(object):
+    """Base class for data converters for multiple choice data sets."""
+
+    def get_train_examples(self, data_dir):
+        """Gets a collection of `InputExample`s for the train set."""
+        raise NotImplementedError()
+
+    def get_dev_examples(self, data_dir):
+        """Gets a collection of `InputExample`s for the dev set."""
+        raise NotImplementedError()
+
+    def get_test_examples(self, data_dir):
+        """Gets a collection of `InputExample`s for the test set."""
+        raise NotImplementedError()
+
+    def get_labels(self):
+        """Gets the list of labels for this data set."""
+        raise NotImplementedError()
+
+
+class RaceProcessor(DataProcessor):
+    """Processor for the RACE data set."""
+
+    def get_train_examples(self, data_dir):
+        """See base class."""
+        logger.info("LOOKING AT {} train".format(data_dir))
+        high = os.path.join(data_dir, 'train/high')
+        middle = os.path.join(data_dir, 'train/middle')
+        high = self._read_txt(high)
+        middle = self._read_txt(middle)
+        return self._create_examples(high + middle, 'train')
+
+    def get_dev_examples(self, data_dir):
+        """See base class."""
+        logger.info("LOOKING AT {} dev".format(data_dir))
+        high = os.path.join(data_dir, 'dev/high')
+        middle = os.path.join(data_dir, 'dev/middle')
+        high = self._read_txt(high)
+        middle = self._read_txt(middle)
+        return self._create_examples(high + middle, 'dev')
+
+    def get_test_examples(self, data_dir):
+        """See base class."""
+        logger.info("LOOKING AT {} test".format(data_dir))
+        high = os.path.join(data_dir, 'test/high')
+        middle = os.path.join(data_dir, 'test/middle')
+        high = self._read_txt(high)
+        middle = self._read_txt(middle)
+        return self._create_examples(high + middle, 'test')
+
+    def get_labels(self):
+        """See base class."""
+        return ["0", "1", "2", "3"]
+
+    def _read_txt(self, input_dir):
+        lines = []
+        files = glob.glob(input_dir + "/*txt")
+        for file in tqdm.tqdm(files, desc="read files"):
+            with open(file, 'r', encoding='utf-8') as fin:
+                data_raw = json.load(fin)
+                data_raw["race_id"] = file
+                lines.append(data_raw)
+        return lines
+
+
+    def _create_examples(self, lines, set_type):
+        """Creates examples for the training and dev sets."""
+        examples = []
+        for (_, data_raw) in enumerate(lines):
+            race_id = "%s-%s" % (set_type, data_raw["race_id"])
+            article = data_raw["article"]
+            for i in range(len(data_raw["answers"])):
+                truth = str(ord(data_raw['answers'][i]) - ord('A'))
+                question = data_raw['questions'][i]
+                options = data_raw['options'][i]
+
+                examples.append(
+                    InputExample(
+                        example_id=race_id,
+                        question=question,
+                        contexts=[article, article, article, article], # this is not efficient but convenient
+                        endings=[options[0], options[1], options[2], options[3]],
+                        label=truth))
+        return examples
+
+class SwagProcessor(DataProcessor):
+    """Processor for the SWAG data set."""
+
+    def get_train_examples(self, data_dir):
+        """See base class."""
+        logger.info("LOOKING AT {} train".format(data_dir))
+        return self._create_examples(self._read_csv(os.path.join(data_dir, "train.csv")), "train")
+
+    def get_dev_examples(self, data_dir):
+        """See base class."""
+        logger.info("LOOKING AT {} dev".format(data_dir))
+        return self._create_examples(self._read_csv(os.path.join(data_dir, "val.csv")), "dev")
+
+    def get_test_examples(self, data_dir):
+        """See base class."""
+        logger.info("LOOKING AT {} dev".format(data_dir))
+        raise ValueError(
+            "For swag testing, the input file does not contain a label column. It can not be tested in current code"
+            "setting!"
+        )
+        return self._create_examples(self._read_csv(os.path.join(data_dir, "test.csv")), "test")
+    def get_labels(self):
+        """See base class."""
+        return ["0", "1", "2", "3"]
+
+    def _read_csv(self, input_file):
+        with open(input_file, 'r', encoding='utf-8') as f:
+            reader = csv.reader(f)
+            lines = []
+            for line in reader:
+                if sys.version_info[0] == 2:
+                    line = list(unicode(cell, 'utf-8') for cell in line)
+                lines.append(line)
+            return lines
+
+
+    def _create_examples(self, lines: List[List[str]], type: str):
+        """Creates examples for the training and dev sets."""
+        if type == "train" and lines[0][-1] != 'label':
+            raise ValueError(
+                "For training, the input file must contain a label column."
+            )
+
+        examples = [
+            InputExample(
+                example_id=line[2],
+                question=line[5],  # in the swag dataset, the
+                # common beginning of each
+                # choice is stored in "sent2".
+                contexts = [line[4], line[4], line[4], line[4]],
+                endings = [line[7], line[8], line[9], line[10]],
+                label=line[11]
+            ) for line in lines[1:]  # we skip the line with the column names
+        ]
+
+        return examples
+
+
+class ArcProcessor(DataProcessor):
+    """Processor for the ARC data set (request from allennlp)."""
+
+    def get_train_examples(self, data_dir):
+        """See base class."""
+        logger.info("LOOKING AT {} train".format(data_dir))
+        return self._create_examples(self._read_json(os.path.join(data_dir, "train.jsonl")), "train")
+
+    def get_dev_examples(self, data_dir):
+        """See base class."""
+        logger.info("LOOKING AT {} dev".format(data_dir))
+        return self._create_examples(self._read_json(os.path.join(data_dir, "dev.jsonl")), "dev")
+
+    def get_test_examples(self, data_dir):
+        logger.info("LOOKING AT {} test".format(data_dir))
+        return self._create_examples(self._read_json(os.path.join(data_dir, "test.jsonl")), "test")
+
+    def get_labels(self):
+        """See base class."""
+        return ["0", "1", "2", "3"]
+
+    def _read_json(self, input_file):
+        with open(input_file, 'r', encoding='utf-8') as fin:
+            lines = fin.readlines()
+            return lines
+
+
+    def _create_examples(self, lines, type):
+        """Creates examples for the training and dev sets."""
+
+        #There are two types of labels. They should be normalized
+        def normalize(truth):
+            if truth in "ABCD":
+                return ord(truth) - ord("A")
+            elif truth in "1234":
+                return int(truth) - 1
+            else:
+                logger.info("truth ERROR! %s", str(truth))
+                return None
+
+        examples = []
+        three_choice = 0
+        four_choice = 0
+        five_choice = 0
+        other_choices = 0
+        # we deleted example which has more than or less than four choices
+        for line in tqdm.tqdm(lines, desc="read arc data"):
+            data_raw = json.loads(line.strip("\n"))
+            if len(data_raw["question"]["choices"]) == 3:
+                three_choice += 1
+                continue
+            elif len(data_raw["question"]["choices"]) == 5:
+                five_choice += 1
+                continue
+            elif len(data_raw["question"]["choices"]) != 4:
+                other_choices += 1
+                continue
+            four_choice += 1
+            truth = str(normalize(data_raw["answerKey"]))
+            assert truth != "None"
+            question_choices = data_raw["question"]
+            question = question_choices["stem"]
+            id = data_raw["id"]
+            options = question_choices["choices"]
+            if len(options) == 4:
+                examples.append(
+                    InputExample(
+                        example_id = id,
+                        question=question,
+                        contexts=[options[0]["para"].replace("_", ""), options[1]["para"].replace("_", ""),
+                                  options[2]["para"].replace("_", ""), options[3]["para"].replace("_", "")],
+                        endings=[options[0]["text"], options[1]["text"], options[2]["text"], options[3]["text"]],
+                        label=truth))
+
+        if type == "train":
+            assert len(examples) > 1
+            assert examples[0].label is not None
+        logger.info("len examples: %s}", str(len(examples)))
+        logger.info("Three choices: %s", str(three_choice))
+        logger.info("Five choices: %s", str(five_choice))
+        logger.info("Other choices: %s", str(other_choices))
+        logger.info("four choices: %s", str(four_choice))
+
+        return examples
+
+
+def convert_examples_to_features(
+    examples: List[InputExample],
+    label_list: List[str],
+    max_length: int,
+    tokenizer: PreTrainedTokenizer,
+    pad_token_segment_id=0,
+    pad_on_left=False,
+    pad_token=0,
+    mask_padding_with_zero=True,
+) -> List[InputFeatures]:
+    """
+    Loads a data file into a list of `InputFeatures`
+    """
+
+    label_map = {label : i for i, label in enumerate(label_list)}
+
+    features = []
+    for (ex_index, example) in tqdm.tqdm(enumerate(examples), desc="convert examples to features"):
+        if ex_index % 10000 == 0:
+            logger.info("Writing example %d of %d" % (ex_index, len(examples)))
+        choices_features = []
+        for ending_idx, (context, ending) in enumerate(zip(example.contexts, example.endings)):
+            text_a = context
+            if example.question.find("_") != -1:
+                # this is for cloze question
+                text_b = example.question.replace("_", ending)
+            else:
+                text_b = example.question + " " + ending
+
+            inputs = tokenizer.encode_plus(
+                text_a,
+                text_b,
+                add_special_tokens=True,
+                max_length=max_length,
+            )
+            if 'num_truncated_tokens' in inputs and inputs['num_truncated_tokens'] > 0:
+                logger.info('Attention! you are cropping tokens (swag task is ok). '
+                        'If you are training ARC and RACE and you are poping question + options,'
+                        'you need to try to use a bigger max seq length!')
+
+            input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
+
+            # The mask has 1 for real tokens and 0 for padding tokens. Only real
+            # tokens are attended to.
+            attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
+
+            # Zero-pad up to the sequence length.
+            padding_length = max_length - len(input_ids)
+            if pad_on_left:
+                input_ids = ([pad_token] * padding_length) + input_ids
+                attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
+                token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
+            else:
+                input_ids = input_ids + ([pad_token] * padding_length)
+                attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
+                token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)
+
+            assert len(input_ids) == max_length
+            assert len(attention_mask) == max_length
+            assert len(token_type_ids) == max_length
+            choices_features.append((input_ids, attention_mask, token_type_ids))
+
+
+        label = label_map[example.label]
+
+        if ex_index < 2:
+            logger.info("*** Example ***")
+            logger.info("race_id: {}".format(example.example_id))
+            for choice_idx, (input_ids, attention_mask, token_type_ids) in enumerate(choices_features):
+                logger.info("choice: {}".format(choice_idx))
+                logger.info("input_ids: {}".format(' '.join(map(str, input_ids))))
+                logger.info("attention_mask: {}".format(' '.join(map(str, attention_mask))))
+                logger.info("token_type_ids: {}".format(' '.join(map(str, token_type_ids))))
+                logger.info("label: {}".format(label))
+
+        features.append(
+            InputFeatures(
+                example_id=example.example_id,
+                choices_features=choices_features,
+                label=label,
+            )
+        )
+
+    return features
+
+
+
+
+processors = {
+    "race": RaceProcessor,
+    "swag": SwagProcessor,
+    "arc": ArcProcessor
+}
+
+
+MULTIPLE_CHOICE_TASKS_NUM_LABELS = {
+    "race", 4,
+    "swag", 4,
+    "arc", 4
+}
--- a/examples/utils_squad.py
+++ b/examples/utils_squad.py
@@ -24,7 +24,7 @@ import math
 import collections
 from io import open

-from pytorch_transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize
+from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize

 # Required by XLNet evaluation method to compute optimal threshold (see write_predictions_extended() method)
 from utils_squad_evaluate import find_all_best_thresh_v2, make_qid_to_has_ans, get_raw_scores
--- a/hubconf.py
+++ b/hubconf.py
@@ -1,7 +1,7 @@
-from pytorch_transformers import (
+from transformers import (
    AutoTokenizer, AutoConfig, AutoModel, AutoModelWithLMHead, AutoModelForSequenceClassification, AutoModelForQuestionAnswering
 )
-from pytorch_transformers.modeling_utils import add_start_docstrings
+from transformers.file_utils import add_start_docstrings

 dependencies = ['torch', 'tqdm', 'boto3', 'requests', 'regex', 'sentencepiece', 'sacremoses']

@@ -11,12 +11,12 @@ def config(*args, **kwargs):
                # Using torch.hub !
                import torch

-                config = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased')  # Download configuration from S3 and cache.
-                config = torch.hub.load('huggingface/pytorch-transformers', 'config', './test/bert_saved_model/')  # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
-                config = torch.hub.load('huggingface/pytorch-transformers', 'config', './test/bert_saved_model/my_configuration.json')
-                config = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased', output_attention=True, foo=False)
+                config = torch.hub.load('huggingface/transformers', 'config', 'bert-base-uncased')  # Download configuration from S3 and cache.
+                config = torch.hub.load('huggingface/transformers', 'config', './test/bert_saved_model/')  # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
+                config = torch.hub.load('huggingface/transformers', 'config', './test/bert_saved_model/my_configuration.json')
+                config = torch.hub.load('huggingface/transformers', 'config', 'bert-base-uncased', output_attention=True, foo=False)
                assert config.output_attention == True
-                config, unused_kwargs = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased', output_attention=True, foo=False, return_unused_kwargs=True)
+                config, unused_kwargs = torch.hub.load('huggingface/transformers', 'config', 'bert-base-uncased', output_attention=True, foo=False, return_unused_kwargs=True)
                assert config.output_attention == True
                assert unused_kwargs == {'foo': False}

@@ -31,8 +31,8 @@ def tokenizer(*args, **kwargs):
        # Using torch.hub !
        import torch

-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'bert-base-uncased')    # Download vocabulary from S3 and cache.
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', './test/bert_saved_model/')  # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`
+        tokenizer = torch.hub.load('huggingface/transformers', 'tokenizer', 'bert-base-uncased')    # Download vocabulary from S3 and cache.
+        tokenizer = torch.hub.load('huggingface/transformers', 'tokenizer', './test/bert_saved_model/')  # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`

    """

@@ -45,13 +45,13 @@ def model(*args, **kwargs):
            # Using torch.hub !
            import torch

-            model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
-            model = torch.hub.load('huggingface/pytorch-transformers', 'model', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-            model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-uncased', output_attention=True)  # Update configuration during loading
+            model = torch.hub.load('huggingface/transformers', 'model', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
+            model = torch.hub.load('huggingface/transformers', 'model', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+            model = torch.hub.load('huggingface/transformers', 'model', 'bert-base-uncased', output_attention=True)  # Update configuration during loading
            assert model.config.output_attention == True
            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
-            model = torch.hub.load('huggingface/pytorch-transformers', 'model', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+            model = torch.hub.load('huggingface/transformers', 'model', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)

        """

@@ -63,13 +63,13 @@ def modelWithLMHead(*args, **kwargs):
        # Using torch.hub !
        import torch

-        model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
-        model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-        model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', 'bert-base-uncased', output_attention=True)  # Update configuration during loading
+        model = torch.hub.load('huggingface/transformers', 'modelWithLMHead', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
+        model = torch.hub.load('huggingface/transformers', 'modelWithLMHead', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+        model = torch.hub.load('huggingface/transformers', 'modelWithLMHead', 'bert-base-uncased', output_attention=True)  # Update configuration during loading
        assert model.config.output_attention == True
        # Loading from a TF checkpoint file instead of a PyTorch model (slower)
        config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
-        model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+        model = torch.hub.load('huggingface/transformers', 'modelWithLMHead', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)

    """
    return AutoModelWithLMHead.from_pretrained(*args, **kwargs)
@@ -81,13 +81,13 @@ def modelForSequenceClassification(*args, **kwargs):
            # Using torch.hub !
            import torch

-            model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
-            model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-            model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', 'bert-base-uncased', output_attention=True)  # Update configuration during loading
+            model = torch.hub.load('huggingface/transformers', 'modelForSequenceClassification', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
+            model = torch.hub.load('huggingface/transformers', 'modelForSequenceClassification', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+            model = torch.hub.load('huggingface/transformers', 'modelForSequenceClassification', 'bert-base-uncased', output_attention=True)  # Update configuration during loading
            assert model.config.output_attention == True
            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
-            model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+            model = torch.hub.load('huggingface/transformers', 'modelForSequenceClassification', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)

        """

@@ -100,13 +100,13 @@ def modelForQuestionAnswering(*args, **kwargs):
        # Using torch.hub !
        import torch

-        model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
-        model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-        model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', 'bert-base-uncased', output_attention=True)  # Update configuration during loading
+        model = torch.hub.load('huggingface/transformers', 'modelForQuestionAnswering', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
+        model = torch.hub.load('huggingface/transformers', 'modelForQuestionAnswering', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+        model = torch.hub.load('huggingface/transformers', 'modelForQuestionAnswering', 'bert-base-uncased', output_attention=True)  # Update configuration during loading
        assert model.config.output_attention == True
        # Loading from a TF checkpoint file instead of a PyTorch model (slower)
        config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
-        model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+        model = torch.hub.load('huggingface/transformers', 'modelForQuestionAnswering', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)

    """
    return AutoModelForQuestionAnswering.from_pretrained(*args, **kwargs)
--- a/pytorch_transformers/init.py
+++ b/pytorch_transformers/init.py
@@ -1,54 +0,0 @@
-__version__ = "1.2.0"
-from .tokenization_auto import AutoTokenizer
-from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer
-from .tokenization_openai import OpenAIGPTTokenizer
-from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus)
-from .tokenization_gpt2 import GPT2Tokenizer
-from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE
-from .tokenization_xlm import XLMTokenizer
-from .tokenization_roberta import RobertaTokenizer
-from .tokenization_distilbert import DistilBertTokenizer
-
-from .tokenization_utils import (PreTrainedTokenizer)
-
-from .modeling_auto import (AutoConfig, AutoModel, AutoModelForSequenceClassification, AutoModelForQuestionAnswering,
-                            AutoModelWithLMHead)
-
-from .modeling_bert import (BertConfig, BertPreTrainedModel, BertModel, BertForPreTraining,
-                            BertForMaskedLM, BertForNextSentencePrediction,
-                            BertForSequenceClassification, BertForMultipleChoice,
-                            BertForTokenClassification, BertForQuestionAnswering,
-                            load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-                            BERT_PRETRAINED_CONFIG_ARCHIVE_MAP)
-from .modeling_openai import (OpenAIGPTConfig, OpenAIGPTPreTrainedModel, OpenAIGPTModel,
-                              OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel,
-                              load_tf_weights_in_openai_gpt, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP,
-                              OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP)
-from .modeling_transfo_xl import (TransfoXLConfig, TransfoXLPreTrainedModel, TransfoXLModel, TransfoXLLMHeadModel,
-                                  load_tf_weights_in_transfo_xl, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP,
-                                  TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP)
-from .modeling_gpt2 import (GPT2Config, GPT2PreTrainedModel, GPT2Model,
-                            GPT2LMHeadModel, GPT2DoubleHeadsModel,
-                            load_tf_weights_in_gpt2, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,
-                            GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
-from .modeling_xlnet import (XLNetConfig,
-                             XLNetPreTrainedModel, XLNetModel, XLNetLMHeadModel,
-                             XLNetForSequenceClassification, XLNetForQuestionAnswering,
-                             load_tf_weights_in_xlnet, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
-                             XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
-from .modeling_xlm import (XLMConfig, XLMPreTrainedModel , XLMModel,
-                           XLMWithLMHeadModel, XLMForSequenceClassification,
-                           XLMForQuestionAnswering, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP,
-                           XLM_PRETRAINED_MODEL_ARCHIVE_MAP)
-from .modeling_roberta import (RobertaConfig, RobertaForMaskedLM, RobertaModel, RobertaForSequenceClassification,
-                               ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
-from .modeling_distilbert import (DistilBertConfig, DistilBertForMaskedLM, DistilBertModel,
-                               DistilBertForSequenceClassification, DistilBertForQuestionAnswering,
-                               DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
-from .modeling_utils import (WEIGHTS_NAME, CONFIG_NAME, TF_WEIGHTS_NAME,
-                          PretrainedConfig, PreTrainedModel, prune_layer, Conv1D)
-
-from .optimization import (AdamW, ConstantLRSchedule, WarmupConstantSchedule, WarmupCosineSchedule,
-                           WarmupCosineWithHardRestartsSchedule, WarmupLinearSchedule)
-
-from .file_utils import (PYTORCH_TRANSFORMERS_CACHE, PYTORCH_PRETRAINED_BERT_CACHE, cached_path)
--- a/pytorch_transformers/tests/tokenization_tests_commons.py
+++ b/pytorch_transformers/tests/tokenization_tests_commons.py
@@ -1,172 +0,0 @@
-# coding=utf-8
-# Copyright 2019 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import absolute_import, division, print_function, unicode_literals
-
-import os
-import sys
-from io import open
-import tempfile
-import shutil
-import unittest
-
-if sys.version_info[0] == 2:
-    import cPickle as pickle
-
-    class TemporaryDirectory(object):
-        """Context manager for tempfile.mkdtemp() so it's usable with "with" statement."""
-        def __enter__(self):
-            self.name = tempfile.mkdtemp()
-            return self.name
-        def __exit__(self, exc_type, exc_value, traceback):
-            shutil.rmtree(self.name)
-else:
-    import pickle
-    TemporaryDirectory = tempfile.TemporaryDirectory
-    unicode = str
-
-
-class CommonTestCases:
-
-    class CommonTokenizerTester(unittest.TestCase):
-
-        tokenizer_class = None
-
-        def setUp(self):
-            self.tmpdirname = tempfile.mkdtemp()
-
-        def tearDown(self):
-            shutil.rmtree(self.tmpdirname)
-
-        def get_tokenizer(self, **kwargs):
-            raise NotImplementedError
-
-        def get_input_output_texts(self):
-            raise NotImplementedError
-
-        def test_save_and_load_tokenizer(self):
-            # safety check on max_len default value so we are sure the test works
-            tokenizer = self.get_tokenizer()
-            self.assertNotEqual(tokenizer.max_len, 42)
-
-            # Now let's start the test
-            tokenizer = self.get_tokenizer(max_len=42)
-
-            before_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running")
-
-            with TemporaryDirectory() as tmpdirname:
-                tokenizer.save_pretrained(tmpdirname)
-                tokenizer = self.tokenizer_class.from_pretrained(tmpdirname)
-
-                after_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running")
-                self.assertListEqual(before_tokens, after_tokens)
-
-                self.assertEqual(tokenizer.max_len, 42)
-                tokenizer = self.tokenizer_class.from_pretrained(tmpdirname, max_len=43)
-                self.assertEqual(tokenizer.max_len, 43)
-
-        def test_pickle_tokenizer(self):
-            tokenizer = self.get_tokenizer()
-            self.assertIsNotNone(tokenizer)
-
-            text = u"Munich and Berlin are nice cities"
-            subwords = tokenizer.tokenize(text)
-
-            with TemporaryDirectory() as tmpdirname:
-
-                filename = os.path.join(tmpdirname, u"tokenizer.bin")
-                pickle.dump(tokenizer, open(filename, "wb"))
-
-                tokenizer_new = pickle.load(open(filename, "rb"))
-
-            subwords_loaded = tokenizer_new.tokenize(text)
-
-            self.assertListEqual(subwords, subwords_loaded)
-
-
-        def test_add_tokens_tokenizer(self):
-            tokenizer = self.get_tokenizer()
-
-            vocab_size = tokenizer.vocab_size
-            all_size = len(tokenizer)
-
-            self.assertNotEqual(vocab_size, 0)
-            self.assertEqual(vocab_size, all_size)
-
-            new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd"]
-            added_toks = tokenizer.add_tokens(new_toks)
-            vocab_size_2 = tokenizer.vocab_size
-            all_size_2 = len(tokenizer)
-
-            self.assertNotEqual(vocab_size_2, 0)
-            self.assertEqual(vocab_size, vocab_size_2)
-            self.assertEqual(added_toks, len(new_toks))
-            self.assertEqual(all_size_2, all_size + len(new_toks))
-
-            tokens = tokenizer.encode("aaaaa bbbbbb low cccccccccdddddddd l")
-            out_string = tokenizer.decode(tokens)
-
-            self.assertGreaterEqual(len(tokens), 4)
-            self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
-            self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
-
-            new_toks_2 = {'eos_token': ">>>>|||<||<<|<<",
-                          'pad_token': "<<<<<|||>|>>>>|>"}
-            added_toks_2 = tokenizer.add_special_tokens(new_toks_2)
-            vocab_size_3 = tokenizer.vocab_size
-            all_size_3 = len(tokenizer)
-
-            self.assertNotEqual(vocab_size_3, 0)
-            self.assertEqual(vocab_size, vocab_size_3)
-            self.assertEqual(added_toks_2, len(new_toks_2))
-            self.assertEqual(all_size_3, all_size_2 + len(new_toks_2))
-
-            tokens = tokenizer.encode(">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l")
-            out_string = tokenizer.decode(tokens)
-
-            self.assertGreaterEqual(len(tokens), 6)
-            self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
-            self.assertGreater(tokens[0], tokens[1])
-            self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
-            self.assertGreater(tokens[-2], tokens[-3])
-            self.assertEqual(tokens[0], tokenizer.eos_token_id)
-            self.assertEqual(tokens[-2], tokenizer.pad_token_id)
-
-
-        def test_required_methods_tokenizer(self):
-            tokenizer = self.get_tokenizer()
-            input_text, output_text = self.get_input_output_texts()
-
-            tokens = tokenizer.tokenize(input_text)
-            ids = tokenizer.convert_tokens_to_ids(tokens)
-            ids_2 = tokenizer.encode(input_text)
-            self.assertListEqual(ids, ids_2)
-
-            tokens_2 = tokenizer.convert_ids_to_tokens(ids)
-            text_2 = tokenizer.decode(ids)
-
-            self.assertEqual(text_2, output_text)
-
-            self.assertNotEqual(len(tokens_2), 0)
-            self.assertIsInstance(text_2, (str, unicode))
-
-
-        def test_pretrained_model_lists(self):
-            weights_list = list(self.tokenizer_class.max_model_input_sizes.keys())
-            weights_lists_2 = []
-            for file_id, map_list in self.tokenizer_class.pretrained_vocab_files_map.items():
-                weights_lists_2.append(list(map_list.keys()))
-
-            for weights_list_2 in weights_lists_2:
-                self.assertListEqual(weights_list, weights_list_2)
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -0,0 +1,48 @@
+absl-py==0.8.0
+astor==0.8.0
+atomicwrites==1.3.0
+attrs==19.2.0
+boto3==1.9.243
+botocore==1.12.243
+certifi==2019.9.11
+chardet==3.0.4
+Click==7.0
+docutils==0.15.2
+gast==0.2.2
+google-pasta==0.1.7
+grpcio==1.24.1
+h5py==2.10.0
+idna==2.8
+importlib-metadata==0.23
+jmespath==0.9.4
+joblib==0.14.0
+Keras-Applications==1.0.8
+Keras-Preprocessing==1.1.0
+Markdown==3.1.1
+more-itertools==7.2.0
+numpy==1.17.2
+opt-einsum==3.1.0
+packaging==19.2
+pluggy==0.13.0
+protobuf==3.10.0
+py==1.8.0
+pyparsing==2.4.2
+pytest==5.2.1
+python-dateutil==2.8.0
+regex==2019.8.19
+requests==2.22.0
+s3transfer==0.2.1
+sacremoses==0.0.35
+sentencepiece==0.1.83
+six==1.12.0
+tensorboard==2.0.0
+tensorflow==2.0.0
+tensorflow-estimator==2.0.0
+termcolor==1.1.0
+torch==1.2.0
+tqdm==4.36.1
+urllib3==1.25.6
+wcwidth==0.1.7
+Werkzeug==0.16.0
+wrapt==1.11.2
+zipp==0.6.0
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,3 @@
-# PyTorch
-torch>=1.0.0
 # progress bars in model download and training scripts
 tqdm
 # Accessing files from S3 directly.
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@ Simple check list from AllenNLP repo: https://github.com/allenai/allennlp/blob/m

 To create the package for pypi.

-1. Change the version in __init__.py and setup.py.
+1. Change the version in __init__.py, setup.py as well as docs/source/conf.py.

 2. Commit these changes with the message: "Release: VERSION"

@@ -13,11 +13,11 @@ To create the package for pypi.
 4. Build both the sources and the wheel. Do not change anything in setup.py between
   creating the wheel and the source distribution (obviously).

-   For the wheel, run: "python setup.py bdist_wheel" in the top level allennlp directory.
+   For the wheel, run: "python setup.py bdist_wheel" in the top level directory.
   (this will build a wheel for the python version you use to build it - make sure you use python 3.x).

   For the sources, run: "python setup.py sdist"
-   You should now have a /dist directory with both .whl and .tar.gz source versions of allennlp.
+   You should now have a /dist directory with both .whl and .tar.gz source versions.

 5. Check that everything looks correct by uploading the package to the pypi test server:

@@ -25,7 +25,7 @@ To create the package for pypi.
   (pypi suggest using twine as other methods upload files via plaintext.)

   Check that you can install it in a virtualenv by running:
-   pip install -i https://testpypi.python.org/pypi pytorch-transformers
+   pip install -i https://testpypi.python.org/pypi transformers

 6. Upload the final version to actual pypi:
   twine upload dist/* -r pypi
@@ -37,20 +37,19 @@ from io import open
 from setuptools import find_packages, setup

 setup(
-    name="pytorch_transformers",
-    version="1.2.0",
-    author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Google AI Language Team Authors, Open AI team Authors",
+    name="transformers",
+    version="2.1.0",
+    author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Google AI Language Team Authors, Open AI team Authors, Facebook AI Authors, Carnegie Mellon University Authors",
    author_email="thomas@huggingface.co",
-    description="Repository of pre-trained NLP Transformer models: BERT & RoBERTa, GPT & GPT-2, Transformer-XL, XLNet and XLM",
+    description="State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch",
    long_description=open("README.md", "r", encoding='utf-8').read(),
    long_description_content_type="text/markdown",
-    keywords='NLP deep learning transformer pytorch BERT GPT GPT-2 google openai CMU',
+    keywords='NLP deep learning transformer pytorch tensorflow BERT GPT GPT-2 google openai CMU',
    license='Apache',
-    url="https://github.com/huggingface/pytorch-transformers",
+    url="https://github.com/huggingface/transformers",
    packages=find_packages(exclude=["*.tests", "*.tests.*",
                                    "tests.*", "tests"]),
-    install_requires=['torch>=1.0.0',
-                      'numpy',
+    install_requires=['numpy',
                      'boto3',
                      'requests',
                      'tqdm',
@@ -59,7 +58,7 @@ setup(
                      'sacremoses'],
    entry_points={
      'console_scripts': [
-        "pytorch_transformers=pytorch_transformers.__main__:main",
+        "transformers=transformers.__main__:main",
      ]
    },
    # python_requires='>=3.5.0',
--- a/transformers/init.py
+++ b/transformers/init.py
@@ -0,0 +1,176 @@
+__version__ = "2.1.0"
+
+# Work around to update TensorFlow's absl.logging threshold which alters the
+# default Python logging output behavior when present.
+# see: https://github.com/abseil/abseil-py/issues/99
+# and: https://github.com/tensorflow/tensorflow/issues/26691#issuecomment-500369493
+try:
+    import absl.logging
+    absl.logging.set_verbosity('info')
+    absl.logging.set_stderrthreshold('info')
+    absl.logging._warn_preinit_stderr = False
+except:
+    pass
+
+import logging
+
+logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
+
+# Files and general utilities
+from .file_utils import (TRANSFORMERS_CACHE, PYTORCH_TRANSFORMERS_CACHE, PYTORCH_PRETRAINED_BERT_CACHE,
+                         cached_path, add_start_docstrings, add_end_docstrings,
+                         WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, CONFIG_NAME,
+                         is_tf_available, is_torch_available)
+
+from .data import (is_sklearn_available,
+                   InputExample, InputFeatures, DataProcessor,
+                   glue_output_modes, glue_convert_examples_to_features,
+                   glue_processors, glue_tasks_num_labels)
+
+if is_sklearn_available():
+    from .data import glue_compute_metrics
+
+# Tokenizers
+from .tokenization_utils import (PreTrainedTokenizer)
+from .tokenization_auto import AutoTokenizer
+from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer
+from .tokenization_openai import OpenAIGPTTokenizer
+from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus)
+from .tokenization_gpt2 import GPT2Tokenizer
+from .tokenization_ctrl import CTRLTokenizer
+from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE
+from .tokenization_xlm import XLMTokenizer
+from .tokenization_roberta import RobertaTokenizer
+from .tokenization_distilbert import DistilBertTokenizer
+
+# Configurations
+from .configuration_utils import PretrainedConfig
+from .configuration_auto import AutoConfig
+from .configuration_bert import BertConfig, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_openai import OpenAIGPTConfig, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_transfo_xl import TransfoXLConfig, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_gpt2 import GPT2Config, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_ctrl import CTRLConfig, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_xlnet import XLNetConfig, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_ctrl import CTRLConfig, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_xlm import XLMConfig, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_roberta import RobertaConfig, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
+
+# Modeling
+if is_torch_available():
+    from .modeling_utils import (PreTrainedModel, prune_layer, Conv1D)
+    from .modeling_auto import (AutoModel, AutoModelForSequenceClassification, AutoModelForQuestionAnswering,
+                                AutoModelWithLMHead)
+
+    from .modeling_bert import (BertPreTrainedModel, BertModel, BertForPreTraining,
+                                BertForMaskedLM, BertForNextSentencePrediction,
+                                BertForSequenceClassification, BertForMultipleChoice,
+                                BertForTokenClassification, BertForQuestionAnswering,
+                                load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP)
+    from .modeling_openai import (OpenAIGPTPreTrainedModel, OpenAIGPTModel,
+                                OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel,
+                                load_tf_weights_in_openai_gpt, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP)
+    from .modeling_transfo_xl import (TransfoXLPreTrainedModel, TransfoXLModel, TransfoXLLMHeadModel,
+                                    load_tf_weights_in_transfo_xl, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP)
+    from .modeling_gpt2 import (GPT2PreTrainedModel, GPT2Model,
+                                GPT2LMHeadModel, GPT2DoubleHeadsModel,
+                                load_tf_weights_in_gpt2, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
+    from .modeling_ctrl import (CTRLPreTrainedModel, CTRLModel,
+                                CTRLLMHeadModel,
+                                CTRL_PRETRAINED_MODEL_ARCHIVE_MAP)
+    from .modeling_xlnet import (XLNetPreTrainedModel, XLNetModel, XLNetLMHeadModel,
+                                XLNetForSequenceClassification, XLNetForMultipleChoice,
+                                XLNetForQuestionAnsweringSimple, XLNetForQuestionAnswering,
+                                load_tf_weights_in_xlnet, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
+    from .modeling_xlm import (XLMPreTrainedModel , XLMModel,
+                            XLMWithLMHeadModel, XLMForSequenceClassification,
+                            XLMForQuestionAnswering, XLMForQuestionAnsweringSimple,
+                            XLM_PRETRAINED_MODEL_ARCHIVE_MAP)
+    from .modeling_roberta import (RobertaForMaskedLM, RobertaModel,
+                                RobertaForSequenceClassification, RobertaForMultipleChoice,
+                                ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
+    from .modeling_distilbert import (DistilBertForMaskedLM, DistilBertModel,
+                                DistilBertForSequenceClassification, DistilBertForQuestionAnswering,
+                                DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
+
+    # Optimization
+    from .optimization import (AdamW, ConstantLRSchedule, WarmupConstantSchedule, WarmupCosineSchedule,
+                               WarmupCosineWithHardRestartsSchedule, WarmupLinearSchedule)
+
+
+# TensorFlow
+if is_tf_available():
+    from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary
+    from .modeling_tf_auto import (TFAutoModel, TFAutoModelForSequenceClassification, TFAutoModelForQuestionAnswering,
+                                   TFAutoModelWithLMHead)
+
+    from .modeling_tf_bert import (TFBertPreTrainedModel, TFBertMainLayer, TFBertEmbeddings,
+                                   TFBertModel, TFBertForPreTraining,
+                                   TFBertForMaskedLM, TFBertForNextSentencePrediction,
+                                   TFBertForSequenceClassification, TFBertForMultipleChoice,
+                                   TFBertForTokenClassification, TFBertForQuestionAnswering,
+                                   load_bert_pt_weights_in_tf2,
+                                   TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP)
+
+    from .modeling_tf_gpt2 import (TFGPT2PreTrainedModel, TFGPT2MainLayer,
+                                   TFGPT2Model, TFGPT2LMHeadModel, TFGPT2DoubleHeadsModel,
+                                   load_gpt2_pt_weights_in_tf2,
+                                   TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
+
+    from .modeling_tf_openai import (TFOpenAIGPTPreTrainedModel, TFOpenAIGPTMainLayer,
+                                     TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel, TFOpenAIGPTDoubleHeadsModel,
+                                     load_openai_gpt_pt_weights_in_tf2,
+                                     TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP)
+
+    from .modeling_tf_transfo_xl import (TFTransfoXLPreTrainedModel, TFTransfoXLMainLayer,
+                                         TFTransfoXLModel, TFTransfoXLLMHeadModel,
+                                         load_transfo_xl_pt_weights_in_tf2,
+                                         TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP)
+
+    from .modeling_tf_xlnet import (TFXLNetPreTrainedModel, TFXLNetMainLayer,
+                                    TFXLNetModel, TFXLNetLMHeadModel,
+                                    TFXLNetForSequenceClassification,
+                                    TFXLNetForQuestionAnsweringSimple,
+                                    load_xlnet_pt_weights_in_tf2,
+                                    TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
+
+    from .modeling_tf_xlm import (TFXLMPreTrainedModel, TFXLMMainLayer,
+                                  TFXLMModel, TFXLMWithLMHeadModel,
+                                  TFXLMForSequenceClassification,
+                                  TFXLMForQuestionAnsweringSimple,
+                                  load_xlm_pt_weights_in_tf2,
+                                  TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP)
+
+    from .modeling_tf_roberta import (TFRobertaPreTrainedModel, TFRobertaMainLayer,
+                                      TFRobertaModel, TFRobertaForMaskedLM,
+                                      TFRobertaForSequenceClassification,
+                                      load_roberta_pt_weights_in_tf2,
+                                      TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
+
+    from .modeling_tf_distilbert import (TFDistilBertPreTrainedModel, TFDistilBertMainLayer,
+                                         TFDistilBertModel, TFDistilBertForMaskedLM,
+                                         TFDistilBertForSequenceClassification,
+                                         TFDistilBertForQuestionAnswering,
+                                         load_distilbert_pt_weights_in_tf2,
+                                         TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
+
+    from .modeling_tf_ctrl import (TFCTRLPreTrainedModel, TFCTRLModel,
+                                    TFCTRLLMHeadModel,
+                                    load_ctrl_pt_weights_in_tf2,
+                                    TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP)
+
+# TF 2.0 <=> PyTorch conversion utilities
+if is_tf_available() and is_torch_available():
+    from .modeling_tf_pytorch_utils import (convert_tf_weight_name_to_pt_weight_name,
+                                            load_pytorch_checkpoint_in_tf2_model,
+                                            load_pytorch_weights_in_tf2_model,
+                                            load_pytorch_model_in_tf2_model,
+                                            load_tf2_checkpoint_in_pytorch_model,
+                                            load_tf2_weights_in_pytorch_model,
+                                            load_tf2_model_in_pytorch_model)
+
+if not is_tf_available() and not is_torch_available():
+    logger.warning("Neither PyTorch nor TensorFlow >= 2.0 have been found."
+                   "Models won't be available and only tokenizers, configuration"
+                   "and file/data utilities can be used.")
--- a/pytorch_transformers/main.py
+++ b/pytorch_transformers/main.py
@@ -3,36 +3,37 @@ def main():
    import sys
    if (len(sys.argv) < 4 or len(sys.argv) > 6) or sys.argv[1] not in ["bert", "gpt", "transfo_xl", "gpt2", "xlnet", "xlm"]:
        print(
-        "Should be used as one of: \n"
-        ">> pytorch_transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT, \n"
-        ">> pytorch_transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG], \n"
-        ">> pytorch_transformers transfo_xl TF_CHECKPOINT_OR_DATASET PYTORCH_DUMP_OUTPUT [TF_CONFIG] or \n"
-        ">> pytorch_transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [GPT2_CONFIG] or \n"
-        ">> pytorch_transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME] or \n"
-        ">> pytorch_transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT")
+        "This command line utility let you convert original (author released) model checkpoint to pytorch.\n"
+        "It should be used as one of: \n"
+        ">> transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT, \n"
+        ">> transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG], \n"
+        ">> transformers transfo_xl TF_CHECKPOINT_OR_DATASET PYTORCH_DUMP_OUTPUT [TF_CONFIG] or \n"
+        ">> transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [GPT2_CONFIG] or \n"
+        ">> transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME] or \n"
+        ">> transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT")
    else:
        if sys.argv[1] == "bert":
            try:
-                from .convert_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch
+                from .convert_bert_original_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch
            except ImportError:
-                print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
+                print("transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
                    "In that case, it requires TensorFlow to be installed. Please see "
                    "https://www.tensorflow.org/install/ for installation instructions.")
                raise

            if len(sys.argv) != 5:
                # pylint: disable=line-too-long
-                print("Should be used as `pytorch_transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`")
+                print("Should be used as `transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`")
            else:
                PYTORCH_DUMP_OUTPUT = sys.argv.pop()
                TF_CONFIG = sys.argv.pop()
                TF_CHECKPOINT = sys.argv.pop()
                convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT)
        elif sys.argv[1] == "gpt":
-            from .convert_openai_checkpoint_to_pytorch import convert_openai_checkpoint_to_pytorch
+            from .convert_openai_original_tf_checkpoint_to_pytorch import convert_openai_checkpoint_to_pytorch
            if len(sys.argv) < 4 or len(sys.argv) > 5:
                # pylint: disable=line-too-long
-                print("Should be used as `pytorch_transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`")
+                print("Should be used as `transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`")
            else:
                OPENAI_GPT_CHECKPOINT_FOLDER_PATH = sys.argv[2]
                PYTORCH_DUMP_OUTPUT = sys.argv[3]
@@ -45,15 +46,15 @@ def main():
                                                    PYTORCH_DUMP_OUTPUT)
        elif sys.argv[1] == "transfo_xl":
            try:
-                from .convert_transfo_xl_checkpoint_to_pytorch import convert_transfo_xl_checkpoint_to_pytorch
+                from .convert_transfo_xl_original_tf_checkpoint_to_pytorch import convert_transfo_xl_checkpoint_to_pytorch
            except ImportError:
-                print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
+                print("transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
                    "In that case, it requires TensorFlow to be installed. Please see "
                    "https://www.tensorflow.org/install/ for installation instructions.")
                raise
            if len(sys.argv) < 4 or len(sys.argv) > 5:
                # pylint: disable=line-too-long
-                print("Should be used as `pytorch_transformers transfo_xl TF_CHECKPOINT/TF_DATASET_FILE PYTORCH_DUMP_OUTPUT [TF_CONFIG]`")
+                print("Should be used as `transformers transfo_xl TF_CHECKPOINT/TF_DATASET_FILE PYTORCH_DUMP_OUTPUT [TF_CONFIG]`")
            else:
                if 'ckpt' in sys.argv[2].lower():
                    TF_CHECKPOINT = sys.argv[2]
@@ -69,16 +70,16 @@ def main():
                convert_transfo_xl_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT, TF_DATASET_FILE)
        elif sys.argv[1] == "gpt2":
            try:
-                from .convert_gpt2_checkpoint_to_pytorch import convert_gpt2_checkpoint_to_pytorch
+                from .convert_gpt2_original_tf_checkpoint_to_pytorch import convert_gpt2_checkpoint_to_pytorch
            except ImportError:
-                print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
+                print("transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
                    "In that case, it requires TensorFlow to be installed. Please see "
                    "https://www.tensorflow.org/install/ for installation instructions.")
                raise

            if len(sys.argv) < 4 or len(sys.argv) > 5:
                # pylint: disable=line-too-long
-                print("Should be used as `pytorch_transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [TF_CONFIG]`")
+                print("Should be used as `transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [TF_CONFIG]`")
            else:
                TF_CHECKPOINT = sys.argv[2]
                PYTORCH_DUMP_OUTPUT = sys.argv[3]
@@ -89,16 +90,16 @@ def main():
                convert_gpt2_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT)
        elif sys.argv[1] == "xlnet":
            try:
-                from .convert_xlnet_checkpoint_to_pytorch import convert_xlnet_checkpoint_to_pytorch
+                from .convert_xlnet_original_tf_checkpoint_to_pytorch import convert_xlnet_checkpoint_to_pytorch
            except ImportError:
-                print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
+                print("transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
                    "In that case, it requires TensorFlow to be installed. Please see "
                    "https://www.tensorflow.org/install/ for installation instructions.")
                raise

            if len(sys.argv) < 5 or len(sys.argv) > 6:
                # pylint: disable=line-too-long
-                print("Should be used as `pytorch_transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME]`")
+                print("Should be used as `transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME]`")
            else:
                TF_CHECKPOINT = sys.argv[2]
                TF_CONFIG = sys.argv[3]
@@ -113,11 +114,11 @@ def main():
                                                    PYTORCH_DUMP_OUTPUT,
                                                    FINETUNING_TASK)
        elif sys.argv[1] == "xlm":
-            from .convert_xlm_checkpoint_to_pytorch import convert_xlm_checkpoint_to_pytorch
+            from .convert_xlm_original_pytorch_checkpoint_to_pytorch import convert_xlm_checkpoint_to_pytorch

            if len(sys.argv) != 4:
                # pylint: disable=line-too-long
-                print("Should be used as `pytorch_transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT`")
+                print("Should be used as `transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT`")
            else:
                XLM_CHECKPOINT_PATH = sys.argv[2]
                PYTORCH_DUMP_OUTPUT = sys.argv[3]
--- a/transformers/configuration_auto.py
+++ b/transformers/configuration_auto.py
@@ -0,0 +1,137 @@
+# coding=utf-8
+# Copyright 2018 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Auto Model class. """
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import logging
+
+from .configuration_bert import BertConfig
+from .configuration_openai import OpenAIGPTConfig
+from .configuration_gpt2 import GPT2Config
+from .configuration_transfo_xl import TransfoXLConfig
+from .configuration_xlnet import XLNetConfig
+from .configuration_xlm import XLMConfig
+from .configuration_roberta import RobertaConfig
+from .configuration_distilbert import DistilBertConfig
+from .configuration_ctrl import CTRLConfig
+
+logger = logging.getLogger(__name__)
+
+
+class AutoConfig(object):
+    r""":class:`~transformers.AutoConfig` is a generic configuration class
+        that will be instantiated as one of the configuration classes of the library
+        when created with the `AutoConfig.from_pretrained(pretrained_model_name_or_path)`
+        class method.
+
+        The `from_pretrained()` method take care of returning the correct model class instance
+        using pattern matching on the `pretrained_model_name_or_path` string.
+
+        The base model class to instantiate is selected as the first pattern matching
+        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `distilbert`: DistilBertConfig (DistilBERT model)
+            - contains `bert`: BertConfig (Bert model)
+            - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
+            - contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
+            - contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
+            - contains `xlnet`: XLNetConfig (XLNet model)
+            - contains `xlm`: XLMConfig (XLM model)
+            - contains `roberta`: RobertaConfig (RoBERTa model)
+            - contains `ctrl` : CTRLConfig (CTRL model)
+        This class cannot be instantiated using `__init__()` (throw an error).
+    """
+    def __init__(self):
+        raise EnvironmentError("AutoConfig is designed to be instantiated "
+            "using the `AutoConfig.from_pretrained(pretrained_model_name_or_path)` method.")
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+        r""" Instantiate a one of the configuration classes of the library
+        from a pre-trained model configuration.
+
+        The configuration class to instantiate is selected as the first pattern matching
+        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `distilbert`: DistilBertConfig (DistilBERT model)
+            - contains `bert`: BertConfig (Bert model)
+            - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
+            - contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
+            - contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
+            - contains `xlnet`: XLNetConfig (XLNet model)
+            - contains `xlm`: XLMConfig (XLM model)
+            - contains `roberta`: RobertaConfig (RoBERTa model)
+            - contains `ctrl` : CTRLConfig (CTRL model)
+        Params:
+            pretrained_model_name_or_path: either:
+
+                - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
+                - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
+
+            cache_dir: (`optional`) string:
+                Path to a directory in which a downloaded pre-trained model
+                configuration should be cached if the standard cache should not be used.
+
+            kwargs: (`optional`) dict: key/value pairs with which to update the configuration object after loading.
+
+                - The values in kwargs of any keys which are configuration attributes will be used to override the loaded values.
+                - Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter.
+
+            force_download: (`optional`) boolean, default False:
+                Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
+
+            proxies: (`optional`) dict, default None:
+                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
+                The proxies are used on each request.
+
+            return_unused_kwargs: (`optional`) bool:
+
+                - If False, then this function returns just the final configuration object.
+                - If True, then this functions returns a tuple `(config, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: ie the part of kwargs which has not been used to update `config` and is otherwise ignored.
+
+        Examples::
+
+            config = AutoConfig.from_pretrained('bert-base-uncased')    # Download configuration from S3 and cache.
+            config = AutoConfig.from_pretrained('./test/bert_saved_model/')  # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
+            config = AutoConfig.from_pretrained('./test/bert_saved_model/my_configuration.json')
+            config = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False)
+            assert config.output_attention == True
+            config, unused_kwargs = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True,
+                                                               foo=False, return_unused_kwargs=True)
+            assert config.output_attention == True
+            assert unused_kwargs == {'foo': False}
+
+        """
+        if 'distilbert' in pretrained_model_name_or_path:
+            return DistilBertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        elif 'roberta' in pretrained_model_name_or_path:
+            return RobertaConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        elif 'bert' in pretrained_model_name_or_path:
+            return BertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        elif 'openai-gpt' in pretrained_model_name_or_path:
+            return OpenAIGPTConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        elif 'gpt2' in pretrained_model_name_or_path:
+            return GPT2Config.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        elif 'transfo-xl' in pretrained_model_name_or_path:
+            return TransfoXLConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        elif 'xlnet' in pretrained_model_name_or_path:
+            return XLNetConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        elif 'xlm' in pretrained_model_name_or_path:
+            return XLMConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        elif 'ctrl' in pretrained_model_name_or_path:
+            return CTRLConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        raise ValueError("Unrecognized model identifier in {}. Should contains one of "
+                         "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
+                         "'xlm', 'roberta', 'ctrl'".format(pretrained_model_name_or_path))
--- a/transformers/configuration_bert.py
+++ b/transformers/configuration_bert.py
@@ -0,0 +1,113 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" BERT model configuration """
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import json
+import logging
+import sys
+from io import open
+
+from .configuration_utils import PretrainedConfig
+
+logger = logging.getLogger(__name__)
+
+BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json",
+    'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json",
+    'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json",
+    'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-config.json",
+    'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-config.json",
+    'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.json",
+    'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json",
+    'bert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-config.json",
+    'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-config.json",
+    'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-config.json",
+    'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-config.json",
+    'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-config.json",
+    'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json",
+}
+
+
+class BertConfig(PretrainedConfig):
+    r"""
+        :class:`~transformers.BertConfig` is the configuration class to store the configuration of a
+        `BertModel`.
+
+
+        Arguments:
+            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`.
+            hidden_size: Size of the encoder layers and the pooler layer.
+            num_hidden_layers: Number of hidden layers in the Transformer encoder.
+            num_attention_heads: Number of attention heads for each attention layer in
+                the Transformer encoder.
+            intermediate_size: The size of the "intermediate" (i.e., feed-forward)
+                layer in the Transformer encoder.
+            hidden_act: The non-linear activation function (function or string) in the
+                encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported.
+            hidden_dropout_prob: The dropout probabilitiy for all fully connected
+                layers in the embeddings, encoder, and pooler.
+            attention_probs_dropout_prob: The dropout ratio for the attention
+                probabilities.
+            max_position_embeddings: The maximum sequence length that this model might
+                ever be used with. Typically set this to something large just in case
+                (e.g., 512 or 1024 or 2048).
+            type_vocab_size: The vocabulary size of the `token_type_ids` passed into
+                `BertModel`.
+            initializer_range: The sttdev of the truncated_normal_initializer for
+                initializing all weight matrices.
+            layer_norm_eps: The epsilon used by LayerNorm.
+    """
+    pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
+
+    def __init__(self,
+                 vocab_size_or_config_json_file=30522,
+                 hidden_size=768,
+                 num_hidden_layers=12,
+                 num_attention_heads=12,
+                 intermediate_size=3072,
+                 hidden_act="gelu",
+                 hidden_dropout_prob=0.1,
+                 attention_probs_dropout_prob=0.1,
+                 max_position_embeddings=512,
+                 type_vocab_size=2,
+                 initializer_range=0.02,
+                 layer_norm_eps=1e-12,
+                 **kwargs):
+        super(BertConfig, self).__init__(**kwargs)
+        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
+                        and isinstance(vocab_size_or_config_json_file, unicode)):
+            with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
+                json_config = json.loads(reader.read())
+            for key, value in json_config.items():
+                self.__dict__[key] = value
+        elif isinstance(vocab_size_or_config_json_file, int):
+            self.vocab_size = vocab_size_or_config_json_file
+            self.hidden_size = hidden_size
+            self.num_hidden_layers = num_hidden_layers
+            self.num_attention_heads = num_attention_heads
+            self.hidden_act = hidden_act
+            self.intermediate_size = intermediate_size
+            self.hidden_dropout_prob = hidden_dropout_prob
+            self.attention_probs_dropout_prob = attention_probs_dropout_prob
+            self.max_position_embeddings = max_position_embeddings
+            self.type_vocab_size = type_vocab_size
+            self.initializer_range = initializer_range
+            self.layer_norm_eps = layer_norm_eps
+        else:
+            raise ValueError("First argument must be either a vocabulary size (int)"
+                             " or the path to a pretrained model config file (str)")
--- a/transformers/configuration_ctrl.py
+++ b/transformers/configuration_ctrl.py
@@ -0,0 +1,143 @@
+# coding=utf-8
+# Copyright 2018 Salesforce and HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Salesforce CTRL configuration """
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import json
+import logging
+import sys
+from io import open
+
+from .configuration_utils import PretrainedConfig
+
+logger = logging.getLogger(__name__)
+
+CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP = {"ctrl": "https://storage.googleapis.com/sf-ctrl/pytorch/ctrl-config.json"}
+
+class CTRLConfig(PretrainedConfig):
+    """Configuration class to store the configuration of a `CTRLModel`.
+
+    Args:
+        vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file.
+        n_positions: Number of positional embeddings.
+        n_ctx: Size of the causal mask (usually same as n_positions).
+        dff: Size of the inner dimension of the FFN.
+        n_embd: Dimensionality of the embeddings and hidden states.
+        n_layer: Number of hidden layers in the Transformer encoder.
+        n_head: Number of attention heads for each attention layer in
+            the Transformer encoder.
+        layer_norm_epsilon: epsilon to use in the layer norm layers
+        resid_pdrop: The dropout probabilitiy for all fully connected
+            layers in the embeddings, encoder, and pooler.
+        attn_pdrop: The dropout ratio for the attention
+            probabilities.
+        embd_pdrop: The dropout ratio for the embeddings.
+        initializer_range: The sttdev of the truncated_normal_initializer for
+            initializing all weight matrices.
+    """
+    pretrained_config_archive_map = CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP
+
+    def __init__(
+        self,
+        vocab_size_or_config_json_file=246534,
+        n_positions=256,
+        n_ctx=256,
+        n_embd=1280,
+        dff=8192,
+        n_layer=48,
+        n_head=16,
+        resid_pdrop=0.1,
+        embd_pdrop=0.1,
+        attn_pdrop=0.1,
+        layer_norm_epsilon=1e-6,
+        initializer_range=0.02,
+
+        num_labels=1,
+        summary_type='cls_index',
+        summary_use_proj=True,
+        summary_activation=None,
+        summary_proj_to_labels=True,
+        summary_first_dropout=0.1,
+        **kwargs
+    ):
+        """Constructs CTRLConfig.
+
+        Args:
+            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file.
+            n_positions: Number of positional embeddings.
+            n_ctx: Size of the causal mask (usually same as n_positions).
+            dff: Size of the inner dimension of the FFN.
+            n_embd: Dimensionality of the embeddings and hidden states.
+            n_layer: Number of hidden layers in the Transformer encoder.
+            n_head: Number of attention heads for each attention layer in
+                the Transformer encoder.
+            layer_norm_epsilon: epsilon to use in the layer norm layers
+            resid_pdrop: The dropout probabilitiy for all fully connected
+                layers in the embeddings, encoder, and pooler.
+            attn_pdrop: The dropout ratio for the attention
+                probabilities.
+            embd_pdrop: The dropout ratio for the embeddings.
+            initializer_range: The sttdev of the truncated_normal_initializer for
+                initializing all weight matrices.
+        """
+        super(CTRLConfig, self).__init__(**kwargs)
+
+        self.vocab_size = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, int) else -1
+        self.n_ctx = n_ctx
+        self.n_positions = n_positions
+        self.n_embd = n_embd
+        self.n_layer = n_layer
+        self.n_head = n_head
+        self.dff = dff
+        self.resid_pdrop = resid_pdrop
+        self.embd_pdrop = embd_pdrop
+        self.attn_pdrop = attn_pdrop
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_range = initializer_range
+
+        self.num_labels = num_labels
+        self.summary_type = summary_type
+        self.summary_use_proj = summary_use_proj
+        self.summary_activation = summary_activation
+        self.summary_first_dropout = summary_first_dropout
+        self.summary_proj_to_labels = summary_proj_to_labels
+        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
+                        and isinstance(vocab_size_or_config_json_file, unicode)):
+            with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
+                json_config = json.loads(reader.read())
+            for key, value in json_config.items():
+                self.__dict__[key] = value
+        elif not isinstance(vocab_size_or_config_json_file, int):
+            raise ValueError(
+                "First argument must be either a vocabulary size (int)"
+                "or the path to a pretrained model config file (str)"
+            )
+
+    @property
+    def max_position_embeddings(self):
+        return self.n_positions
+
+    @property
+    def hidden_size(self):
+        return self.n_embd
+
+    @property
+    def num_attention_heads(self):
+        return self.n_head
+
+    @property
+    def num_hidden_layers(self):
+        return self.n_layer
--- a/transformers/configuration_distilbert.py
+++ b/transformers/configuration_distilbert.py
@@ -0,0 +1,89 @@
+# coding=utf-8
+# Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" DistilBERT model configuration """
+from __future__ import (absolute_import, division, print_function,
+                        unicode_literals)
+
+import sys
+import json
+import logging
+from io import open
+
+from .configuration_utils import PretrainedConfig
+
+logger = logging.getLogger(__name__)
+
+DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json",
+    'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-config.json"
+}
+
+
+class DistilBertConfig(PretrainedConfig):
+    pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
+
+    def __init__(self,
+                 vocab_size_or_config_json_file=30522,
+                 max_position_embeddings=512,
+                 sinusoidal_pos_embds=False,
+                 n_layers=6,
+                 n_heads=12,
+                 dim=768,
+                 hidden_dim=4*768,
+                 dropout=0.1,
+                 attention_dropout=0.1,
+                 activation='gelu',
+                 initializer_range=0.02,
+                 tie_weights_=True,
+                 qa_dropout=0.1,
+                 seq_classif_dropout=0.2,
+                 **kwargs):
+        super(DistilBertConfig, self).__init__(**kwargs)
+
+        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
+                        and isinstance(vocab_size_or_config_json_file, unicode)):
+            with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
+                json_config = json.loads(reader.read())
+            for key, value in json_config.items():
+                self.__dict__[key] = value
+        elif isinstance(vocab_size_or_config_json_file, int):
+            self.vocab_size = vocab_size_or_config_json_file
+            self.max_position_embeddings = max_position_embeddings
+            self.sinusoidal_pos_embds = sinusoidal_pos_embds
+            self.n_layers = n_layers
+            self.n_heads = n_heads
+            self.dim = dim
+            self.hidden_dim = hidden_dim
+            self.dropout = dropout
+            self.attention_dropout = attention_dropout
+            self.activation = activation
+            self.initializer_range = initializer_range
+            self.tie_weights_ = tie_weights_
+            self.qa_dropout = qa_dropout
+            self.seq_classif_dropout = seq_classif_dropout
+        else:
+            raise ValueError("First argument must be either a vocabulary size (int)"
+                             " or the path to a pretrained model config file (str)")
+    @property
+    def hidden_size(self):
+        return self.dim
+
+    @property
+    def num_attention_heads(self):
+        return self.n_heads
+
+    @property
+    def num_hidden_layers(self):
+        return self.n_layers
--- a/transformers/configuration_gpt2.py
+++ b/transformers/configuration_gpt2.py
@@ -0,0 +1,144 @@
+# coding=utf-8
+# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" OpenAI GPT-2 configuration """
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import json
+import logging
+import sys
+from io import open
+
+from .configuration_utils import PretrainedConfig
+
+logger = logging.getLogger(__name__)
+
+GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json",
+                                      "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json",
+                                      "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-config.json",
+                                      "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-config.json",}
+
+class GPT2Config(PretrainedConfig):
+    """Configuration class to store the configuration of a `GPT2Model`.
+
+    Args:
+        vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
+        n_positions: Number of positional embeddings.
+        n_ctx: Size of the causal mask (usually same as n_positions).
+        n_embd: Dimensionality of the embeddings and hidden states.
+        n_layer: Number of hidden layers in the Transformer encoder.
+        n_head: Number of attention heads for each attention layer in
+            the Transformer encoder.
+        layer_norm_epsilon: epsilon to use in the layer norm layers
+        resid_pdrop: The dropout probabilitiy for all fully connected
+            layers in the embeddings, encoder, and pooler.
+        attn_pdrop: The dropout ratio for the attention
+            probabilities.
+        embd_pdrop: The dropout ratio for the embeddings.
+        initializer_range: The sttdev of the truncated_normal_initializer for
+            initializing all weight matrices.
+    """
+    pretrained_config_archive_map = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
+
+    def __init__(
+        self,
+        vocab_size_or_config_json_file=50257,
+        n_positions=1024,
+        n_ctx=1024,
+        n_embd=768,
+        n_layer=12,
+        n_head=12,
+        resid_pdrop=0.1,
+        embd_pdrop=0.1,
+        attn_pdrop=0.1,
+        layer_norm_epsilon=1e-5,
+        initializer_range=0.02,
+
+        num_labels=1,
+        summary_type='cls_index',
+        summary_use_proj=True,
+        summary_activation=None,
+        summary_proj_to_labels=True,
+        summary_first_dropout=0.1,
+        **kwargs
+    ):
+        """Constructs GPT2Config.
+
+        Args:
+            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
+            n_positions: Number of positional embeddings.
+            n_ctx: Size of the causal mask (usually same as n_positions).
+            n_embd: Dimensionality of the embeddings and hidden states.
+            n_layer: Number of hidden layers in the Transformer encoder.
+            n_head: Number of attention heads for each attention layer in
+                the Transformer encoder.
+            layer_norm_epsilon: epsilon to use in the layer norm layers
+            resid_pdrop: The dropout probabilitiy for all fully connected
+                layers in the embeddings, encoder, and pooler.
+            attn_pdrop: The dropout ratio for the attention
+                probabilities.
+            embd_pdrop: The dropout ratio for the embeddings.
+            initializer_range: The sttdev of the truncated_normal_initializer for
+                initializing all weight matrices.
+        """
+        super(GPT2Config, self).__init__(**kwargs)
+
+        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
+                        and isinstance(vocab_size_or_config_json_file, unicode)):
+            with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
+                json_config = json.loads(reader.read())
+            for key, value in json_config.items():
+                self.__dict__[key] = value
+        elif isinstance(vocab_size_or_config_json_file, int):
+            self.vocab_size = vocab_size_or_config_json_file
+            self.n_ctx = n_ctx
+            self.n_positions = n_positions
+            self.n_embd = n_embd
+            self.n_layer = n_layer
+            self.n_head = n_head
+            self.resid_pdrop = resid_pdrop
+            self.embd_pdrop = embd_pdrop
+            self.attn_pdrop = attn_pdrop
+            self.layer_norm_epsilon = layer_norm_epsilon
+            self.initializer_range = initializer_range
+
+            self.num_labels = num_labels
+            self.summary_type = summary_type
+            self.summary_use_proj = summary_use_proj
+            self.summary_activation = summary_activation
+            self.summary_first_dropout = summary_first_dropout
+            self.summary_proj_to_labels = summary_proj_to_labels
+        else:
+            raise ValueError(
+                "First argument must be either a vocabulary size (int)"
+                "or the path to a pretrained model config file (str)"
+            )
+
+    @property
+    def max_position_embeddings(self):
+        return self.n_positions
+
+    @property
+    def hidden_size(self):
+        return self.n_embd
+
+    @property
+    def num_attention_heads(self):
+        return self.n_head
+
+    @property
+    def num_hidden_layers(self):
+        return self.n_layer
--- a/transformers/configuration_openai.py
+++ b/transformers/configuration_openai.py
@@ -0,0 +1,134 @@
+# coding=utf-8
+# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" OpenAI GPT configuration """
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import json
+import logging
+import sys
+from io import open
+
+from .configuration_utils import PretrainedConfig
+
+logger = logging.getLogger(__name__)
+
+OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-config.json"
+}
+
+class OpenAIGPTConfig(PretrainedConfig):
+    """
+    Configuration class to store the configuration of a `OpenAIGPTModel`.
+
+    Args:
+        vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file.
+        n_positions: Number of positional embeddings.
+        n_ctx: Size of the causal mask (usually same as n_positions).
+        n_embd: Dimensionality of the embeddings and hidden states.
+        n_layer: Number of hidden layers in the Transformer encoder.
+        n_head: Number of attention heads for each attention layer in
+            the Transformer encoder.
+        afn: The non-linear activation function (function or string) in the
+            encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
+        resid_pdrop: The dropout probabilitiy for all fully connected
+            layers in the embeddings, encoder, and pooler.
+        attn_pdrop: The dropout ratio for the attention
+            probabilities.
+        embd_pdrop: The dropout ratio for the embeddings.
+        layer_norm_epsilon: epsilon to use in the layer norm layers
+        initializer_range: The sttdev of the truncated_normal_initializer for
+            initializing all weight matrices.
+        predict_special_tokens: should we predict special tokens (when the model has a LM head)
+    """
+    pretrained_config_archive_map = OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP
+
+    def __init__(
+        self,
+        vocab_size_or_config_json_file=40478,
+        n_positions=512,
+        n_ctx=512,
+        n_embd=768,
+        n_layer=12,
+        n_head=12,
+        afn="gelu",
+        resid_pdrop=0.1,
+        embd_pdrop=0.1,
+        attn_pdrop=0.1,
+        layer_norm_epsilon=1e-5,
+        initializer_range=0.02,
+        predict_special_tokens=True,
+
+        num_labels=1,
+        summary_type='cls_index',
+        summary_use_proj=True,
+        summary_activation=None,
+        summary_proj_to_labels=True,
+        summary_first_dropout=0.1,
+        **kwargs
+    ):
+        """Constructs OpenAIGPTConfig.
+        """
+        super(OpenAIGPTConfig, self).__init__(**kwargs)
+
+        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
+                        and isinstance(vocab_size_or_config_json_file, unicode)):
+            with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
+                json_config = json.loads(reader.read())
+            for key, value in json_config.items():
+                self.__dict__[key] = value
+        elif isinstance(vocab_size_or_config_json_file, int):
+            self.vocab_size = vocab_size_or_config_json_file
+            self.n_ctx = n_ctx
+            self.n_positions = n_positions
+            self.n_embd = n_embd
+            self.n_layer = n_layer
+            self.n_head = n_head
+            self.afn = afn
+            self.resid_pdrop = resid_pdrop
+            self.embd_pdrop = embd_pdrop
+            self.attn_pdrop = attn_pdrop
+            self.layer_norm_epsilon = layer_norm_epsilon
+            self.initializer_range = initializer_range
+            self.predict_special_tokens = predict_special_tokens
+
+            self.num_labels = num_labels
+            self.summary_type = summary_type
+            self.summary_use_proj = summary_use_proj
+            self.summary_activation = summary_activation
+            self.summary_first_dropout = summary_first_dropout
+            self.summary_proj_to_labels = summary_proj_to_labels
+        else:
+            raise ValueError(
+                "First argument must be either a vocabulary size (int)"
+                "or the path to a pretrained model config file (str)"
+            )
+
+    @property
+    def max_position_embeddings(self):
+        return self.n_positions
+
+    @property
+    def hidden_size(self):
+        return self.n_embd
+
+    @property
+    def num_attention_heads(self):
+        return self.n_head
+
+    @property
+    def num_hidden_layers(self):
+        return self.n_layer
--- a/transformers/configuration_roberta.py
+++ b/transformers/configuration_roberta.py
@@ -0,0 +1,35 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" RoBERTa configuration """
+
+from __future__ import (absolute_import, division, print_function,
+                        unicode_literals)
+
+import logging
+
+from .configuration_bert import BertConfig
+
+logger = logging.getLogger(__name__)
+
+ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json",
+    'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json",
+    'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-config.json",
+}
+
+
+class RobertaConfig(BertConfig):
+    pretrained_config_archive_map = ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
--- a/transformers/configuration_transfo_xl.py
+++ b/transformers/configuration_transfo_xl.py
@@ -0,0 +1,168 @@
+# coding=utf-8
+# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Transformer XL configuration """
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import json
+import logging
+import sys
+from io import open
+
+from .configuration_utils import PretrainedConfig
+
+logger = logging.getLogger(__name__)
+
+TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-config.json",
+}
+
+class TransfoXLConfig(PretrainedConfig):
+    """Configuration class to store the configuration of a `TransfoXLModel`.
+
+        Args:
+            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `TransfoXLModel` or a configuration json file.
+            cutoffs: cutoffs for the adaptive softmax
+            d_model: Dimensionality of the model's hidden states.
+            d_embed: Dimensionality of the embeddings
+            d_head: Dimensionality of the model's heads.
+            div_val: divident value for adapative input and softmax
+            pre_lnorm: apply LayerNorm to the input instead of the output
+            d_inner: Inner dimension in FF
+            n_layer: Number of hidden layers in the Transformer encoder.
+            n_head: Number of attention heads for each attention layer in
+                the Transformer encoder.
+            tgt_len: number of tokens to predict
+            ext_len: length of the extended context
+            mem_len: length of the retained previous heads
+            same_length: use the same attn length for all tokens
+            proj_share_all_but_first: True to share all but first projs, False not to share.
+            attn_type: attention type. 0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al.
+            clamp_len: use the same pos embeddings after clamp_len
+            sample_softmax: number of samples in sampled softmax
+            adaptive: use adaptive softmax
+            tie_weight: tie the word embedding and softmax weights
+            dropout: The dropout probabilitiy for all fully connected
+                layers in the embeddings, encoder, and pooler.
+            dropatt: The dropout ratio for the attention probabilities.
+            untie_r: untie relative position biases
+            embd_pdrop: The dropout ratio for the embeddings.
+            init: parameter initializer to use
+            init_range: parameters initialized by U(-init_range, init_range).
+            proj_init_std: parameters initialized by N(0, init_std)
+            init_std: parameters initialized by N(0, init_std)
+    """
+    pretrained_config_archive_map = TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
+
+    def __init__(self,
+                 vocab_size_or_config_json_file=267735,
+                 cutoffs=[20000, 40000, 200000],
+                 d_model=1024,
+                 d_embed=1024,
+                 n_head=16,
+                 d_head=64,
+                 d_inner=4096,
+                 div_val=4,
+                 pre_lnorm=False,
+                 n_layer=18,
+                 tgt_len=128,
+                 ext_len=0,
+                 mem_len=1600,
+                 clamp_len=1000,
+                 same_length=True,
+                 proj_share_all_but_first=True,
+                 attn_type=0,
+                 sample_softmax=-1,
+                 adaptive=True,
+                 tie_weight=True,
+                 dropout=0.1,
+                 dropatt=0.0,
+                 untie_r=True,
+                 init="normal",
+                 init_range=0.01,
+                 proj_init_std=0.01,
+                 init_std=0.02,
+                 layer_norm_epsilon=1e-5,
+                 **kwargs):
+        """Constructs TransfoXLConfig.
+        """
+        super(TransfoXLConfig, self).__init__(**kwargs)
+        self.n_token = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, int) else -1
+        self.cutoffs = []
+        self.cutoffs.extend(cutoffs)
+        self.tie_weight = tie_weight
+        if proj_share_all_but_first:
+            self.tie_projs = [False] + [True] * len(self.cutoffs)
+        else:
+            self.tie_projs = [False] + [False] * len(self.cutoffs)
+        self.d_model = d_model
+        self.d_embed = d_embed
+        self.d_head = d_head
+        self.d_inner = d_inner
+        self.div_val = div_val
+        self.pre_lnorm = pre_lnorm
+        self.n_layer = n_layer
+        self.n_head = n_head
+        self.tgt_len = tgt_len
+        self.ext_len = ext_len
+        self.mem_len = mem_len
+        self.same_length = same_length
+        self.attn_type = attn_type
+        self.clamp_len = clamp_len
+        self.sample_softmax = sample_softmax
+        self.adaptive = adaptive
+        self.dropout = dropout
+        self.dropatt = dropatt
+        self.untie_r = untie_r
+        self.init = init
+        self.init_range = init_range
+        self.proj_init_std = proj_init_std
+        self.init_std = init_std
+        self.layer_norm_epsilon = layer_norm_epsilon
+
+        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
+                        and isinstance(vocab_size_or_config_json_file, unicode)):
+            with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
+                json_config = json.loads(reader.read())
+            for key, value in json_config.items():
+                self.__dict__[key] = value
+        elif not isinstance(vocab_size_or_config_json_file, int):
+            raise ValueError("First argument must be either a vocabulary size (int)"
+                             " or the path to a pretrained model config file (str)")
+
+    @property
+    def max_position_embeddings(self):
+        return self.tgt_len + self.ext_len + self.mem_len
+
+    @property
+    def vocab_size(self):
+        return self.n_token
+
+    @vocab_size.setter
+    def vocab_size(self, value):
+        self.n_token = value
+
+    @property
+    def hidden_size(self):
+        return self.d_model
+
+    @property
+    def num_attention_heads(self):
+        return self.n_head
+
+    @property
+    def num_hidden_layers(self):
+        return self.n_layer
--- a/transformers/configuration_utils.py
+++ b/transformers/configuration_utils.py
@@ -0,0 +1,207 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Configuration base class and utilities."""
+
+from __future__ import (absolute_import, division, print_function,
+                        unicode_literals)
+
+import copy
+import json
+import logging
+import os
+from io import open
+
+from .file_utils import cached_path, CONFIG_NAME
+
+logger = logging.getLogger(__name__)
+
+class PretrainedConfig(object):
+    r""" Base class for all configuration classes.
+        Handles a few parameters common to all models' configurations as well as methods for loading/downloading/saving configurations.
+
+        Note:
+            A configuration file can be loaded and saved to disk. Loading the configuration file and using this file to initialize a model does **not** load the model weights.
+            It only affects the model's configuration.
+
+        Class attributes (overridden by derived classes):
+            - ``pretrained_config_archive_map``: a python ``dict`` of with `short-cut-names` (string) as keys and `url` (string) of associated pretrained model configurations as values.
+
+        Parameters:
+            ``finetuning_task``: string, default `None`. Name of the task used to fine-tune the model. This can be used when converting from an original (TensorFlow or PyTorch) checkpoint.
+            ``num_labels``: integer, default `2`. Number of classes to use when the model is a classification model (sequences/tokens)
+            ``output_attentions``: boolean, default `False`. Should the model returns attentions weights.
+            ``output_hidden_states``: string, default `False`. Should the model returns all hidden-states.
+            ``torchscript``: string, default `False`. Is the model used with Torchscript.
+    """
+    pretrained_config_archive_map = {}
+
+    def __init__(self, **kwargs):
+        self.finetuning_task = kwargs.pop('finetuning_task', None)
+        self.num_labels = kwargs.pop('num_labels', 2)
+        self.output_attentions = kwargs.pop('output_attentions', False)
+        self.output_hidden_states = kwargs.pop('output_hidden_states', False)
+        self.torchscript = kwargs.pop('torchscript', False)
+        self.use_bfloat16 = kwargs.pop('use_bfloat16', False)
+        self.pruned_heads = kwargs.pop('pruned_heads', {})
+
+    def save_pretrained(self, save_directory):
+        """ Save a configuration object to the directory `save_directory`, so that it
+            can be re-loaded using the :func:`~transformers.PretrainedConfig.from_pretrained` class method.
+        """
+        assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved"
+
+        # If we save using the predefined names, we can load using `from_pretrained`
+        output_config_file = os.path.join(save_directory, CONFIG_NAME)
+
+        self.to_json_file(output_config_file)
+        logger.info("Configuration saved in {}".format(output_config_file))
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+        r""" Instantiate a :class:`~transformers.PretrainedConfig` (or a derived class) from a pre-trained model configuration.
+
+        Parameters:
+            pretrained_model_name_or_path: either:
+
+                - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
+                - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
+
+            cache_dir: (`optional`) string:
+                Path to a directory in which a downloaded pre-trained model
+                configuration should be cached if the standard cache should not be used.
+
+            kwargs: (`optional`) dict: key/value pairs with which to update the configuration object after loading.
+
+                - The values in kwargs of any keys which are configuration attributes will be used to override the loaded values.
+                - Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter.
+
+            force_download: (`optional`) boolean, default False:
+                Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
+
+            proxies: (`optional`) dict, default None:
+                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
+                The proxies are used on each request.
+
+            return_unused_kwargs: (`optional`) bool:
+
+                - If False, then this function returns just the final configuration object.
+                - If True, then this functions returns a tuple `(config, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: ie the part of kwargs which has not been used to update `config` and is otherwise ignored.
+
+        Examples::
+
+            # We can't instantiate directly the base class `PretrainedConfig` so let's show the examples on a
+            # derived class: BertConfig
+            config = BertConfig.from_pretrained('bert-base-uncased')    # Download configuration from S3 and cache.
+            config = BertConfig.from_pretrained('./test/saved_model/')  # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
+            config = BertConfig.from_pretrained('./test/saved_model/my_configuration.json')
+            config = BertConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False)
+            assert config.output_attention == True
+            config, unused_kwargs = BertConfig.from_pretrained('bert-base-uncased', output_attention=True,
+                                                               foo=False, return_unused_kwargs=True)
+            assert config.output_attention == True
+            assert unused_kwargs == {'foo': False}
+
+        """
+        cache_dir = kwargs.pop('cache_dir', None)
+        force_download = kwargs.pop('force_download', False)
+        proxies = kwargs.pop('proxies', None)
+        return_unused_kwargs = kwargs.pop('return_unused_kwargs', False)
+
+        if pretrained_model_name_or_path in cls.pretrained_config_archive_map:
+            config_file = cls.pretrained_config_archive_map[pretrained_model_name_or_path]
+        elif os.path.isdir(pretrained_model_name_or_path):
+            config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
+        else:
+            config_file = pretrained_model_name_or_path
+        # redirect to the cache, if necessary
+        try:
+            resolved_config_file = cached_path(config_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
+        except EnvironmentError as e:
+            if pretrained_model_name_or_path in cls.pretrained_config_archive_map:
+                logger.error(
+                    "Couldn't reach server at '{}' to download pretrained model configuration file.".format(
+                        config_file))
+            else:
+                logger.error(
+                    "Model name '{}' was not found in model name list ({}). "
+                    "We assumed '{}' was a path or url but couldn't find any file "
+                    "associated to this path or url.".format(
+                        pretrained_model_name_or_path,
+                        ', '.join(cls.pretrained_config_archive_map.keys()),
+                        config_file))
+            raise e
+        if resolved_config_file == config_file:
+            logger.info("loading configuration file {}".format(config_file))
+        else:
+            logger.info("loading configuration file {} from cache at {}".format(
+                config_file, resolved_config_file))
+
+        # Load config
+        config = cls.from_json_file(resolved_config_file)
+
+        if hasattr(config, 'pruned_heads'):
+            config.pruned_heads = dict((int(key), set(value)) for key, value in config.pruned_heads.items())
+
+        # Update config with kwargs if needed
+        to_remove = []
+        for key, value in kwargs.items():
+            if hasattr(config, key):
+                setattr(config, key, value)
+                to_remove.append(key)
+        for key in to_remove:
+            kwargs.pop(key, None)
+
+        logger.info("Model config %s", config)
+        if return_unused_kwargs:
+            return config, kwargs
+        else:
+            return config
+
+    @classmethod
+    def from_dict(cls, json_object):
+        """Constructs a `Config` from a Python dictionary of parameters."""
+        config = cls(vocab_size_or_config_json_file=-1)
+        for key, value in json_object.items():
+            setattr(config, key, value)
+        return config
+
+    @classmethod
+    def from_json_file(cls, json_file):
+        """Constructs a `BertConfig` from a json file of parameters."""
+        with open(json_file, "r", encoding='utf-8') as reader:
+            text = reader.read()
+        return cls.from_dict(json.loads(text))
+
+    def __eq__(self, other):
+        return self.__dict__ == other.__dict__
+
+    def __repr__(self):
+        return str(self.to_json_string())
+
+    def to_dict(self):
+        """Serializes this instance to a Python dictionary."""
+        output = copy.deepcopy(self.__dict__)
+        return output
+
+    def to_json_string(self):
+        """Serializes this instance to a JSON string."""
+        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
+
+    def to_json_file(self, json_file_path):
+        """ Save this instance to a json file."""
+        with open(json_file_path, "w", encoding='utf-8') as writer:
+            writer.write(self.to_json_string())
--- a/transformers/configuration_xlm.py
+++ b/transformers/configuration_xlm.py
@@ -0,0 +1,181 @@
+# coding=utf-8
+# Copyright 2019-present, Facebook, Inc and the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" XLM configuration """
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import json
+import logging
+import sys
+from io import open
+
+from .configuration_utils import PretrainedConfig
+
+logger = logging.getLogger(__name__)
+
+XLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-config.json",
+    'xlm-mlm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-config.json",
+    'xlm-mlm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-config.json",
+    'xlm-mlm-enro-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-config.json",
+    'xlm-mlm-tlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-config.json",
+    'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-config.json",
+    'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-config.json",
+    'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-config.json",
+    'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-config.json",
+    'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-config.json",
+}
+
+
+class XLMConfig(PretrainedConfig):
+    """Configuration class to store the configuration of a `XLMModel`.
+
+    Args:
+        vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `XLMModel`.
+        d_model: Size of the encoder layers and the pooler layer.
+        n_layer: Number of hidden layers in the Transformer encoder.
+        n_head: Number of attention heads for each attention layer in
+            the Transformer encoder.
+        d_inner: The size of the "intermediate" (i.e., feed-forward)
+            layer in the Transformer encoder.
+        ff_activation: The non-linear activation function (function or string) in the
+            encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
+        untie_r: untie relative position biases
+        attn_type: 'bi' for XLM, 'uni' for Transformer-XL
+
+        dropout: The dropout probabilitiy for all fully connected
+            layers in the embeddings, encoder, and pooler.
+        max_position_embeddings: The maximum sequence length that this model might
+            ever be used with. Typically set this to something large just in case
+            (e.g., 512 or 1024 or 2048).
+        initializer_range: The sttdev of the truncated_normal_initializer for
+            initializing all weight matrices.
+        layer_norm_eps: The epsilon used by LayerNorm.
+
+        dropout: float, dropout rate.
+        init: str, the initialization scheme, either "normal" or "uniform".
+        init_range: float, initialize the parameters with a uniform distribution
+            in [-init_range, init_range]. Only effective when init="uniform".
+        init_std: float, initialize the parameters with a normal distribution
+            with mean 0 and stddev init_std. Only effective when init="normal".
+        mem_len: int, the number of tokens to cache.
+        reuse_len: int, the number of tokens in the currect batch to be cached
+            and reused in the future.
+        bi_data: bool, whether to use bidirectional input pipeline.
+            Usually set to True during pretraining and False during finetuning.
+        clamp_len: int, clamp all relative distances larger than clamp_len.
+            -1 means no clamping.
+        same_length: bool, whether to use the same attention length for each token.
+    """
+    pretrained_config_archive_map = XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
+
+    def __init__(self,
+                 vocab_size_or_config_json_file=30145,
+                 emb_dim=2048,
+                 n_layers=12,
+                 n_heads=16,
+                 dropout=0.1,
+                 attention_dropout=0.1,
+                 gelu_activation=True,
+                 sinusoidal_embeddings=False,
+                 causal=False,
+                 asm=False,
+                 n_langs=1,
+                 use_lang_emb=True,
+                 max_position_embeddings=512,
+                 embed_init_std=2048 ** -0.5,
+                 layer_norm_eps=1e-12,
+                 init_std=0.02,
+                 bos_index=0,
+                 eos_index=1,
+                 pad_index=2,
+                 unk_index=3,
+                 mask_index=5,
+                 is_encoder=True,
+
+                 finetuning_task=None,
+                 num_labels=2,
+                 summary_type='first',
+                 summary_use_proj=True,
+                 summary_activation=None,
+                 summary_proj_to_labels=True,
+                 summary_first_dropout=0.1,
+                 start_n_top=5,
+                 end_n_top=5,
+                 **kwargs):
+        """Constructs XLMConfig.
+        """
+        super(XLMConfig, self).__init__(**kwargs)
+
+        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
+                        and isinstance(vocab_size_or_config_json_file, unicode)):
+            with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
+                json_config = json.loads(reader.read())
+            for key, value in json_config.items():
+                self.__dict__[key] = value
+        elif isinstance(vocab_size_or_config_json_file, int):
+            self.n_words = vocab_size_or_config_json_file
+            self.emb_dim = emb_dim
+            self.n_layers = n_layers
+            self.n_heads = n_heads
+            self.dropout = dropout
+            self.attention_dropout = attention_dropout
+            self.gelu_activation = gelu_activation
+            self.sinusoidal_embeddings = sinusoidal_embeddings
+            self.causal = causal
+            self.asm = asm
+            self.n_langs = n_langs
+            self.use_lang_emb = use_lang_emb
+            self.layer_norm_eps = layer_norm_eps
+            self.bos_index = bos_index
+            self.eos_index = eos_index
+            self.pad_index = pad_index
+            self.unk_index = unk_index
+            self.mask_index = mask_index
+            self.is_encoder = is_encoder
+            self.max_position_embeddings = max_position_embeddings
+            self.embed_init_std = embed_init_std
+            self.init_std = init_std
+            self.finetuning_task = finetuning_task
+            self.num_labels = num_labels
+            self.summary_type = summary_type
+            self.summary_use_proj = summary_use_proj
+            self.summary_activation = summary_activation
+            self.summary_proj_to_labels = summary_proj_to_labels
+            self.summary_first_dropout = summary_first_dropout
+            self.start_n_top = start_n_top
+            self.end_n_top = end_n_top
+        else:
+            raise ValueError("First argument must be either a vocabulary size (int)"
+                             " or the path to a pretrained model config file (str)")
+
+    @property
+    def vocab_size(self):
+        return self.n_words
+
+    @vocab_size.setter
+    def vocab_size(self, value):
+        self.n_words = value
+
+    @property
+    def hidden_size(self):
+        return self.emb_dim
+
+    @property
+    def num_attention_heads(self):
+        return self.n_heads
+
+    @property
+    def num_hidden_layers(self):
+        return self.n_layers
--- a/transformers/configuration_xlnet.py
+++ b/transformers/configuration_xlnet.py
@@ -0,0 +1,170 @@
+# coding=utf-8
+# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" XLNet configuration """
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import json
+import logging
+import sys
+from io import open
+
+from .configuration_utils import PretrainedConfig
+
+logger = logging.getLogger(__name__)
+
+XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    'xlnet-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-config.json",
+    'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-config.json",
+}
+
+
+class XLNetConfig(PretrainedConfig):
+    """Configuration class to store the configuration of a ``XLNetModel``.
+
+    Args:
+        vocab_size_or_config_json_file: Vocabulary size of ``inputs_ids`` in ``XLNetModel``.
+        d_model: Size of the encoder layers and the pooler layer.
+        n_layer: Number of hidden layers in the Transformer encoder.
+        n_head: Number of attention heads for each attention layer in
+            the Transformer encoder.
+        d_inner: The size of the "intermediate" (i.e., feed-forward)
+            layer in the Transformer encoder.
+        ff_activation: The non-linear activation function (function or string) in the
+            encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
+        untie_r: untie relative position biases
+        attn_type: 'bi' for XLNet, 'uni' for Transformer-XL
+
+        dropout: The dropout probabilitiy for all fully connected
+            layers in the embeddings, encoder, and pooler.
+        initializer_range: The sttdev of the truncated_normal_initializer for
+            initializing all weight matrices.
+        layer_norm_eps: The epsilon used by LayerNorm.
+
+        dropout: float, dropout rate.
+        init: str, the initialization scheme, either "normal" or "uniform".
+        init_range: float, initialize the parameters with a uniform distribution
+            in [-init_range, init_range]. Only effective when init="uniform".
+        init_std: float, initialize the parameters with a normal distribution
+            with mean 0 and stddev init_std. Only effective when init="normal".
+        mem_len: int, the number of tokens to cache.
+        reuse_len: int, the number of tokens in the currect batch to be cached
+            and reused in the future.
+        bi_data: bool, whether to use bidirectional input pipeline.
+            Usually set to True during pretraining and False during finetuning.
+        clamp_len: int, clamp all relative distances larger than clamp_len.
+            -1 means no clamping.
+        same_length: bool, whether to use the same attention length for each token.
+        finetuning_task: name of the glue task on which the model was fine-tuned if any
+    """
+    pretrained_config_archive_map = XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP
+
+    def __init__(self,
+                 vocab_size_or_config_json_file=32000,
+                 d_model=1024,
+                 n_layer=24,
+                 n_head=16,
+                 d_inner=4096,
+                 max_position_embeddings=512,
+                 ff_activation="gelu",
+                 untie_r=True,
+                 attn_type="bi",
+
+                 initializer_range=0.02,
+                 layer_norm_eps=1e-12,
+
+                 dropout=0.1,
+                 mem_len=None,
+                 reuse_len=None,
+                 bi_data=False,
+                 clamp_len=-1,
+                 same_length=False,
+
+                 finetuning_task=None,
+                 num_labels=2,
+                 summary_type='last',
+                 summary_use_proj=True,
+                 summary_activation='tanh',
+                 summary_last_dropout=0.1,
+                 start_n_top=5,
+                 end_n_top=5,
+                 **kwargs):
+        """Constructs XLNetConfig.
+        """
+        super(XLNetConfig, self).__init__(**kwargs)
+
+        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
+                        and isinstance(vocab_size_or_config_json_file, unicode)):
+            with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
+                json_config = json.loads(reader.read())
+            for key, value in json_config.items():
+                setattr(config, key, value)
+        elif isinstance(vocab_size_or_config_json_file, int):
+            self.n_token = vocab_size_or_config_json_file
+            self.d_model = d_model
+            self.n_layer = n_layer
+            self.n_head = n_head
+            assert d_model % n_head == 0
+            self.d_head = d_model // n_head
+            self.ff_activation = ff_activation
+            self.d_inner = d_inner
+            self.untie_r = untie_r
+            self.attn_type = attn_type
+
+            self.initializer_range = initializer_range
+            self.layer_norm_eps = layer_norm_eps
+
+            self.dropout = dropout
+            self.mem_len = mem_len
+            self.reuse_len = reuse_len
+            self.bi_data = bi_data
+            self.clamp_len = clamp_len
+            self.same_length = same_length
+
+            self.finetuning_task = finetuning_task
+            self.num_labels = num_labels
+            self.summary_type = summary_type
+            self.summary_use_proj = summary_use_proj
+            self.summary_activation = summary_activation
+            self.summary_last_dropout = summary_last_dropout
+            self.start_n_top = start_n_top
+            self.end_n_top = end_n_top
+        else:
+            raise ValueError("First argument must be either a vocabulary size (int)"
+                             " or the path to a pretrained model config file (str)")
+
+    @property
+    def max_position_embeddings(self):
+        return -1
+
+    @property
+    def vocab_size(self):
+        return self.n_token
+
+    @vocab_size.setter
+    def vocab_size(self, value):
+        self.n_token = value
+
+    @property
+    def hidden_size(self):
+        return self.d_model
+
+    @property
+    def num_attention_heads(self):
+        return self.n_head
+
+    @property
+    def num_hidden_layers(self):
+        return self.n_layer
--- a/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py
+++ b/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py
@@ -21,7 +21,7 @@ from __future__ import print_function
 import argparse
 import torch

-from pytorch_transformers.modeling_bert import BertConfig, BertForPreTraining, load_tf_weights_in_bert
+from transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert

 import logging
 logging.basicConfig(level=logging.INFO)
--- a/transformers/convert_bert_pytorch_checkpoint_to_original_tf.py
+++ b/transformers/convert_bert_pytorch_checkpoint_to_original_tf.py
@@ -20,7 +20,7 @@ import argparse
 import torch
 import numpy as np
 import tensorflow as tf
-from pytorch_transformers.modeling import BertModel
+from transformers import BertModel


 def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str, model_name:str):
--- a/transformers/convert_gpt2_original_tf_checkpoint_to_pytorch.py
+++ b/transformers/convert_gpt2_original_tf_checkpoint_to_pytorch.py
@@ -21,7 +21,7 @@ from io import open

 import torch

-from pytorch_transformers.modeling_gpt2 import (CONFIG_NAME, WEIGHTS_NAME,
+from transformers import (CONFIG_NAME, WEIGHTS_NAME,
                                                     GPT2Config,
                                                     GPT2Model,
                                                     load_tf_weights_in_gpt2)
--- a/Show More
+++ b/Show More