diff --git a/.circleci/config.yml b/.circleci/config.yml
index 01e6d82b33..bfa3b943aa 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -1,9 +1,11 @@
 version: 2
 jobs:
-    build_py3_torch_and_tf:
+    run_tests_py3_torch_and_tf:
         working_directory: ~/transformers
         docker:
             - image: circleci/python:3.5
+        environment:
+            OMP_NUM_THREADS: 1
         resource_class: xlarge
         parallelism: 1
         steps:
@@ -11,65 +13,67 @@ jobs:
             - run: sudo pip install torch
             - run: sudo pip install tensorflow
             - run: sudo pip install --progress-bar off .
-            - run: sudo pip install pytest codecov pytest-cov
+            - run: sudo pip install pytest codecov pytest-cov pytest-xdist
             - run: sudo pip install tensorboardX scikit-learn
-            - run: python -m pytest -sv ./transformers/tests/ --cov
+            - run: python -m pytest -n 8 --dist=loadfile -s -v ./transformers/tests/ --cov
             - run: codecov
-    build_py3_torch:
+    run_tests_py3_torch:
         working_directory: ~/transformers
         docker:
             - image: circleci/python:3.5
+        environment:
+            OMP_NUM_THREADS: 1
         resource_class: xlarge
         parallelism: 1
         steps:
             - checkout
             - run: sudo pip install torch
             - run: sudo pip install --progress-bar off .
-            - run: sudo pip install pytest codecov pytest-cov
+            - run: sudo pip install pytest codecov pytest-cov pytest-xdist
             - run: sudo pip install tensorboardX scikit-learn
-            - run: python -m pytest -sv ./transformers/tests/ --cov
-            - run: python -m pytest -sv ./examples/
+            - run: python -m pytest -n 8 --dist=loadfile -s -v ./transformers/tests/ --cov
             - run: codecov
-    build_py3_tf:
+    run_tests_py3_tf:
         working_directory: ~/transformers
         docker:
             - image: circleci/python:3.5
+        environment:
+            OMP_NUM_THREADS: 1
         resource_class: xlarge
         parallelism: 1
         steps:
             - checkout
             - run: sudo pip install tensorflow
             - run: sudo pip install --progress-bar off .
-            - run: sudo pip install pytest codecov pytest-cov
+            - run: sudo pip install pytest codecov pytest-cov pytest-xdist
             - run: sudo pip install tensorboardX scikit-learn
-            - run: python -m pytest -sv ./transformers/tests/ --cov
+            - run: python -m pytest -n 8 --dist=loadfile -s -v ./transformers/tests/ --cov
             - run: codecov
-    build_py2_torch:
+    run_tests_py3_custom_tokenizers:
         working_directory: ~/transformers
-        resource_class: large
-        parallelism: 1
         docker:
-            - image: circleci/python:2.7
+            - image: circleci/python:3.5
+        steps:
+            - checkout
+            - run: sudo pip install --progress-bar off .
+            - run: sudo pip install pytest pytest-xdist
+            - run: sudo pip install mecab-python3
+            - run: RUN_CUSTOM_TOKENIZERS=1 python -m pytest -sv ./transformers/tests/tokenization_bert_japanese_test.py
+    run_examples_py3_torch:
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.5
+        environment:
+            OMP_NUM_THREADS: 1
+        resource_class: xlarge
+        parallelism: 1
         steps:
             - checkout
             - run: sudo pip install torch
             - run: sudo pip install --progress-bar off .
-            - run: sudo pip install pytest codecov pytest-cov
-            - run: python -m pytest -sv ./transformers/tests/ --cov
-            - run: codecov
-    build_py2_tf:
-        working_directory: ~/transformers
-        resource_class: large
-        parallelism: 1
-        docker:
-            - image: circleci/python:2.7
-        steps:
-            - checkout
-            - run: sudo pip install tensorflow
-            - run: sudo pip install --progress-bar off .
-            - run: sudo pip install pytest codecov pytest-cov
-            - run: python -m pytest -sv ./transformers/tests/ --cov
-            - run: codecov
+            - run: sudo pip install pytest pytest-xdist
+            - run: sudo pip install tensorboardX scikit-learn
+            - run: python -m pytest -n 8 --dist=loadfile -s -v ./examples/
     deploy_doc:
         working_directory: ~/transformers
         docker:
@@ -82,6 +86,16 @@ jobs:
             - run: sudo pip install --progress-bar off -r docs/requirements.txt
             - run: sudo pip install --progress-bar off -r requirements.txt
             - run: ./.circleci/deploy.sh
+    check_repository_consistency:
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.5
+        resource_class: small
+        parallelism: 1
+        steps:
+            - checkout
+            - run: sudo pip install requests
+            - run: python ./utils/link_tester.py
 workflow_filters: &workflow_filters
     filters:
         branches:
@@ -91,9 +105,10 @@ workflows:
     version: 2
     build_and_test:
         jobs:
-            - build_py3_torch_and_tf
-            - build_py3_torch
-            - build_py3_tf
-            - build_py2_torch
-            - build_py2_tf
+            - check_repository_consistency
+            - run_examples_py3_torch
+            - run_tests_py3_custom_tokenizers
+            - run_tests_py3_torch_and_tf
+            - run_tests_py3_torch
+            - run_tests_py3_tf
             - deploy_doc: *workflow_filters
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 8228dd59d8..7d7f2c73ff 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -168,7 +168,7 @@ Follow these steps to start contributing:
    to be merged;
 4. Make sure pre-existing tests still pass;
 5. Add high-coverage tests. No quality test, no merge;
-6. All public methods must have informative doctrings;
+6. All public methods must have informative docstrings;
 
 
 ### Style guide
diff --git a/README.md b/README.md
index f3aa8a95ee..416adcc1ef 100644
--- a/README.md
+++ b/README.md
@@ -55,10 +55,12 @@ Choose the right framework for every part of a model's lifetime
 | [Online demo](#online-demo) | Experimenting with this repo’s text generation capabilities |
 | [Quick tour: Usage](#quick-tour) | Tokenizers & models usage: Bert and GPT-2 |
 | [Quick tour: TF 2.0 and PyTorch ](#Quick-tour-TF-20-training-and-PyTorch-interoperability) | Train a TF 2.0 model in 10 lines of code, load it in PyTorch |
+| [Quick tour: pipelines](#quick-tour-of-pipelines) | Using Pipelines: Wrapper around tokenizer and models to use finetuned models |
 | [Quick tour: Fine-tuning/usage scripts](#quick-tour-of-the-fine-tuningusage-scripts) | Using provided scripts: GLUE, SQuAD and Text generation |
+| [Quick tour: Share your models ](#Quick-tour-of-model-sharing) | Upload and share your fine-tuned models with the community |
 | [Migrating from pytorch-transformers to transformers](#Migrating-from-pytorch-transformers-to-transformers) | Migrating your code from pytorch-transformers to transformers |
 | [Migrating from pytorch-pretrained-bert to pytorch-transformers](#Migrating-from-pytorch-pretrained-bert-to-transformers) | Migrating your code from pytorch-pretrained-bert to transformers |
-| [Documentation][(v2.2.0/v2.2.1)](https://huggingface.co/transformers/v2.2.0) [(v2.1.1)](https://huggingface.co/transformers/v2.1.1) [(v2.0.0)](https://huggingface.co/transformers/v2.0.0) [(v1.2.0)](https://huggingface.co/transformers/v1.2.0) [(v1.1.0)](https://huggingface.co/transformers/v1.1.0) [(v1.0.0)](https://huggingface.co/transformers/v1.0.0) [(master)](https://huggingface.co/transformers) | Full API documentation and more |
+| [Documentation][(v2.3.0)](https://huggingface.co/transformers/v2.3.0)[(v2.2.0/v2.2.1/v2.2.2)](https://huggingface.co/transformers/v2.2.0) [(v2.1.1)](https://huggingface.co/transformers/v2.1.1) [(v2.0.0)](https://huggingface.co/transformers/v2.0.0) [(v1.2.0)](https://huggingface.co/transformers/v1.2.0) [(v1.1.0)](https://huggingface.co/transformers/v1.1.0) [(v1.0.0)](https://huggingface.co/transformers/v1.0.0) [(master)](https://huggingface.co/transformers) | Full API documentation and more |
 
 ## Installation
 
@@ -144,7 +146,9 @@ At some point in the future, you'll be able to seamlessly move from pre-training
 9. **[CTRL](https://github.com/salesforce/ctrl/)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
 10. **[CamemBERT](https://camembert-model.fr)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
 11. **[ALBERT](https://github.com/google-research/ALBERT)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
-11. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedbacks before starting your PR.
+12. **[T5](https://github.com/google-research/text-to-text-transfer-transformer)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
+13. **[XLM-RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/xlmr)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
+14. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedbacks before starting your PR.
 
 These implementations have been tested on several datasets (see the example scripts) and should match the performances of the original implementations (e.g. ~93 F1 on SQuAD for BERT Whole-Word-Masking, ~88 F1 on RocStories for OpenAI GPT, ~18.3 perplexity on WikiText 103 for Transformer-XL, ~0.916 Peason R coefficient on STS-B for XLNet). You can find more details on the performances in the Examples section of the [documentation](https://huggingface.co/transformers/examples.html).
 
@@ -166,7 +170,7 @@ import torch
 from transformers import *
 
 # Transformers has a unified API
-# for 8 transformer architectures and 30 pretrained weights.
+# for 10 transformer architectures and 30 pretrained weights.
 #          Model          | Tokenizer          | Pretrained weights shortcut
 MODELS = [(BertModel,       BertTokenizer,       'bert-base-uncased'),
           (OpenAIGPTModel,  OpenAIGPTTokenizer,  'openai-gpt'),
@@ -176,7 +180,9 @@ MODELS = [(BertModel,       BertTokenizer,       'bert-base-uncased'),
           (XLNetModel,      XLNetTokenizer,      'xlnet-base-cased'),
           (XLMModel,        XLMTokenizer,        'xlm-mlm-enfr-1024'),
           (DistilBertModel, DistilBertTokenizer, 'distilbert-base-uncased'),
-          (RobertaModel,    RobertaTokenizer,    'roberta-base')]
+          (RobertaModel,    RobertaTokenizer,    'roberta-base'),
+          (XLMRobertaModel, XLMRobertaTokenizer, 'xlm-roberta-base'),
+         ]
 
 # To use TensorFlow 2.0 versions of the models, simply prefix the class names with 'TF', e.g. `TFRobertaModel` is the TF 2.0 counterpart of the PyTorch model `RobertaModel`
 
@@ -445,6 +451,76 @@ python ./examples/run_generation.py \
     --repetition_penalty=1.2 \
 ```
 
+## Quick tour of model sharing
+
+New in `v2.2.2`: you can now upload and share your fine-tuned models with the community, using the <abbr title="Command-line interface">CLI</abbr> that's built-in to the library.
+
+**First, create an account on [https://huggingface.co/join](https://huggingface.co/join)**. Then:
+
+```shell
+transformers-cli login
+# log in using the same credentials as on huggingface.co
+```
+Upload your model:
+```shell
+transformers-cli upload ./path/to/pretrained_model/
+
+# ^^ Upload folder containing weights/tokenizer/config
+# saved via `.save_pretrained()`
+
+transformers-cli upload ./config.json [--filename folder/foobar.json]
+
+# ^^ Upload a single file
+# (you can optionally override its filename, which can be nested inside a folder)
+```
+
+Your model will then be accessible through its identifier, a concatenation of your username and the folder name above:
+```python
+"username/model_name"
+```
+
+Anyone can load it from code:
+```python
+tokenizer = AutoTokenizer.from_pretrained("username/pretrained_model")
+model = AutoModel.from_pretrained("username/pretrained_model")
+```
+
+Finally, list all your files on S3:
+```shell
+transformers-cli ls
+# List all your S3 objects.
+```
+
+## Quick tour of pipelines
+
+New in version `v2.3`: `Pipeline` are high-level objects which automatically handle tokenization, running your data through a transformers model
+and outputting the result in a structured object. 
+
+You can create `Pipeline` objects for the following down-stream tasks:
+
+ - `feature-extraction`: Generates a tensor representation for the input sequence
+ - `ner`: Generates named entity mapping for each word in the input sequence.
+ - `sentiment-analysis`: Gives the polarity (positive / negative) of the whole input sequence. 
+ - `question-answering`: Provided some context and a question refering to the context, it will extract the answer to the question
+ in the context.
+
+```python
+from transformers import pipeline
+
+# Allocate a pipeline for sentiment-analysis
+nlp = pipeline('sentiment-analysis')
+nlp('We are very happy to include pipeline into the transformers repository.')
+>>> {'label': 'POSITIVE', 'score': 0.99893874}
+
+# Allocate a pipeline for question-answering
+nlp = pipeline('question-answering')
+nlp({
+    'question': 'What is the name of the repository ?', 
+    'context': 'Pipeline have been included in the huggingface/transformers repository'
+})
+>>> {'score': 0.28756016668193496, 'start': 35, 'end': 59, 'answer': 'huggingface/transformers'}
+```
+
 ## Migrating from pytorch-transformers to transformers
 
 Here is a quick summary of what you should take care of when migrating from `pytorch-transformers` to `transformers`.
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 2f8505ab3a..41a65eec29 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -26,7 +26,7 @@ author = u'huggingface'
 # The short X.Y version
 version = u''
 # The full version, including alpha/beta/rc tags
-release = u'2.2.1'
+release = u'2.3.0'
 
 
 # -- General configuration ---------------------------------------------------
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 84012fc6cf..0ac9c740a5 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -50,6 +50,7 @@ The library currently contains PyTorch and Tensorflow implementations, pre-train
 9. `CTRL <https://github.com/pytorch/fairseq/tree/master/examples/ctrl>`_ (from Salesforce), released together with the paper `CTRL: A Conditional Transformer Language Model for Controllable Generation <https://www.github.com/salesforce/ctrl>`_ by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
 10. `CamemBERT <https://huggingface.co/transformers/model_doc/camembert.html>`_ (from FAIR, Inria, Sorbonne Université) released together with the paper `CamemBERT: a Tasty French Language Model <https://arxiv.org/abs/1911.03894>`_ by Louis Martin, Benjamin Muller, Pedro Javier Ortiz Suarez, Yoann Dupont, Laurent Romary, Eric Villemonte de la Clergerie, Djame Seddah, and Benoît Sagot.
 11. `ALBERT <https://github.com/google-research/ALBERT>`_ (from Google Research), released together with the paper a `ALBERT: A Lite BERT for Self-supervised Learning of Language Representations <https://arxiv.org/abs/1909.11942>`_ by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
+12. `XLM-RoBERTa <https://github.com/pytorch/fairseq/tree/master/examples/xlmr>`_ (from Facebook AI), released together with the paper `Unsupervised Cross-lingual Representation Learning at Scale <https://arxiv.org/abs/1911.02116>`_ by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
 
 .. toctree::
     :maxdepth: 2
@@ -58,6 +59,7 @@ The library currently contains PyTorch and Tensorflow implementations, pre-train
     installation
     quickstart
     pretrained_models
+    model_sharing
     examples
     notebooks
     serialization
diff --git a/docs/source/model_sharing.md b/docs/source/model_sharing.md
new file mode 100644
index 0000000000..95baafb575
--- /dev/null
+++ b/docs/source/model_sharing.md
@@ -0,0 +1,40 @@
+# Model upload and sharing
+
+Starting with `v2.2.2`, you can now upload and share your fine-tuned models with the community, using the <abbr title="Command-line interface">CLI</abbr> that's built-in to the library.
+
+**First, create an account on [https://huggingface.co/join](https://huggingface.co/join)**. Then:
+
+```shell
+transformers-cli login
+# log in using the same credentials as on huggingface.co
+```
+Upload your model:
+```shell
+transformers-cli upload ./path/to/pretrained_model/
+
+# ^^ Upload folder containing weights/tokenizer/config
+# saved via `.save_pretrained()`
+
+transformers-cli upload ./config.json [--filename folder/foobar.json]
+
+# ^^ Upload a single file
+# (you can optionally override its filename, which can be nested inside a folder)
+```
+
+Your model will then be accessible through its identifier, a concatenation of your username and the folder name above:
+```python
+"username/pretrained_model"
+```
+
+Anyone can load it from code:
+```python
+tokenizer = AutoTokenizer.from_pretrained("username/pretrained_model")
+model = AutoModel.from_pretrained("username/pretrained_model")
+```
+
+Finally, list all your files on S3:
+```shell
+transformers-cli ls
+# List all your S3 objects.
+```
+
diff --git a/docs/source/pretrained_models.rst b/docs/source/pretrained_models.rst
index dd61f11769..eb7b41ffc9 100644
--- a/docs/source/pretrained_models.rst
+++ b/docs/source/pretrained_models.rst
@@ -3,6 +3,7 @@ Pretrained models
 
 Here is the full list of the currently provided pretrained models together with a short presentation of each model.
 
+For a list that includes community-uploaded models, refer to `https://huggingface.co/models <https://huggingface.co/models>`__.
 
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 | Architecture      | Shortcut name                                              | Details of the model                                                                                                                  |
@@ -61,6 +62,32 @@ Here is the full list of the currently provided pretrained models together with
 |                   | ``bert-base-german-dbmdz-uncased``                         | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
 |                   |                                                            | | Trained on uncased German text by DBMDZ                                                                                             |
 |                   |                                                            | (see `details on dbmdz repository <https://github.com/dbmdz/german-bert>`__).                                                         |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-base-japanese``                                     | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                   |                                                            | | Trained on Japanese text. Text is tokenized with MeCab and WordPiece.                                                               |
+|                   |                                                            | | `MeCab <https://taku910.github.io/mecab/>`__ is required for tokenization.                                                          |
+|                   |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                               |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-base-japanese-whole-word-masking``                  | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                   |                                                            | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized with MeCab and WordPiece.                                      |
+|                   |                                                            | | `MeCab <https://taku910.github.io/mecab/>`__ is required for tokenization.                                                          |
+|                   |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                               |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-base-japanese-char``                                | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                   |                                                            | | Trained on Japanese text. Text is tokenized into characters.                                                                        |
+|                   |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                               |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-base-japanese-char-whole-word-masking``             | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                   |                                                            | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized into characters.                                               |
+|                   |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                               |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-base-finnish-cased-v1``                             | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                   |                                                            | | Trained on cased Finnish text.                                                                                                      |
+|                   |                                                            | (see `details on turkunlp.org <http://turkunlp.org/FinBERT/>`__).                                                                     |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-base-finnish-uncased-v1``                           | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                   |                                                            | | Trained on uncased Finnish text.                                                                                                    |
+|                   |                                                            | (see `details on turkunlp.org <http://turkunlp.org/FinBERT/>`__).                                                                     |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 | GPT               | ``openai-gpt``                                             | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
 |                   |                                                            | | OpenAI GPT English model                                                                                                            |
@@ -128,6 +155,10 @@ Here is the full list of the currently provided pretrained models together with
 |                   |                                                            | | ``roberta-large`` fine-tuned on `MNLI <http://www.nyu.edu/projects/bowman/multinli/>`__.                                            |
 |                   |                                                            | (see `details <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`__)                                                   |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``distilroberta-base``                                     | | 6-layer, 768-hidden, 12-heads, 82M parameters                                                                                       |
+|                   |                                                            | | The DistilRoBERTa model distilled from the RoBERTa model `roberta-base` checkpoint.                                                 |
+|                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``roberta-base-openai-detector``                           | | 12-layer, 768-hidden, 12-heads, 125M parameters                                                                                     |
 |                   |                                                            | | ``roberta-base`` fine-tuned by OpenAI on the outputs of the 1.5B-parameter GPT-2 model.                                             |
 |                   |                                                            | (see `details <https://github.com/openai/gpt-2-output-dataset/tree/master/detector>`__)                                               |
@@ -148,10 +179,6 @@ Here is the full list of the currently provided pretrained models together with
 |                   |                                                            | | The DistilGPT2 model distilled from the GPT2 model `gpt2` checkpoint.                                                               |
 |                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``distilroberta-base``                                     | | 6-layer, 768-hidden, 12-heads, 82M parameters                                                                                       |
-|                   |                                                            | | The DistilRoBERTa model distilled from the RoBERTa model `roberta-base` checkpoint.                                                 |
-|                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``distilbert-base-german-cased``                           | | 6-layer, 768-hidden, 12-heads, 66M parameters                                                                                       |
 |                   |                                                            | | The German DistilBERT model distilled from the German DBMDZ BERT model `bert-base-german-dbmdz-cased` checkpoint.                   |
 |                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
@@ -169,35 +196,56 @@ Here is the full list of the currently provided pretrained models together with
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 | ALBERT            | ``albert-base-v1``                                         | | 12 repeating layers, 128 embedding, 768-hidden, 12-heads, 11M parameters                                                            |
 |                   |                                                            | | ALBERT base model                                                                                                                   |
-|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                             |
+|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``albert-large-v1``                                        | | 24 repeating layers, 128 embedding, 1024-hidden, 16-heads, 17M parameters                                                           |
 |                   |                                                            | | ALBERT large model                                                                                                                  |
-|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                             |
+|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``albert-xlarge-v1``                                       | | 24 repeating layers, 128 embedding, 2048-hidden, 16-heads, 58M parameters                                                           |
 |                   |                                                            | | ALBERT xlarge model                                                                                                                 |
-|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                             |
+|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``albert-xxlarge-v1``                                      | | 12 repeating layer, 128 embedding, 4096-hidden, 64-heads, 223M parameters                                                           |
 |                   |                                                            | | ALBERT xxlarge model                                                                                                                |
-|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                             |
+|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``albert-base-v2``                                         | | 12 repeating layers, 128 embedding, 768-hidden, 12-heads, 11M parameters                                                            |
 |                   |                                                            | | ALBERT base model with no dropout, additional training data and longer training                                                     |
-|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                             |
+|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``albert-large-v2``                                        | | 24 repeating layers, 128 embedding, 1024-hidden, 16-heads, 17M parameters                                                           |
 |                   |                                                            | | ALBERT large model with no dropout, additional training data and longer training                                                    |
-|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                             |
+|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``albert-xlarge-v2``                                       | | 24 repeating layers, 128 embedding, 2048-hidden, 16-heads, 58M parameters                                                           |
 |                   |                                                            | | ALBERT xlarge model with no dropout, additional training data and longer training                                                   |
-|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                             |
+|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``albert-xxlarge-v2``                                      | | 12 repeating layer, 128 embedding, 4096-hidden, 64-heads, 223M parameters                                                           |
 |                   |                                                            | | ALBERT xxlarge model with no dropout, additional training data and longer training                                                  |
-|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                             |
+|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
++-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+| T5                | ``t5-small``                                               | | ~60M parameters with 6-layers, 512-hidden-state, 2048 feed-forward hidden-state, 8-heads,                                           |
+|                   |                                                            | | Trained on English text: the Colossal Clean Crawled Corpus (C4)                                                                     |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``t5-base``                                                | | ~220M parameters with 12-layers, 768-hidden-state, 3072 feed-forward hidden-state, 12-heads,                                        |
+|                   |                                                            | | Trained on English text: the Colossal Clean Crawled Corpus (C4)                                                                     |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``t5-large``                                               | | ~770M parameters with 24-layers, 1024-hidden-state, 4096 feed-forward hidden-state, 16-heads,                                       |
+|                   |                                                            | | Trained on English text: the Colossal Clean Crawled Corpus (C4)                                                                     |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``t5-3B``                                                  | | ~2.8B parameters with 24-layers, 1024-hidden-state, 16384 feed-forward hidden-state, 32-heads,                                      |
+|                   |                                                            | | Trained on English text: the Colossal Clean Crawled Corpus (C4)                                                                     |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``t5-11B``                                                 | | ~11B parameters with 24-layers, 1024-hidden-state, 65536 feed-forward hidden-state, 128-heads,                                      |
+|                   |                                                            | | Trained on English text: the Colossal Clean Crawled Corpus (C4)                                                                     |
++-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+| XLM-RoBERTa       | ``xlm-roberta-base``                                       | | ~125M parameters with 12-layers, 768-hidden-state, 3072 feed-forward hidden-state, 8-heads,                                         |
+|                   |                                                            | | Trained on on 2.5 TB of newly created clean CommonCrawl data in 100 languages                                                       |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``xlm-roberta-large``                                      | | ~355M parameters with 24-layers, 1027-hidden-state, 4096 feed-forward hidden-state, 16-heads,                                       |
+|                   |                                                            | | Trained on 2.5 TB of newly created clean CommonCrawl data in 100 languages                                                          |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 
 
diff --git a/docs/source/quickstart.md b/docs/source/quickstart.md
index 530aff8eb0..60e2cf3fd8 100644
--- a/docs/source/quickstart.md
+++ b/docs/source/quickstart.md
@@ -219,4 +219,97 @@ sequence = tokenizer.decode(generated)
 print(sequence)
 ```
 
-The model only requires a single token as input as all the previous tokens' key/value pairs are contained in the `past`.
\ No newline at end of file
+The model only requires a single token as input as all the previous tokens' key/value pairs are contained in the `past`.
+
+### Model2Model example
+
+Encoder-decoder architectures require two tokenized inputs: one for the encoder and the other one for the decoder. Let's assume that we want to use `Model2Model` for generative question answering, and start by tokenizing the question and answer that will be fed to the model.
+
+```python
+import torch
+from transformers import BertTokenizer, Model2Model
+
+# OPTIONAL: if you want to have more information on what's happening under the hood, activate the logger as follows
+import logging
+logging.basicConfig(level=logging.INFO)
+
+# Load pre-trained model tokenizer (vocabulary)
+tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+
+# Encode the input to the encoder (the question)
+question = "Who was Jim Henson?"
+encoded_question = tokenizer.encode(question)
+
+# Encode the input to the decoder (the answer)
+answer = "Jim Henson was a puppeteer"
+encoded_answer = tokenizer.encode(answer)
+
+# Convert inputs to PyTorch tensors
+question_tensor = torch.tensor([encoded_question])
+answer_tensor = torch.tensor([encoded_answer])
+```
+
+Let's see how we can use `Model2Model` to get the value of the loss associated with this (question, answer) pair:
+
+```python
+# In order to compute the loss we need to provide language model
+# labels (the token ids that the model should have produced) to
+# the decoder.
+lm_labels =  encoded_answer
+labels_tensor = torch.tensor([lm_labels])
+
+# Load pre-trained model (weights)
+model = Model2Model.from_pretrained('bert-base-uncased')
+
+# Set the model in evaluation mode to deactivate the DropOut modules
+# This is IMPORTANT to have reproducible results during evaluation!
+model.eval()
+
+# If you have a GPU, put everything on cuda
+question_tensor = question_tensor.to('cuda')
+answer_tensor = answer_tensor.to('cuda')
+labels_tensor = labels_tensor.to('cuda')
+model.to('cuda')
+
+# Predict hidden states features for each layer
+with torch.no_grad():
+    # See the models docstrings for the detail of the inputs
+    outputs = model(question_tensor, answer_tensor, decoder_lm_labels=labels_tensor)
+    # Transformers models always output tuples.
+    # See the models docstrings for the detail of all the outputs
+    # In our case, the first element is the value of the LM loss 
+    lm_loss = outputs[0]
+```
+
+This loss can be used to fine-tune `Model2Model` on the question answering task. Assuming that we fine-tuned the model, let us now see how to generate an answer:
+
+```python
+# Let's re-use the previous question
+question = "Who was Jim Henson?"
+encoded_question = tokenizer.encode(question)
+question_tensor = torch.tensor([encoded_question])
+
+# This time we try to generate the answer, so we start with an empty sequence
+answer = "[CLS]"
+encoded_answer = tokenizer.encode(answer, add_special_tokens=False)
+answer_tensor = torch.tensor([encoded_answer])
+
+# Load pre-trained model (weights)
+model = Model2Model.from_pretrained('fine-tuned-weights')
+model.eval()
+
+# If you have a GPU, put everything on cuda
+question_tensor = encoded_question.to('cuda')
+answer_tensor = encoded_answer.to('cuda')
+model.to('cuda')
+
+# Predict all tokens
+with torch.no_grad():
+    outputs = model(question_tensor, answer_tensor)
+    predictions = outputs[0]
+
+# confirm we were able to predict 'jim'
+predicted_index = torch.argmax(predictions[0, -1]).item()
+predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
+assert predicted_token == 'jim'
+```
diff --git a/examples/README.md b/examples/README.md
index 620304ea77..fcd2fe1f6f 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -24,8 +24,6 @@ pip install -r ./examples/requirements.txt
 | [Multiple Choice](#multiple-choice) | Examples running BERT/XLNet/RoBERTa on the SWAG/RACE/ARC tasks. 
 | [Named Entity Recognition](#named-entity-recognition) | Using BERT for Named Entity Recognition (NER) on the CoNLL 2003 dataset, examples with distributed training.                                                                                  |
 | [XNLI](#xnli) | Examples running BERT/XLM on the XNLI benchmark. |
-| [Abstractive summarization](#abstractive-summarization) | Using the BertAbs
-model finetuned on the CNN/DailyMail dataset to generate summaries. |
 
 ## TensorFlow 2.0 Bert models on GLUE
 
@@ -469,7 +467,7 @@ Training with the previously defined hyper-parameters yields the following resul
 ## Named Entity Recognition
 
 Based on the scripts [`run_ner.py`](https://github.com/huggingface/transformers/blob/master/examples/run_ner.py) for Pytorch and
-[`run_tf_ner.py`(https://github.com/huggingface/transformers/blob/master/examples/run_tf_ner.py)] for Tensorflow 2.
+[`run_tf_ner.py`](https://github.com/huggingface/transformers/blob/master/examples/run_tf_ner.py) for Tensorflow 2.
 This example fine-tune Bert Multilingual on GermEval 2014 (German NER).
 Details and results for the fine-tuning provided by @stefan-it.
 
@@ -646,34 +644,6 @@ micro avg     0.8722    0.8774    0.8748     13869
 macro avg     0.8712    0.8774    0.8740     13869
 ```
 
-## Abstractive summarization
-
-Based on the script
-[`run_summarization_finetuning.py`](https://github.com/huggingface/transformers/blob/master/examples/run_summarization_finetuning.py).
-
-Before running this script you should download **both** CNN and Daily Mail
-datasets from [Kyunghyun Cho's website](https://cs.nyu.edu/~kcho/DMQA/)  (the
-links next to "Stories") in the same folder. Then uncompress the archives by running:
-
-```bash
-tar -xvf cnn_stories.tgz && tar -xvf dailymail_stories.tgz
-```
-
-note that the finetuning script **will not work** if you do not download both
-datasets. We will refer as `$DATA_PATH` the path to where you uncompressed both
-archive.
-
-```bash
-export DATA_PATH=/path/to/dataset/
-
-python run_summarization_finetuning.py \
-    --output_dir=output \
-    --model_type=bert2bert \
-    --model_name_or_path=bert2bert \
-    --do_train \
-    --data_path=$DATA_PATH \
-```
-
 ## XNLI
 
 Based on the script [`run_xnli.py`](https://github.com/huggingface/transformers/blob/master/examples/run_xnli.py).
diff --git a/examples/run_generation.py b/examples/run_generation.py
index ade85f0269..536d4a18f0 100644
--- a/examples/run_generation.py
+++ b/examples/run_generation.py
@@ -212,9 +212,10 @@ def main():
         repetition_penalty=args.repetition_penalty,
     )
 
-    generated_sequence = output_sequences.tolist()
-    text = [tokenizer.decode(seq, clean_up_tokenization_spaces=True) for seq in generated_sequence]
-    # text = text[: text.find(args.stop_token) if args.stop_token else None]
+    # Batch size == 1. to add more examples please use num_return_sequences > 1 
+    generated_sequence = output_sequences[0].tolist()
+    text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True)
+    text = text[: t.find(args.stop_token) if args.stop_token else None]
 
     print(text)
 
diff --git a/examples/run_glue.py b/examples/run_glue.py
index 369a7110ab..954a8fbf0c 100644
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -52,6 +52,9 @@ from transformers import (WEIGHTS_NAME, BertConfig,
                                   AlbertConfig,
                                   AlbertForSequenceClassification, 
                                   AlbertTokenizer,
+                                  XLMRobertaConfig,
+                                  XLMRobertaForSequenceClassification,
+                                  XLMRobertaTokenizer,
                                 )
 
 from transformers import AdamW, get_linear_schedule_with_warmup
@@ -72,7 +75,8 @@ MODEL_CLASSES = {
     'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
     'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
     'distilbert': (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer),
-    'albert': (AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer)
+    'albert': (AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer),
+    'xlmroberta': (XLMRobertaConfig, XLMRobertaForSequenceClassification, XLMRobertaTokenizer),
 }
 
 
@@ -304,9 +308,9 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
     else:
         logger.info("Creating features from dataset file at %s", args.data_dir)
         label_list = processor.get_labels()
-        if task in ['mnli', 'mnli-mm'] and args.model_type in ['roberta']:
+        if task in ['mnli', 'mnli-mm'] and args.model_type in ['roberta', 'xlmroberta']:
             # HACK(label indices are swapped in RoBERTa pretrained model)
-            label_list[1], label_list[2] = label_list[2], label_list[1] 
+            label_list[1], label_list[2] = label_list[2], label_list[1]
         examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
         features = convert_examples_to_features(examples,
                                                 tokenizer,
@@ -380,7 +384,7 @@ def main():
     parser.add_argument("--learning_rate", default=5e-5, type=float,
                         help="The initial learning rate for Adam.")
     parser.add_argument("--weight_decay", default=0.0, type=float,
-                        help="Weight deay if we apply some.")
+                        help="Weight decay if we apply some.")
     parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                         help="Epsilon for Adam optimizer.")
     parser.add_argument("--max_grad_norm", default=1.0, type=float,
diff --git a/examples/run_lm_finetuning.py b/examples/run_lm_finetuning.py
index c4c73e71af..75848d5acc 100644
--- a/examples/run_lm_finetuning.py
+++ b/examples/run_lm_finetuning.py
@@ -430,7 +430,7 @@ def main():
     parser.add_argument("--learning_rate", default=5e-5, type=float,
                         help="The initial learning rate for Adam.")
     parser.add_argument("--weight_decay", default=0.0, type=float,
-                        help="Weight deay if we apply some.")
+                        help="Weight decay if we apply some.")
     parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                         help="Epsilon for Adam optimizer.")
     parser.add_argument("--max_grad_norm", default=1.0, type=float,
diff --git a/examples/run_ner.py b/examples/run_ner.py
index 1ab1236d94..6426a6d1db 100644
--- a/examples/run_ner.py
+++ b/examples/run_ner.py
@@ -38,11 +38,13 @@ from transformers import WEIGHTS_NAME, BertConfig, BertForTokenClassification, B
 from transformers import RobertaConfig, RobertaForTokenClassification, RobertaTokenizer
 from transformers import DistilBertConfig, DistilBertForTokenClassification, DistilBertTokenizer
 from transformers import CamembertConfig, CamembertForTokenClassification, CamembertTokenizer
+from transformers import XLMRobertaConfig, XLMRobertaForTokenClassification, XLMRobertaTokenizer
 
 logger = logging.getLogger(__name__)
 
 ALL_MODELS = sum(
-    (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, RobertaConfig, DistilBertConfig)),
+    (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, RobertaConfig, DistilBertConfig,
+                                                                   CamembertConfig, XLMRobertaConfig)),
     ())
 
 MODEL_CLASSES = {
@@ -50,6 +52,7 @@ MODEL_CLASSES = {
     "roberta": (RobertaConfig, RobertaForTokenClassification, RobertaTokenizer),
     "distilbert": (DistilBertConfig, DistilBertForTokenClassification, DistilBertTokenizer),
     "camembert": (CamembertConfig, CamembertForTokenClassification, CamembertTokenizer),
+    "xlmroberta": (XLMRobertaConfig, XLMRobertaForTokenClassification, XLMRobertaTokenizer),
 }
 
 
diff --git a/examples/run_squad.py b/examples/run_squad.py
index 2df29014ef..1ff6983f62 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -61,7 +61,6 @@ MODEL_CLASSES = {
     'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer),
     'distilbert': (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer),
     'albert': (AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer),
-    'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer)
 }
 
 def set_seed(args):
@@ -223,7 +222,7 @@ def evaluate(args, model, tokenizer, prefix=""):
     eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
 
     # multi-gpu evaluate
-    if args.n_gpu > 1:
+    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
         model = torch.nn.DataParallel(model)
 
     # Eval!
@@ -299,10 +298,13 @@ def evaluate(args, model, tokenizer, prefix=""):
 
     # XLNet and XLM use a more complex post-processing procedure
     if args.model_type in ['xlnet', 'xlm']:
+        start_n_top = model.config.start_n_top if hasattr(model, "config") else model.module.config.start_n_top
+        end_n_top = model.config.end_n_top if hasattr(model, "config") else model.module.config.end_n_top
+
         predictions = compute_predictions_log_probs(examples, features, all_results, args.n_best_size,
                         args.max_answer_length, output_prediction_file,
                         output_nbest_file, output_null_log_odds_file,
-                        model.config.start_n_top, model.config.end_n_top,
+                        start_n_top, end_n_top,
                         args.version_2_with_negative, tokenizer, args.verbose_logging)
     else:
         predictions = compute_predictions_logits(examples, features, all_results, args.n_best_size,
@@ -334,7 +336,7 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
     else:
         logger.info("Creating features from dataset file at %s", input_dir)
 
-        if not args.data_dir:
+        if not args.data_dir and ((evaluate and not args.predict_file) or (not evaluate and not args.train_file)):
             try:
                 import tensorflow_datasets as tfds
             except ImportError:
@@ -347,7 +349,11 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
             examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate)
         else:
             processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor()
-            examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
+
+            if evaluate:
+                examples = processor.get_dev_examples(args.data_dir, filename=args.predict_file)
+            else:
+                examples = processor.get_train_examples(args.data_dir, filename=args.train_file)
 
         features, dataset = squad_convert_examples_to_features( 
             examples=examples,
@@ -384,7 +390,14 @@ def main():
 
     ## Other parameters
     parser.add_argument("--data_dir", default=None, type=str,
-                        help="The input data dir. Should contain the .json files for the task. If not specified, will run with tensorflow_datasets.")
+                        help="The input data dir. Should contain the .json files for the task." +
+                             "If no data dir or train/predict files are specified, will run with tensorflow_datasets.")
+    parser.add_argument("--train_file", default=None, type=str,
+                        help="The input training file. If a data dir is specified, will look for the file there" +
+                             "If no data dir or train/predict files are specified, will run with tensorflow_datasets.")
+    parser.add_argument("--predict_file", default=None, type=str,
+                        help="The input evaluation file. If a data dir is specified, will look for the file there" +
+                             "If no data dir or train/predict files are specified, will run with tensorflow_datasets.")
     parser.add_argument("--config_name", default="", type=str,
                         help="Pretrained config name or path if not the same as model_name")
     parser.add_argument("--tokenizer_name", default="", type=str,
@@ -469,11 +482,6 @@ def main():
     parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
     args = parser.parse_args()
 
-    args.predict_file = os.path.join(args.output_dir, 'predictions_{}_{}.txt'.format(
-        list(filter(None, args.model_name_or_path.split('/'))).pop(),
-        str(args.max_seq_length))
-    )
-
     if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
         raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
 
@@ -571,10 +579,16 @@ def main():
     # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
     results = {}
     if args.do_eval and args.local_rank in [-1, 0]:
-        checkpoints = [args.output_dir]
-        if args.eval_all_checkpoints:
-            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
-            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce model loading logs
+
+        if args.do_train:
+            logger.info("Loading checkpoints saved during training for evaluation")
+            checkpoints = [args.output_dir]
+            if args.eval_all_checkpoints:
+                checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
+                logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce model loading logs
+        else:
+            logger.info("Loading checkpoint %s for evaluation", args.model_name_or_path)
+            checkpoints = [args.model_name_or_path]
 
         logger.info("Evaluate the following checkpoints: %s", checkpoints)
 
diff --git a/examples/summarization/README.md b/examples/summarization/README.md
index 96825cfa46..b98581e8e5 100644
--- a/examples/summarization/README.md
+++ b/examples/summarization/README.md
@@ -29,7 +29,7 @@ And move all the stories to the same folder. We will refer as `$DATA_PATH` the p
 python run_summarization.py \
     --documents_dir $DATA_PATH \
     --summaries_output_dir $SUMMARIES_PATH \ # optional
-    --to_cpu false \
+    --no_cuda false \
     --batch_size 4 \
     --min_length 50 \
     --max_length 200 \
@@ -39,7 +39,7 @@ python run_summarization.py \
     --compute_rouge true
 ```
 
-The scripts executes on GPU if one is available and if `to_cpu` is not set to `true`. Inference on multiple GPUs is not suported yet. The ROUGE scores will be displayed in the console at the end of evaluation and written in a `rouge_scores.txt` file. The script takes 30 hours to compute with a single Tesla V100 GPU and a batch size of 10 (300,000 texts to summarize).
+The scripts executes on GPU if one is available and if `no_cuda` is not set to `true`. Inference on multiple GPUs is not suported yet. The ROUGE scores will be displayed in the console at the end of evaluation and written in a `rouge_scores.txt` file. The script takes 30 hours to compute with a single Tesla V100 GPU and a batch size of 10 (300,000 texts to summarize).
 
 ## Summarize any text
 
@@ -49,7 +49,7 @@ Put the documents that you would like to summarize in a folder (the path to whic
 python run_summarization.py \
     --documents_dir $DATA_PATH \
     --summaries_output_dir $SUMMARIES_PATH \ # optional
-    --to_cpu false \
+    --no_cuda false \
     --batch_size 4 \
     --min_length 50 \
     --max_length 200 \
diff --git a/examples/summarization/configuration_bertabs.py b/examples/summarization/configuration_bertabs.py
index 5bcb65b423..b862d58d2b 100644
--- a/examples/summarization/configuration_bertabs.py
+++ b/examples/summarization/configuration_bertabs.py
@@ -33,6 +33,8 @@ class BertAbsConfig(PretrainedConfig):
     r""" Class to store the configuration of the BertAbs model.
 
     Arguments:
+        vocab_size: int
+            Number of tokens in the vocabulary.
         max_pos: int
             The maximum sequence length that this model will be used with.
         enc_layer: int
@@ -65,7 +67,7 @@ class BertAbsConfig(PretrainedConfig):
 
     def __init__(
         self,
-        vocab_size_or_config_json_file=30522,
+        vocab_size=30522,
         max_pos=512,
         enc_layers=6,
         enc_hidden_size=512,
@@ -81,39 +83,17 @@ class BertAbsConfig(PretrainedConfig):
     ):
         super(BertAbsConfig, self).__init__(**kwargs)
 
-        if self._input_is_path_to_json(vocab_size_or_config_json_file):
-            path_to_json = vocab_size_or_config_json_file
-            with open(path_to_json, "r", encoding="utf-8") as reader:
-                json_config = json.loads(reader.read())
-            for key, value in json_config.items():
-                self.__dict__[key] = value
-        elif isinstance(vocab_size_or_config_json_file, int):
-            self.vocab_size = vocab_size_or_config_json_file
-            self.max_pos = max_pos
+        self.vocab_size = vocab_size
+        self.max_pos = max_pos
 
-            self.enc_layers = enc_layers
-            self.enc_hidden_size = enc_hidden_size
-            self.enc_heads = enc_heads
-            self.enc_ff_size = enc_ff_size
-            self.enc_dropout = enc_dropout
+        self.enc_layers = enc_layers
+        self.enc_hidden_size = enc_hidden_size
+        self.enc_heads = enc_heads
+        self.enc_ff_size = enc_ff_size
+        self.enc_dropout = enc_dropout
 
-            self.dec_layers = dec_layers
-            self.dec_hidden_size = dec_hidden_size
-            self.dec_heads = dec_heads
-            self.dec_ff_size = dec_ff_size
-            self.dec_dropout = dec_dropout
-        else:
-            raise ValueError(
-                "First argument must be either a vocabulary size (int)"
-                "or the path to a pretrained model config file (str)"
-            )
-
-    def _input_is_path_to_json(self, first_argument):
-        """ Checks whether the first argument passed to config
-        is the path to a JSON file that contains the config.
-        """
-        is_python_2 = sys.version_info[0] == 2
-        if is_python_2:
-            return isinstance(first_argument, unicode)
-        else:
-            return isinstance(first_argument, str)
+        self.dec_layers = dec_layers
+        self.dec_hidden_size = dec_hidden_size
+        self.dec_heads = dec_heads
+        self.dec_ff_size = dec_ff_size
+        self.dec_dropout = dec_dropout
diff --git a/requirements.txt b/requirements.txt
index 9c43abc6d7..32edee0712 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,7 +5,7 @@ boto3
 # Used for downloading models over HTTP
 requests
 # For OpenAI GPT
-regex
+regex != 2019.12.17
 # For XLNet
 sentencepiece
 # For XLM
diff --git a/setup.py b/setup.py
index c4af32df83..fe2e1526bf 100644
--- a/setup.py
+++ b/setup.py
@@ -38,13 +38,15 @@ from setuptools import find_packages, setup
 
 
 extras = {
-    'serving': ['uvicorn', 'fastapi']
+    'serving': ['pydantic', 'uvicorn', 'fastapi'],
+    'serving-tf': ['pydantic', 'uvicorn', 'fastapi', 'tensorflow'],
+    'serving-torch': ['pydantic', 'uvicorn', 'fastapi', 'torch']
 }
 extras['all'] = [package for package in extras.values()]
 
 setup(
     name="transformers",
-    version="2.2.1",
+    version="2.3.0",
     author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Google AI Language Team Authors, Open AI team Authors, Facebook AI Authors, Carnegie Mellon University Authors",
     author_email="thomas@huggingface.co",
     description="State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch",
@@ -57,16 +59,12 @@ setup(
                                     "tests.*", "tests"]),
     install_requires=['numpy',
                       'boto3',
+                      'filelock',
                       'requests',
                       'tqdm',
-                      'regex',
+                      'regex != 2019.12.17',
                       'sentencepiece',
                       'sacremoses'],
-    entry_points={
-      'console_scripts': [
-        "transformers=transformers.__main__:main",
-      ]
-    },
     extras_require=extras,
     scripts=[
         'transformers-cli'
diff --git a/templates/adding_a_new_model/configuration_xxx.py b/templates/adding_a_new_model/configuration_xxx.py
index b1614e71af..12d69799a9 100644
--- a/templates/adding_a_new_model/configuration_xxx.py
+++ b/templates/adding_a_new_model/configuration_xxx.py
@@ -39,7 +39,7 @@ class XxxConfig(PretrainedConfig):
 
 
         Arguments:
-            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `XxxModel`.
+            vocab_size: Vocabulary size of `inputs_ids` in `XxxModel`.
             hidden_size: Size of the encoder layers and the pooler layer.
             num_hidden_layers: Number of hidden layers in the Transformer encoder.
             num_attention_heads: Number of attention heads for each attention layer in
@@ -64,7 +64,7 @@ class XxxConfig(PretrainedConfig):
     pretrained_config_archive_map = XXX_PRETRAINED_CONFIG_ARCHIVE_MAP
 
     def __init__(self,
-                 vocab_size_or_config_json_file=50257,
+                 vocab_size=50257,
                  n_positions=1024,
                  n_ctx=1024,
                  n_embd=768,
@@ -75,8 +75,6 @@ class XxxConfig(PretrainedConfig):
                  attn_pdrop=0.1,
                  layer_norm_epsilon=1e-5,
                  initializer_range=0.02,
-
-                 num_labels=1,
                  summary_type='cls_index',
                  summary_use_proj=True,
                  summary_activation=None,
@@ -84,7 +82,7 @@ class XxxConfig(PretrainedConfig):
                  summary_first_dropout=0.1,
                  **kwargs):
         super(XxxConfig, self).__init__(**kwargs)
-        self.vocab_size = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, six.string_types) else -1
+        self.vocab_size = vocab_size
         self.n_ctx = n_ctx
         self.n_positions = n_positions
         self.n_embd = n_embd
@@ -95,23 +93,11 @@ class XxxConfig(PretrainedConfig):
         self.attn_pdrop = attn_pdrop
         self.layer_norm_epsilon = layer_norm_epsilon
         self.initializer_range = initializer_range
-
-        self.num_labels = num_labels
         self.summary_type = summary_type
         self.summary_use_proj = summary_use_proj
         self.summary_activation = summary_activation
         self.summary_first_dropout = summary_first_dropout
         self.summary_proj_to_labels = summary_proj_to_labels
-        if isinstance(vocab_size_or_config_json_file, six.string_types):
-            with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
-                json_config = json.loads(reader.read())
-            for key, value in json_config.items():
-                self.__dict__[key] = value
-        elif not isinstance(vocab_size_or_config_json_file, int):
-            raise ValueError(
-                "First argument must be either a vocabulary size (int)"
-                "or the path to a pretrained model config file (str)"
-            )
 
     @property
     def max_position_embeddings(self):
diff --git a/templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py b/templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py
index d50d129cba..9d389deaad 100755
--- a/templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py
+++ b/templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py
@@ -26,9 +26,9 @@ from transformers import XxxConfig, XxxForPreTraining, load_tf_weights_in_xxx
 import logging
 logging.basicConfig(level=logging.INFO)
 
-def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, xxx_config_file, pytorch_dump_path):
+def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path):
     # Initialise PyTorch model
-    config = XxxConfig.from_json_file(xxx_config_file)
+    config = XxxConfig.from_json_file(config_file)
     print("Building PyTorch model from configuration: {}".format(str(config)))
     model = XxxForPreTraining(config)
 
@@ -48,11 +48,11 @@ if __name__ == "__main__":
                         type = str,
                         required = True,
                         help = "Path to the TensorFlow checkpoint path.")
-    parser.add_argument("--xxx_config_file",
+    parser.add_argument("--config_file",
                         default = None,
                         type = str,
                         required = True,
-                        help = "The config json file corresponding to the pre-trained XXX model. \n"
+                        help = "The config json file corresponding to the pre-trained model. \n"
                             "This specifies the model architecture.")
     parser.add_argument("--pytorch_dump_path",
                         default = None,
@@ -61,5 +61,5 @@ if __name__ == "__main__":
                         help = "Path to the output PyTorch model.")
     args = parser.parse_args()
     convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path,
-                                     args.xxx_config_file,
+                                     args.config_file,
                                      args.pytorch_dump_path)
diff --git a/templates/adding_a_new_model/modeling_tf_xxx.py b/templates/adding_a_new_model/modeling_tf_xxx.py
index 59f798bdbf..1783620998 100644
--- a/templates/adding_a_new_model/modeling_tf_xxx.py
+++ b/templates/adding_a_new_model/modeling_tf_xxx.py
@@ -26,6 +26,8 @@ import logging
 import math
 import os
 import sys
+import copy
+import itertools
 from io import open
 
 import numpy as np
diff --git a/templates/adding_a_new_model/modeling_xxx.py b/templates/adding_a_new_model/modeling_xxx.py
index 94c4b0db9a..c8be58e288 100644
--- a/templates/adding_a_new_model/modeling_xxx.py
+++ b/templates/adding_a_new_model/modeling_xxx.py
@@ -25,6 +25,8 @@ import logging
 import math
 import os
 import sys
+import copy
+import itertools
 from io import open
 
 import torch
diff --git a/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py b/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py
index d7e576bf8b..6eba932a8e 100644
--- a/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py
+++ b/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py
@@ -17,12 +17,11 @@ from __future__ import division
 from __future__ import print_function
 
 import unittest
-import shutil
 import sys
 
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
-from .utils import require_tf, slow
+from .utils import CACHE_DIR, require_tf, slow
 
 from transformers import XxxConfig, is_tf_available
 
@@ -111,7 +110,7 @@ class TFXxxModelTest(TFCommonTestCases.TFCommonModelTester):
                 choice_labels = ids_tensor([self.batch_size], self.num_choices)
 
             config = XxxConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 hidden_size=self.hidden_size,
                 num_hidden_layers=self.num_hidden_layers,
                 num_attention_heads=self.num_attention_heads,
@@ -245,10 +244,8 @@ class TFXxxModelTest(TFCommonTestCases.TFCommonModelTester):
 
     @slow
     def test_model_from_pretrained(self):
-        cache_dir = "/tmp/transformers_test/"
         for model_name in ['xxx-base-uncased']:
-            model = TFXxxModel.from_pretrained(model_name, cache_dir=cache_dir)
-            shutil.rmtree(cache_dir)
+            model = TFXxxModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
             self.assertIsNotNone(model)
 
 if __name__ == "__main__":
diff --git a/templates/adding_a_new_model/tests/modeling_xxx_test.py b/templates/adding_a_new_model/tests/modeling_xxx_test.py
index bfc70921cd..5e22392d00 100644
--- a/templates/adding_a_new_model/tests/modeling_xxx_test.py
+++ b/templates/adding_a_new_model/tests/modeling_xxx_test.py
@@ -17,13 +17,12 @@ from __future__ import division
 from __future__ import print_function
 
 import unittest
-import shutil
 
 from transformers import is_torch_available
 
 from .modeling_common_test import (CommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
-from .utils import require_torch, slow, torch_device
+from .utils import CACHE_DIR, require_torch, slow, torch_device
 
 if is_torch_available():
     from transformers import (XxxConfig, XxxModel, XxxForMaskedLM,
@@ -109,7 +108,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
                 choice_labels = ids_tensor([self.batch_size], self.num_choices)
 
             config = XxxConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 hidden_size=self.hidden_size,
                 num_hidden_layers=self.num_hidden_layers,
                 num_attention_heads=self.num_attention_heads,
@@ -249,10 +248,8 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
 
     @slow
     def test_model_from_pretrained(self):
-        cache_dir = "/tmp/transformers_test/"
         for model_name in list(XXX_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = XxxModel.from_pretrained(model_name, cache_dir=cache_dir)
-            shutil.rmtree(cache_dir)
+            model = XxxModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
             self.assertIsNotNone(model)
 
 if __name__ == "__main__":
diff --git a/templates/adding_a_new_model/tokenization_xxx.py b/templates/adding_a_new_model/tokenization_xxx.py
index 3d6b4ad9df..7a10a41e5a 100644
--- a/templates/adding_a_new_model/tokenization_xxx.py
+++ b/templates/adding_a_new_model/tokenization_xxx.py
@@ -85,7 +85,7 @@ class XxxTokenizer(PreTrainedTokenizer):
 
     Args:
         vocab_file: Path to a one-wordpiece-per-line vocabulary file
-        do_lower_case: Whether to lower case the input. Only has an effect when do_wordpiece_only=False
+        do_lower_case: Whether to lower case the input. Only has an effect when do_basic_tokenize=True
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
diff --git a/transformers-cli b/transformers-cli
old mode 100644
new mode 100755
index ef00d15aa3..0a980a3574
--- a/transformers-cli
+++ b/transformers-cli
@@ -1,14 +1,21 @@
 #!/usr/bin/env python
 from argparse import ArgumentParser
 
+from transformers.commands.download import DownloadCommand
+from transformers.commands.run import RunCommand
 from transformers.commands.user import UserCommands
-
+from transformers.commands.convert import ConvertCommand
+from transformers.commands.serving import ServeCommand
 
 if __name__ == '__main__':
-    parser = ArgumentParser(description='Transformers CLI tool', usage='transformers-cli <command> [<args>]')
+    parser = ArgumentParser('Transformers CLI tool', usage='transformers-cli <command> [<args>]')
     commands_parser = parser.add_subparsers(help='transformers-cli command helpers')
 
     # Register commands
+    ConvertCommand.register_subcommand(commands_parser)
+    DownloadCommand.register_subcommand(commands_parser)
+    RunCommand.register_subcommand(commands_parser)
+    ServeCommand.register_subcommand(commands_parser)
     UserCommands.register_subcommand(commands_parser)
 
     # Let's go
diff --git a/transformers/__init__.py b/transformers/__init__.py
old mode 100644
new mode 100755
index f9a28add5f..c0c0901df4
--- a/transformers/__init__.py
+++ b/transformers/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "2.2.1"
+__version__ = "2.3.0"
 
 # Work around to update TensorFlow's absl.logging threshold which alters the
 # default Python logging output behavior when present.
@@ -19,11 +19,12 @@ logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
 # Files and general utilities
 from .file_utils import (TRANSFORMERS_CACHE, PYTORCH_TRANSFORMERS_CACHE, PYTORCH_PRETRAINED_BERT_CACHE,
                          cached_path, add_start_docstrings, add_end_docstrings,
-                         WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, CONFIG_NAME,
+                         WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, CONFIG_NAME, MODEL_CARD_NAME,
                          is_tf_available, is_torch_available)
 
 from .data import (is_sklearn_available,
                    InputExample, InputFeatures, DataProcessor,
+                   SingleSentenceClassificationProcessor,
                    glue_output_modes, glue_convert_examples_to_features,
                    glue_processors, glue_tasks_num_labels,
                    xnli_output_modes, xnli_processors, xnli_tasks_num_labels,
@@ -33,10 +34,14 @@ from .data import (is_sklearn_available,
 if is_sklearn_available():
     from .data import glue_compute_metrics, xnli_compute_metrics
 
+# Model Cards
+from .modelcard import ModelCard
+
 # Tokenizers
 from .tokenization_utils import (PreTrainedTokenizer)
 from .tokenization_auto import AutoTokenizer
 from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer
+from .tokenization_bert_japanese import BertJapaneseTokenizer, MecabTokenizer, CharacterTokenizer
 from .tokenization_openai import OpenAIGPTTokenizer
 from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus)
 from .tokenization_gpt2 import GPT2Tokenizer
@@ -47,28 +52,31 @@ from .tokenization_roberta import RobertaTokenizer
 from .tokenization_distilbert import DistilBertTokenizer
 from .tokenization_albert import AlbertTokenizer
 from .tokenization_camembert import CamembertTokenizer
+from .tokenization_t5 import T5Tokenizer
+from .tokenization_xlm_roberta import XLMRobertaTokenizer
 
 # Configurations
 from .configuration_utils import PretrainedConfig
-from .configuration_auto import AutoConfig
+from .configuration_auto import AutoConfig, ALL_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_bert import BertConfig, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_openai import OpenAIGPTConfig, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_transfo_xl import TransfoXLConfig, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_gpt2 import GPT2Config, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_ctrl import CTRLConfig, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_xlnet import XLNetConfig, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP
-from .configuration_ctrl import CTRLConfig, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_xlm import XLMConfig, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_roberta import RobertaConfig, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_albert import AlbertConfig, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_camembert import CamembertConfig, CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_t5 import T5Config, T5_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_xlm_roberta import XLMRobertaConfig, XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
 
 # Modeling
 if is_torch_available():
     from .modeling_utils import (PreTrainedModel, prune_layer, Conv1D)
     from .modeling_auto import (AutoModel, AutoModelForSequenceClassification, AutoModelForQuestionAnswering,
-                                AutoModelWithLMHead)
+                                AutoModelWithLMHead, AutoModelForTokenClassification, ALL_PRETRAINED_MODEL_ARCHIVE_MAP)
 
     from .modeling_bert import (BertPreTrainedModel, BertModel, BertForPreTraining,
                                 BertForMaskedLM, BertForNextSentencePrediction,
@@ -76,8 +84,8 @@ if is_torch_available():
                                 BertForTokenClassification, BertForQuestionAnswering,
                                 load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP)
     from .modeling_openai import (OpenAIGPTPreTrainedModel, OpenAIGPTModel,
-                                OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel,
-                                load_tf_weights_in_openai_gpt, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP)
+                                  OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel,
+                                  load_tf_weights_in_openai_gpt, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP)
     from .modeling_transfo_xl import (TransfoXLPreTrainedModel, TransfoXLModel, TransfoXLLMHeadModel,
                                     AdaptiveEmbedding,
                                     load_tf_weights_in_transfo_xl, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP)
@@ -109,11 +117,17 @@ if is_torch_available():
                                 CamembertForTokenClassification,
                                 CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
     from .modeling_encoder_decoder import PreTrainedEncoderDecoder, Model2Model
+    from .modeling_t5 import (T5PreTrainedModel, T5Model, T5WithLMHeadModel,
+                              load_tf_weights_in_t5,
+                              T5_PRETRAINED_MODEL_ARCHIVE_MAP)
 
     from .modeling_albert import (AlbertPreTrainedModel, AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification,
                                 AlbertForQuestionAnswering,
                                 load_tf_weights_in_albert, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
 
+    from .modeling_xlm_roberta import (XLMRobertaForMaskedLM, XLMRobertaModel, XLMRobertaForMultipleChoice,
+                                       XLMRobertaForSequenceClassification, XLMRobertaForTokenClassification)
+
     # Optimization
     from .optimization import (AdamW, get_constant_schedule, get_constant_schedule_with_warmup, get_cosine_schedule_with_warmup,
                                get_cosine_with_hard_restarts_schedule_with_warmup, get_linear_schedule_with_warmup)
@@ -123,7 +137,7 @@ if is_torch_available():
 if is_tf_available():
     from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary, shape_list
     from .modeling_tf_auto import (TFAutoModel, TFAutoModelForSequenceClassification, TFAutoModelForQuestionAnswering,
-                                   TFAutoModelWithLMHead)
+                                   TFAutoModelWithLMHead, TFAutoModelForTokenClassification, TF_ALL_PRETRAINED_MODEL_ARCHIVE_MAP)
 
     from .modeling_tf_bert import (TFBertPreTrainedModel, TFBertMainLayer, TFBertEmbeddings,
                                    TFBertModel, TFBertForPreTraining,
@@ -177,6 +191,10 @@ if is_tf_available():
     from .modeling_tf_albert import (TFAlbertPreTrainedModel, TFAlbertModel, TFAlbertForMaskedLM,
                                      TFAlbertForSequenceClassification,
                                     TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
+
+    from .modeling_tf_t5 import (TFT5PreTrainedModel, TFT5Model, TFT5WithLMHeadModel,
+                                 TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP)
+
     # Optimization
     from .optimization_tf import (WarmUp, create_optimizer, AdamWeightDecay, GradientAccumulator)
 
@@ -189,6 +207,10 @@ from .modeling_tf_pytorch_utils import (convert_tf_weight_name_to_pt_weight_name
                                         load_tf2_weights_in_pytorch_model,
                                         load_tf2_model_in_pytorch_model)
 
+# Pipelines
+from .pipelines import pipeline, PipelineDataFormat, CsvPipelineDataFormat, JsonPipelineDataFormat, PipedPipelineDataFormat, \
+    Pipeline, FeatureExtractionPipeline, QuestionAnsweringPipeline, NerPipeline, TextClassificationPipeline
+
 if not is_tf_available() and not is_torch_available():
     logger.warning("Neither PyTorch nor TensorFlow >= 2.0 have been found."
                    "Models won't be available and only tokenizers, configuration"
diff --git a/transformers/__main__.py b/transformers/__main__.py
index 31dbd24908..dd259b04ee 100644
--- a/transformers/__main__.py
+++ b/transformers/__main__.py
@@ -1,129 +1,37 @@
 # coding: utf8
+
 def main():
     import sys
-    if (len(sys.argv) < 4 or len(sys.argv) > 6) or sys.argv[1] not in ["bert", "gpt", "transfo_xl", "gpt2", "xlnet", "xlm"]:
+    if len(sys.argv) < 2 or sys.argv[1] not in ["convert", "train", "predict", "serve"]:
         print(
-        "This command line utility let you convert original (author released) model checkpoint to pytorch.\n"
-        "It should be used as one of: \n"
-        ">> transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT, \n"
-        ">> transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG], \n"
-        ">> transformers transfo_xl TF_CHECKPOINT_OR_DATASET PYTORCH_DUMP_OUTPUT [TF_CONFIG] or \n"
-        ">> transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [GPT2_CONFIG] or \n"
-        ">> transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME] or \n"
-        ">> transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT")
-    else:
-        if sys.argv[1] == "bert":
-            try:
-                from .convert_bert_original_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch
-            except ImportError:
-                print("transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
-                    "In that case, it requires TensorFlow to be installed. Please see "
-                    "https://www.tensorflow.org/install/ for installation instructions.")
-                raise
+        "First argument to `transformers` command line interface should be one of: \n"
+        ">> convert serve train predict")
+    if sys.argv[1] == "convert":
+        from transformers.commands import convert
+        convert(sys.argv)
+    elif sys.argv[1] == "train":
+        from transformers.commands import train
+        train(sys.argv)
+    elif sys.argv[1] == "serve":
+        pass
+        # from argparse import ArgumentParser
+        # from transformers.commands.serving import ServeCommand
+        # parser = ArgumentParser('Transformers CLI tool', usage='transformers serve <command> [<args>]')
+        # commands_parser = parser.add_subparsers(help='transformers-cli command helpers')
 
-            if len(sys.argv) != 5:
-                # pylint: disable=line-too-long
-                print("Should be used as `transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`")
-            else:
-                PYTORCH_DUMP_OUTPUT = sys.argv.pop()
-                TF_CONFIG = sys.argv.pop()
-                TF_CHECKPOINT = sys.argv.pop()
-                convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT)
-        elif sys.argv[1] == "gpt":
-            from .convert_openai_original_tf_checkpoint_to_pytorch import convert_openai_checkpoint_to_pytorch
-            if len(sys.argv) < 4 or len(sys.argv) > 5:
-                # pylint: disable=line-too-long
-                print("Should be used as `transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`")
-            else:
-                OPENAI_GPT_CHECKPOINT_FOLDER_PATH = sys.argv[2]
-                PYTORCH_DUMP_OUTPUT = sys.argv[3]
-                if len(sys.argv) == 5:
-                    OPENAI_GPT_CONFIG = sys.argv[4]
-                else:
-                    OPENAI_GPT_CONFIG = ""
-                convert_openai_checkpoint_to_pytorch(OPENAI_GPT_CHECKPOINT_FOLDER_PATH,
-                                                    OPENAI_GPT_CONFIG,
-                                                    PYTORCH_DUMP_OUTPUT)
-        elif sys.argv[1] == "transfo_xl":
-            try:
-                from .convert_transfo_xl_original_tf_checkpoint_to_pytorch import convert_transfo_xl_checkpoint_to_pytorch
-            except ImportError:
-                print("transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
-                    "In that case, it requires TensorFlow to be installed. Please see "
-                    "https://www.tensorflow.org/install/ for installation instructions.")
-                raise
-            if len(sys.argv) < 4 or len(sys.argv) > 5:
-                # pylint: disable=line-too-long
-                print("Should be used as `transformers transfo_xl TF_CHECKPOINT/TF_DATASET_FILE PYTORCH_DUMP_OUTPUT [TF_CONFIG]`")
-            else:
-                if 'ckpt' in sys.argv[2].lower():
-                    TF_CHECKPOINT = sys.argv[2]
-                    TF_DATASET_FILE = ""
-                else:
-                    TF_DATASET_FILE = sys.argv[2]
-                    TF_CHECKPOINT = ""
-                PYTORCH_DUMP_OUTPUT = sys.argv[3]
-                if len(sys.argv) == 5:
-                    TF_CONFIG = sys.argv[4]
-                else:
-                    TF_CONFIG = ""
-                convert_transfo_xl_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT, TF_DATASET_FILE)
-        elif sys.argv[1] == "gpt2":
-            try:
-                from .convert_gpt2_original_tf_checkpoint_to_pytorch import convert_gpt2_checkpoint_to_pytorch
-            except ImportError:
-                print("transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
-                    "In that case, it requires TensorFlow to be installed. Please see "
-                    "https://www.tensorflow.org/install/ for installation instructions.")
-                raise
 
-            if len(sys.argv) < 4 or len(sys.argv) > 5:
-                # pylint: disable=line-too-long
-                print("Should be used as `transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [TF_CONFIG]`")
-            else:
-                TF_CHECKPOINT = sys.argv[2]
-                PYTORCH_DUMP_OUTPUT = sys.argv[3]
-                if len(sys.argv) == 5:
-                    TF_CONFIG = sys.argv[4]
-                else:
-                    TF_CONFIG = ""
-                convert_gpt2_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT)
-        elif sys.argv[1] == "xlnet":
-            try:
-                from .convert_xlnet_original_tf_checkpoint_to_pytorch import convert_xlnet_checkpoint_to_pytorch
-            except ImportError:
-                print("transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
-                    "In that case, it requires TensorFlow to be installed. Please see "
-                    "https://www.tensorflow.org/install/ for installation instructions.")
-                raise
+        # # Register commands
+        # ServeCommand.register_subcommand(commands_parser)
 
-            if len(sys.argv) < 5 or len(sys.argv) > 6:
-                # pylint: disable=line-too-long
-                print("Should be used as `transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME]`")
-            else:
-                TF_CHECKPOINT = sys.argv[2]
-                TF_CONFIG = sys.argv[3]
-                PYTORCH_DUMP_OUTPUT = sys.argv[4]
-                if len(sys.argv) == 6:
-                    FINETUNING_TASK = sys.argv[5]
-                else:
-                    FINETUNING_TASK = None
+        # # Let's go
+        # args = parser.parse_args()
 
-                convert_xlnet_checkpoint_to_pytorch(TF_CHECKPOINT,
-                                                    TF_CONFIG,
-                                                    PYTORCH_DUMP_OUTPUT,
-                                                    FINETUNING_TASK)
-        elif sys.argv[1] == "xlm":
-            from .convert_xlm_original_pytorch_checkpoint_to_pytorch import convert_xlm_checkpoint_to_pytorch
-
-            if len(sys.argv) != 4:
-                # pylint: disable=line-too-long
-                print("Should be used as `transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT`")
-            else:
-                XLM_CHECKPOINT_PATH = sys.argv[2]
-                PYTORCH_DUMP_OUTPUT = sys.argv[3]
-
-                convert_xlm_checkpoint_to_pytorch(XLM_CHECKPOINT_PATH, PYTORCH_DUMP_OUTPUT)
+        # if not hasattr(args, 'func'):
+        #     parser.print_help()
+        #     exit(1)
+        # # Run
+        # service = args.func(args)
+        # service.run()
 
 if __name__ == '__main__':
     main()
diff --git a/transformers/commands/convert.py b/transformers/commands/convert.py
new file mode 100644
index 0000000000..55dbf53734
--- /dev/null
+++ b/transformers/commands/convert.py
@@ -0,0 +1,115 @@
+from argparse import ArgumentParser, Namespace
+
+from logging import getLogger
+
+from transformers import AutoModel, AutoTokenizer
+from transformers.commands import BaseTransformersCLICommand
+
+
+def convert_command_factory(args: Namespace):
+    """
+    Factory function used to convert a model TF 1.0 checkpoint in a PyTorch checkpoint.
+    :return: ServeCommand
+    """
+    return ConvertCommand(args.model_type, args.tf_checkpoint, args.pytorch_dump_output,
+                          args.config, args.finetuning_task_name)
+
+
+class ConvertCommand(BaseTransformersCLICommand):
+
+    @staticmethod
+    def register_subcommand(parser: ArgumentParser):
+        """
+        Register this command to argparse so it's available for the transformer-cli
+        :param parser: Root parser to register command-specific arguments
+        :return:
+        """
+        train_parser = parser.add_parser('convert', help="CLI tool to run convert model from original "
+                                                         "author checkpoints to Transformesr PyTorch checkpoints.")
+        train_parser.add_argument('--model_type', type=str, required=True,
+                                  help='Model\'s type.')
+        train_parser.add_argument('--tf_checkpoint', type=str, required=True,
+                                  help='TensorFlow checkpoint path or folder.')
+        train_parser.add_argument('--pytorch_dump_output', type=str, required=True,
+                                  help='Path to the PyTorch savd model output.')
+        train_parser.add_argument('--config', type=str, default="",
+                                  help='Configuration file path or folder.')
+        train_parser.add_argument('--finetuning_task_name', type=str, default=None,
+                                  help='Optional fine-tuning task name if the TF model was a finetuned model.')
+        train_parser.set_defaults(func=convert_command_factory)
+
+    def __init__(self, model_type: str, tf_checkpoint: str, pytorch_dump_output: str,
+                 config: str, finetuning_task_name: str, *args):
+        self._logger = getLogger('transformers-cli/converting')
+
+        self._logger.info('Loading model {}'.format(model_type))
+        self._model_type = model_type
+        self._tf_checkpoint = tf_checkpoint
+        self._pytorch_dump_output = pytorch_dump_output
+        self._config = config
+        self._finetuning_task_name = finetuning_task_name
+
+    def run(self):
+        if self._model_type == "bert":
+            try:
+                from transformers.convert_bert_original_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch
+            except ImportError:
+                msg = "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " \
+                    "In that case, it requires TensorFlow to be installed. Please see " \
+                    "https://www.tensorflow.org/install/ for installation instructions."
+                raise ImportError(msg)
+
+            convert_tf_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output)
+        elif self._model_type == "gpt":
+            from transformers.convert_openai_original_tf_checkpoint_to_pytorch import convert_openai_checkpoint_to_pytorch
+            convert_openai_checkpoint_to_pytorch(self._tf_checkpoint,
+                                                    self._config,
+                                                    self._pytorch_dump_output)
+        elif self._model_type == "transfo_xl":
+            try:
+                from transformers.convert_transfo_xl_original_tf_checkpoint_to_pytorch import convert_transfo_xl_checkpoint_to_pytorch
+            except ImportError:
+                msg = "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " \
+                    "In that case, it requires TensorFlow to be installed. Please see " \
+                    "https://www.tensorflow.org/install/ for installation instructions."
+                raise ImportError(msg)
+
+            if 'ckpt' in self._tf_checkpoint.lower():
+                TF_CHECKPOINT = self._tf_checkpoint
+                TF_DATASET_FILE = ""
+            else:
+                TF_DATASET_FILE = self._tf_checkpoint
+                TF_CHECKPOINT = ""
+            convert_transfo_xl_checkpoint_to_pytorch(TF_CHECKPOINT,
+                                                        self._config,
+                                                        self._pytorch_dump_output,
+                                                        TF_DATASET_FILE)
+        elif self._model_type == "gpt2":
+            try:
+                from transformers.convert_gpt2_original_tf_checkpoint_to_pytorch import convert_gpt2_checkpoint_to_pytorch
+            except ImportError:
+                msg = "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " \
+                    "In that case, it requires TensorFlow to be installed. Please see " \
+                    "https://www.tensorflow.org/install/ for installation instructions."
+                raise ImportError(msg)
+
+            convert_gpt2_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output)
+        elif self._model_type == "xlnet":
+            try:
+                from transformers.convert_xlnet_original_tf_checkpoint_to_pytorch import convert_xlnet_checkpoint_to_pytorch
+            except ImportError:
+                msg = "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " \
+                    "In that case, it requires TensorFlow to be installed. Please see " \
+                    "https://www.tensorflow.org/install/ for installation instructions."
+                raise ImportError(msg)
+
+            convert_xlnet_checkpoint_to_pytorch(self._tf_checkpoint,
+                                                self._config,
+                                                self._pytorch_dump_output,
+                                                self._finetuning_task_name)
+        elif self._model_type == "xlm":
+            from transformers.convert_xlm_original_pytorch_checkpoint_to_pytorch import convert_xlm_checkpoint_to_pytorch
+
+            convert_xlm_checkpoint_to_pytorch(self._tf_checkpoint, self._pytorch_dump_output)
+        else:
+            raise ValueError("--model_type should be selected in the list [bert, gpt, gpt2, transfo_xl, xlnet, xlm]")
diff --git a/transformers/commands/download.py b/transformers/commands/download.py
new file mode 100644
index 0000000000..0938f135d2
--- /dev/null
+++ b/transformers/commands/download.py
@@ -0,0 +1,29 @@
+from argparse import ArgumentParser
+
+from transformers.commands import BaseTransformersCLICommand
+
+
+def download_command_factory(args):
+    return DownloadCommand(args.model, args.cache_dir, args.force)
+
+
+class DownloadCommand(BaseTransformersCLICommand):
+
+    @staticmethod
+    def register_subcommand(parser: ArgumentParser):
+        download_parser = parser.add_parser('download')
+        download_parser.add_argument('--cache-dir', type=str, default=None, help='Path to location to store the models')
+        download_parser.add_argument('--force',  action='store_true', help='Force the model to be download even if already in cache-dir')
+        download_parser.add_argument('model', type=str, help='Name of the model to download')
+        download_parser.set_defaults(func=download_command_factory)
+
+    def __init__(self, model: str, cache: str, force: bool):
+        self._model = model
+        self._cache = cache
+        self._force = force
+
+    def run(self):
+        from transformers import AutoModel, AutoTokenizer
+
+        AutoModel.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force)
+        AutoTokenizer.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force)
\ No newline at end of file
diff --git a/transformers/commands/run.py b/transformers/commands/run.py
new file mode 100644
index 0000000000..df03cee9d7
--- /dev/null
+++ b/transformers/commands/run.py
@@ -0,0 +1,79 @@
+import logging
+from argparse import ArgumentParser
+
+from transformers.commands import BaseTransformersCLICommand
+from transformers.pipelines import pipeline, Pipeline, PipelineDataFormat, SUPPORTED_TASKS
+
+
+logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
+
+
+def try_infer_format_from_ext(path: str):
+    if not path:
+        return 'pipe'
+
+    for ext in PipelineDataFormat.SUPPORTED_FORMATS:
+        if path.endswith(ext):
+            return ext
+
+    raise Exception(
+        'Unable to determine file format from file extension {}. '
+        'Please provide the format through --format {}'.format(path, PipelineDataFormat.SUPPORTED_FORMATS)
+    )
+
+
+def run_command_factory(args):
+    nlp = pipeline(task=args.task,
+                   model=args.model if args.model else None,
+                   config=args.config,
+                   tokenizer=args.tokenizer,
+                   device=args.device)
+    format = try_infer_format_from_ext(args.input) if args.format == 'infer' else args.format
+    reader = PipelineDataFormat.from_str(format=format,
+                                         output_path=args.output,
+                                         input_path=args.input,
+                                         column=args.column if args.column else nlp.default_input_names,
+                                         overwrite=args.overwrite)
+    return RunCommand(nlp, reader)
+
+
+class RunCommand(BaseTransformersCLICommand):
+
+    def __init__(self, nlp: Pipeline, reader: PipelineDataFormat):
+        self._nlp = nlp
+        self._reader = reader
+
+    @staticmethod
+    def register_subcommand(parser: ArgumentParser):
+        run_parser = parser.add_parser('run', help="Run a pipeline through the CLI")
+        run_parser.add_argument('--task', choices=SUPPORTED_TASKS.keys(), help='Task to run')
+        run_parser.add_argument('--input', type=str, help='Path to the file to use for inference')
+        run_parser.add_argument('--output', type=str, help='Path to the file that will be used post to write results.')
+        run_parser.add_argument('--model', type=str, help='Name or path to the model to instantiate.')
+        run_parser.add_argument('--config', type=str, help='Name or path to the model\'s config to instantiate.')
+        run_parser.add_argument('--tokenizer', type=str, help='Name of the tokenizer to use. (default: same as the model name)')
+        run_parser.add_argument('--column', type=str, help='Name of the column to use as input. (For multi columns input as QA use column1,columns2)')
+        run_parser.add_argument('--format', type=str, default='infer', choices=PipelineDataFormat.SUPPORTED_FORMATS, help='Input format to read from')
+        run_parser.add_argument('--device', type=int, default=-1, help='Indicate the device to run onto, -1 indicates CPU, >= 0 indicates GPU (default: -1)')
+        run_parser.add_argument('--overwrite', action='store_true', help='Allow overwriting the output file.')
+        run_parser.set_defaults(func=run_command_factory)
+
+    def run(self):
+        nlp, outputs = self._nlp, []
+
+        for entry in self._reader:
+            output = nlp(**entry) if self._reader.is_multi_columns else nlp(entry)
+            if isinstance(output, dict):
+                outputs.append(output)
+            else:
+                outputs += output
+
+        # Saving data
+        if self._nlp.binary_output:
+            binary_path = self._reader.save_binary(outputs)
+            logger.warning('Current pipeline requires output to be in binary format, saving at {}'.format(binary_path))
+        else:
+            self._reader.save(outputs)
+
+
+
diff --git a/transformers/commands/serving.py b/transformers/commands/serving.py
new file mode 100644
index 0000000000..4f41f797d1
--- /dev/null
+++ b/transformers/commands/serving.py
@@ -0,0 +1,158 @@
+from argparse import ArgumentParser, Namespace
+from typing import List, Optional, Union, Any
+
+import logging
+
+try:
+    from uvicorn import run
+    from fastapi import FastAPI, HTTPException, Body
+    from pydantic import BaseModel
+    _serve_dependancies_installed = True
+except (ImportError, AttributeError):
+    BaseModel = object
+    Body = lambda *x, **y: None
+    _serve_dependancies_installed = False
+
+from transformers import Pipeline
+from transformers.commands import BaseTransformersCLICommand
+from transformers.pipelines import SUPPORTED_TASKS, pipeline
+
+logger = logging.getLogger('transformers-cli/serving')
+
+def serve_command_factory(args: Namespace):
+    """
+    Factory function used to instantiate serving server from provided command line arguments.
+    :return: ServeCommand
+    """
+    nlp = pipeline(task=args.task,
+                   model=args.model if args.model else None,
+                   config=args.config,
+                   tokenizer=args.tokenizer,
+                   device=args.device)
+    return ServeCommand(nlp, args.host, args.port)
+
+
+class ServeModelInfoResult(BaseModel):
+    """
+    Expose model information
+    """
+    infos: dict
+
+
+class ServeTokenizeResult(BaseModel):
+    """
+    Tokenize result model
+    """
+    tokens: List[str]
+    tokens_ids: Optional[List[int]]
+
+
+class ServeDeTokenizeResult(BaseModel):
+    """
+    DeTokenize result model
+    """
+    text: str
+
+
+class ServeForwardResult(BaseModel):
+    """
+    Forward result model
+    """
+    output: Any
+
+
+class ServeCommand(BaseTransformersCLICommand):
+
+    @staticmethod
+    def register_subcommand(parser: ArgumentParser):
+        """
+        Register this command to argparse so it's available for the transformer-cli
+        :param parser: Root parser to register command-specific arguments
+        :return:
+        """
+        serve_parser = parser.add_parser('serve', help='CLI tool to run inference requests through REST and GraphQL endpoints.')
+        serve_parser.add_argument('--task', type=str, choices=SUPPORTED_TASKS.keys(), help='The task to run the pipeline on')
+        serve_parser.add_argument('--host', type=str, default='localhost', help='Interface the server will listen on.')
+        serve_parser.add_argument('--port', type=int, default=8888, help='Port the serving will listen to.')
+        serve_parser.add_argument('--model', type=str, help='Model\'s name or path to stored model.')
+        serve_parser.add_argument('--config', type=str, help='Model\'s config name or path to stored model.')
+        serve_parser.add_argument('--tokenizer', type=str, help='Tokenizer name to use.')
+        serve_parser.add_argument('--device', type=int, default=-1, help='Indicate the device to run onto, -1 indicates CPU, >= 0 indicates GPU (default: -1)')
+        serve_parser.set_defaults(func=serve_command_factory)
+
+    def __init__(self, pipeline: Pipeline, host: str, port: int):
+
+        self._pipeline = pipeline
+
+        self._host = host
+        self._port = port
+        if not _serve_dependancies_installed:
+            raise ImportError("Using serve command requires FastAPI and unicorn. "
+                                "Please install transformers with [serving]: pip install transformers[serving]." 
+                                "Or install FastAPI and unicorn separatly.")
+        else:
+            logger.info('Serving model over {}:{}'.format(host, port))
+            self._app = FastAPI()
+
+            # Register routes
+            self._app.add_api_route('/', self.model_info, response_model=ServeModelInfoResult, methods=['GET'])
+            self._app.add_api_route('/tokenize', self.tokenize, response_model=ServeTokenizeResult, methods=['POST'])
+            self._app.add_api_route('/detokenize', self.detokenize, response_model=ServeDeTokenizeResult, methods=['POST'])
+            self._app.add_api_route('/forward', self.forward, response_model=ServeForwardResult, methods=['POST'])
+
+    def run(self):
+        run(self._app, host=self._host, port=self._port)
+
+    def model_info(self):
+        return ServeModelInfoResult(infos=vars(self._pipeline.model.config))
+
+    def tokenize(self, text_input: str = Body(None, embed=True), return_ids: bool = Body(False, embed=True)):
+        """
+        Tokenize the provided input and eventually returns corresponding tokens id:
+        - **text_input**: String to tokenize
+        - **return_ids**: Boolean flags indicating if the tokens have to be converted to their integer mapping.
+        """
+        try:
+            tokens_txt = self._pipeline.tokenizer.tokenize(text_input)
+
+            if return_ids:
+                tokens_ids = self._pipeline.tokenizer.convert_tokens_to_ids(tokens_txt)
+                return ServeTokenizeResult(tokens=tokens_txt, tokens_ids=tokens_ids)
+            else:
+                return ServeTokenizeResult(tokens=tokens_txt)
+
+        except Exception as e:
+            raise HTTPException(status_code=500, detail={"model": '', "error": str(e)})
+
+    def detokenize(self, tokens_ids: List[int] = Body(None, embed=True),
+                   skip_special_tokens: bool = Body(False, embed=True),
+                   cleanup_tokenization_spaces: bool = Body(True, embed=True)):
+        """
+        Detokenize the provided tokens ids to readable text:
+        - **tokens_ids**: List of tokens ids
+        - **skip_special_tokens**: Flag indicating to not try to decode special tokens
+        - **cleanup_tokenization_spaces**: Flag indicating to remove all leading/trailing spaces and intermediate ones.
+        """
+        try:
+            decoded_str = self._pipeline.tokenizer.decode(tokens_ids, skip_special_tokens, cleanup_tokenization_spaces)
+            return ServeDeTokenizeResult(model='', text=decoded_str)
+        except Exception as e:
+            raise HTTPException(status_code=500, detail={"model": '', "error": str(e)})
+
+    def forward(self, inputs: Union[str, dict, List[str], List[int], List[dict]] = Body(None, embed=True)):
+        """
+        **inputs**:
+        **attention_mask**:
+        **tokens_type_ids**:
+        """
+
+        # Check we don't have empty string
+        if len(inputs) == 0:
+            return ServeForwardResult(output=[], attention=[])
+
+        try:
+            # Forward through the model
+            output = self._pipeline(inputs)
+            return ServeForwardResult(output=output)
+        except Exception as e:
+            raise HTTPException(500, {"error": str(e)})
diff --git a/transformers/commands/train.py b/transformers/commands/train.py
new file mode 100644
index 0000000000..7b26745881
--- /dev/null
+++ b/transformers/commands/train.py
@@ -0,0 +1,131 @@
+import os
+from argparse import ArgumentParser, Namespace
+from logging import getLogger
+
+from transformers.commands import BaseTransformersCLICommand
+from transformers import (is_tf_available, is_torch_available,
+                          TextClassificationPipeline,
+                          SingleSentenceClassificationProcessor as Processor)
+
+if not is_tf_available() and not is_torch_available():
+    raise ImportError("At least one of PyTorch or TensorFlow 2.0+ should be installed to use CLI training")
+
+# TF training parameters
+USE_XLA = False
+USE_AMP = False
+
+def train_command_factory(args: Namespace):
+    """
+    Factory function used to instantiate serving server from provided command line arguments.
+    :return: ServeCommand
+    """
+    return TrainCommand(args)
+
+
+class TrainCommand(BaseTransformersCLICommand):
+
+    @staticmethod
+    def register_subcommand(parser: ArgumentParser):
+        """
+        Register this command to argparse so it's available for the transformer-cli
+        :param parser: Root parser to register command-specific arguments
+        :return:
+        """
+        train_parser = parser.add_parser('train', help='CLI tool to train a model on a task.')
+
+        train_parser.add_argument('--train_data', type=str, required=True,
+                                  help="path to train (and optionally evaluation) dataset as a csv with "
+                                       "tab separated labels and sentences.")
+        train_parser.add_argument('--column_label', type=int, default=0,
+                                  help='Column of the dataset csv file with example labels.')
+        train_parser.add_argument('--column_text', type=int, default=1,
+                                  help='Column of the dataset csv file with example texts.')
+        train_parser.add_argument('--column_id', type=int, default=2,
+                                  help='Column of the dataset csv file with example ids.')
+        train_parser.add_argument('--skip_first_row', action='store_true',
+                                  help='Skip the first row of the csv file (headers).')
+
+        train_parser.add_argument('--validation_data', type=str, default='',
+                                  help='path to validation dataset.')
+        train_parser.add_argument('--validation_split', type=float, default=0.1,
+                                  help="if validation dataset is not provided, fraction of train dataset "
+                                       "to use as validation dataset.")
+
+        train_parser.add_argument('--output', type=str, default='./',
+                                  help='path to saved the trained model.')
+
+        train_parser.add_argument('--task', type=str, default='text_classification',
+                                  help='Task to train the model on.')
+        train_parser.add_argument('--model', type=str, default='bert-base-uncased',
+                                  help='Model\'s name or path to stored model.')
+        train_parser.add_argument('--train_batch_size', type=int, default=32,
+                                  help='Batch size for training.')
+        train_parser.add_argument('--valid_batch_size', type=int, default=64,
+                                  help='Batch size for validation.')
+        train_parser.add_argument('--learning_rate', type=float, default=3e-5,
+                                  help="Learning rate.")
+        train_parser.add_argument('--adam_epsilon', type=float, default=1e-08,
+                                  help="Epsilon for Adam optimizer.")
+        train_parser.set_defaults(func=train_command_factory)
+
+    def __init__(self, args: Namespace):
+        self.logger = getLogger('transformers-cli/training')
+
+        self.framework = 'tf' if is_tf_available() else 'torch'
+
+        os.makedirs(args.output, exist_ok=True)
+        assert os.path.isdir(args.output)
+        self.output = args.output
+
+        self.column_label = args.column_label
+        self.column_text = args.column_text
+        self.column_id = args.column_id
+
+        self.logger.info('Loading {} pipeline for {}'.format(args.task, args.model))
+        if args.task == 'text_classification':
+            self.pipeline = TextClassificationPipeline.from_pretrained(args.model)
+        elif args.task == 'token_classification':
+            raise NotImplementedError
+        elif args.task == 'question_answering':
+            raise NotImplementedError
+
+        self.logger.info('Loading dataset from {}'.format(args.train_data))
+        self.train_dataset = Processor.create_from_csv(args.train_data,
+                                                       column_label=args.column_label,
+                                                       column_text=args.column_text,
+                                                       column_id=args.column_id,
+                                                       skip_first_row=args.skip_first_row)
+        self.valid_dataset = None
+        if args.validation_data:
+            self.logger.info('Loading validation dataset from {}'.format(args.validation_data))
+            self.valid_dataset = Processor.create_from_csv(args.validation_data,
+                                                           column_label=args.column_label,
+                                                           column_text=args.column_text,
+                                                           column_id=args.column_id,
+                                                           skip_first_row=args.skip_first_row)
+
+        self.validation_split = args.validation_split
+        self.train_batch_size = args.train_batch_size
+        self.valid_batch_size = args.valid_batch_size
+        self.learning_rate = args.learning_rate
+        self.adam_epsilon = args.adam_epsilon
+
+    def run(self):
+        if self.framework == 'tf':
+            return self.run_tf()
+        return self.run_torch()
+
+    def run_torch(self):
+        raise NotImplementedError
+
+    def run_tf(self):
+        self.pipeline.fit(self.train_dataset,
+                          validation_data=self.valid_dataset,
+                          validation_split=self.validation_split,
+                          learning_rate=self.learning_rate,
+                          adam_epsilon=self.adam_epsilon,
+                          train_batch_size=self.train_batch_size,
+                          valid_batch_size=self.valid_batch_size)
+
+        # Save trained pipeline
+        self.pipeline.save_pretrained(self.output)
diff --git a/transformers/commands/user.py b/transformers/commands/user.py
index d79922ed8a..8e0e563422 100644
--- a/transformers/commands/user.py
+++ b/transformers/commands/user.py
@@ -19,8 +19,8 @@ class UserCommands(BaseTransformersCLICommand):
         list_parser.set_defaults(func=lambda args: ListObjsCommand(args))
         # upload
         upload_parser = parser.add_parser('upload')
-        upload_parser.add_argument('file', type=str, help='Local filepath of the file to upload.')
-        upload_parser.add_argument('--filename', type=str, default=None, help='Optional: override object filename on S3.')
+        upload_parser.add_argument('path', type=str, help='Local path of the folder or individual file to upload.')
+        upload_parser.add_argument('--filename', type=str, default=None, help='Optional: override individual object filename on S3.')
         upload_parser.set_defaults(func=lambda args: UploadCommand(args))
 
 
@@ -138,28 +138,57 @@ class ListObjsCommand(BaseUserCommand):
 
 
 class UploadCommand(BaseUserCommand):
+    def walk_dir(self, rel_path):
+        """
+        Recursively list all files in a folder.
+        """
+        entries: List[os.DirEntry] = list(os.scandir(rel_path))
+        files = [
+            (
+                os.path.join(os.getcwd(), f.path),  # filepath
+                f.path  # filename
+            )
+            for f in entries if f.is_file()
+        ]
+        for f in entries:
+            if f.is_dir():
+                files += self.walk_dir(f.path)
+        return files
+
     def run(self):
         token = HfFolder.get_token()
         if token is None:
             print("Not logged in")
             exit(1)
-        filepath = os.path.join(os.getcwd(), self.args.file)
-        filename = self.args.filename if self.args.filename is not None else os.path.basename(filepath)
-        print(
-            "About to upload file {} to S3 under filename {}".format(
-                ANSI.bold(filepath), ANSI.bold(filename)
+        local_path = os.path.abspath(self.args.path)
+        if os.path.isdir(local_path):
+            if self.args.filename is not None:
+                raise ValueError("Cannot specify a filename override when uploading a folder.")
+            rel_path = os.path.basename(local_path)
+            files = self.walk_dir(rel_path)
+        elif os.path.isfile(local_path):
+            filename = self.args.filename if self.args.filename is not None else os.path.basename(local_path)
+            files = [(local_path, filename)]
+        else:
+            raise ValueError("Not a valid file or directory: {}".format(local_path))
+
+        for filepath, filename in files:
+            print(
+                "About to upload file {} to S3 under filename {}".format(
+                    ANSI.bold(filepath), ANSI.bold(filename)
+                )
             )
-        )
 
         choice = input("Proceed? [Y/n] ").lower()
         if not(choice == "" or choice == "y" or choice == "yes"):
             print("Abort")
             exit()
         print(
-            ANSI.bold("Uploading... This might take a while if file is large")
+            ANSI.bold("Uploading... This might take a while if files are large")
         )
-        access_url = self._api.presign_and_upload(
-            token=token, filename=filename, filepath=filepath
-        )
-        print("Your file now lives at:")
-        print(access_url)
+        for filepath, filename in files:
+            access_url = self._api.presign_and_upload(
+                token=token, filename=filename, filepath=filepath
+            )
+            print("Your file now lives at:")
+            print(access_url)
diff --git a/transformers/configuration_albert.py b/transformers/configuration_albert.py
index de665c9b1c..6a1ef78dd5 100644
--- a/transformers/configuration_albert.py
+++ b/transformers/configuration_albert.py
@@ -37,7 +37,7 @@ class AlbertConfig(PretrainedConfig):
     pretrained_config_archive_map = ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
 
     def __init__(self,
-                 vocab_size_or_config_json_file=30000,
+                 vocab_size=30000,
                  embedding_size=128,
                  hidden_size=4096,
                  num_hidden_layers=12,
@@ -83,7 +83,7 @@ class AlbertConfig(PretrainedConfig):
         """
         super(AlbertConfig, self).__init__(**kwargs)
 
-        self.vocab_size = vocab_size_or_config_json_file
+        self.vocab_size = vocab_size
         self.embedding_size = embedding_size
         self.hidden_size = hidden_size
         self.num_hidden_layers = num_hidden_layers
@@ -97,4 +97,4 @@ class AlbertConfig(PretrainedConfig):
         self.max_position_embeddings = max_position_embeddings
         self.type_vocab_size = type_vocab_size
         self.initializer_range = initializer_range
-        self.layer_norm_eps = layer_norm_eps
\ No newline at end of file
+        self.layer_norm_eps = layer_norm_eps
diff --git a/transformers/configuration_auto.py b/transformers/configuration_auto.py
index 43f251bd0c..281256389e 100644
--- a/transformers/configuration_auto.py
+++ b/transformers/configuration_auto.py
@@ -18,21 +18,42 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 
 import logging
 
-from .configuration_bert import BertConfig
-from .configuration_openai import OpenAIGPTConfig
-from .configuration_gpt2 import GPT2Config
-from .configuration_transfo_xl import TransfoXLConfig
-from .configuration_xlnet import XLNetConfig
-from .configuration_xlm import XLMConfig
-from .configuration_roberta import RobertaConfig
-from .configuration_distilbert import DistilBertConfig
-from .configuration_ctrl import CTRLConfig
-from .configuration_camembert import CamembertConfig
-from .configuration_albert import AlbertConfig
+from .configuration_bert import BertConfig, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_openai import OpenAIGPTConfig, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_transfo_xl import TransfoXLConfig, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_gpt2 import GPT2Config, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_ctrl import CTRLConfig, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_xlnet import XLNetConfig, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_xlm import XLMConfig, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_roberta import RobertaConfig, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_albert import AlbertConfig, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_camembert import CamembertConfig, CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_t5 import T5Config, T5_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_xlm_roberta import XLMRobertaConfig, XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
 
 logger = logging.getLogger(__name__)
 
 
+ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict((key, value)
+    for pretrained_map in [
+        BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        XLM_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        T5_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        ]
+    for key, value, in pretrained_map.items())
+
+
 class AutoConfig(object):
     r""":class:`~transformers.AutoConfig` is a generic configuration class
         that will be instantiated as one of the configuration classes of the library
@@ -47,6 +68,7 @@ class AutoConfig(object):
             - contains `distilbert`: DistilBertConfig (DistilBERT model)
             - contains `albert`: AlbertConfig (ALBERT model)
             - contains `camembert`: CamembertConfig (CamemBERT model)
+            - contains `xlm-roberta`: XLMRobertaConfig (XLM-RoBERTa model)
             - contains `roberta`: RobertaConfig (RoBERTa model)
             - contains `bert`: BertConfig (Bert model)
             - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
@@ -61,6 +83,34 @@ class AutoConfig(object):
         raise EnvironmentError("AutoConfig is designed to be instantiated "
             "using the `AutoConfig.from_pretrained(pretrained_model_name_or_path)` method.")
 
+    @classmethod
+    def for_model(cls, model_type, *args, **kwargs):
+        if 'distilbert' in model_type:
+            return DistilBertConfig(*args, **kwargs)
+        elif 'roberta' in model_type:
+            return RobertaConfig(*args, **kwargs)
+        elif 'bert' in model_type:
+            return BertConfig(*args, **kwargs)
+        elif 'openai-gpt' in model_type:
+            return OpenAIGPTConfig(*args, **kwargs)
+        elif 'gpt2' in model_type:
+            return GPT2Config(*args, **kwargs)
+        elif 'transfo-xl' in model_type:
+            return TransfoXLConfig(*args, **kwargs)
+        elif 'xlnet' in model_type:
+            return XLNetConfig(*args, **kwargs)
+        elif 'xlm' in model_type:
+            return XLMConfig(*args, **kwargs)
+        elif 'ctrl' in model_type:
+            return CTRLConfig(*args, **kwargs)
+        elif 'albert' in model_type:
+            return AlbertConfig(*args, **kwargs)
+        elif 'camembert' in model_type:
+            return CamembertConfig(*args, **kwargs)
+        raise ValueError("Unrecognized model identifier in {}. Should contains one of "
+                         "'distilbert', 'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
+                         "'xlm', 'roberta', 'ctrl', 'camembert', 'albert'".format(model_type))
+
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
         r""" Instantiate a one of the configuration classes of the library
@@ -68,9 +118,11 @@ class AutoConfig(object):
 
         The configuration class to instantiate is selected as the first pattern matching
         in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `t5`: T5Config (T5 model)
             - contains `distilbert`: DistilBertConfig (DistilBERT model)
             - contains `albert`: AlbertConfig (ALBERT model)
             - contains `camembert`: CamembertConfig (CamemBERT model)
+            - contains `xlm-roberta`: XLMRobertaConfig (XLM-RoBERTa model)
             - contains `roberta`: RobertaConfig (RoBERTa model)
             - contains `bert`: BertConfig (Bert model)
             - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
@@ -83,6 +135,7 @@ class AutoConfig(object):
             pretrained_model_name_or_path: either:
 
                 - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a pre-trained model configuration that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                 - a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
                 - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
 
@@ -123,12 +176,16 @@ class AutoConfig(object):
             assert unused_kwargs == {'foo': False}
 
         """
-        if 'distilbert' in pretrained_model_name_or_path:
+        if 't5' in pretrained_model_name_or_path:
+            return T5Config.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        elif 'distilbert' in pretrained_model_name_or_path:
             return DistilBertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
         elif 'albert' in pretrained_model_name_or_path:
             return AlbertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
         elif 'camembert' in pretrained_model_name_or_path:
             return CamembertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        elif 'xlm-roberta' in pretrained_model_name_or_path:
+            return XLMRobertaConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
         elif 'roberta' in pretrained_model_name_or_path:
             return RobertaConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
         elif 'bert' in pretrained_model_name_or_path:
@@ -147,4 +204,4 @@ class AutoConfig(object):
             return CTRLConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
         raise ValueError("Unrecognized model identifier in {}. Should contains one of "
                          "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
-                         "'xlm', 'roberta', 'distilbert', 'camembert', 'ctrl', 'albert'".format(pretrained_model_name_or_path))
+                         "'xlm-roberta', 'xlm', 'roberta', 'distilbert', 'camembert', 'ctrl', 'albert'".format(pretrained_model_name_or_path))
diff --git a/transformers/configuration_bert.py b/transformers/configuration_bert.py
index d63be963eb..7b495013ff 100644
--- a/transformers/configuration_bert.py
+++ b/transformers/configuration_bert.py
@@ -42,6 +42,12 @@ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
     'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json",
     'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-config.json",
     'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-config.json",
+    'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-config.json",
+    'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-config.json",
+    'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-config.json",
+    'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-config.json",
+    'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/config.json",
+    'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/config.json",
 }
 
 
@@ -52,7 +58,7 @@ class BertConfig(PretrainedConfig):
 
 
         Arguments:
-            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`.
+            vocab_size: Vocabulary size of `inputs_ids` in `BertModel`.
             hidden_size: Size of the encoder layers and the pooler layer.
             num_hidden_layers: Number of hidden layers in the Transformer encoder.
             num_attention_heads: Number of attention heads for each attention layer in
@@ -77,7 +83,7 @@ class BertConfig(PretrainedConfig):
     pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
 
     def __init__(self,
-                 vocab_size_or_config_json_file=30522,
+                 vocab_size=30522,
                  hidden_size=768,
                  num_hidden_layers=12,
                  num_attention_heads=12,
@@ -91,25 +97,15 @@ class BertConfig(PretrainedConfig):
                  layer_norm_eps=1e-12,
                  **kwargs):
         super(BertConfig, self).__init__(**kwargs)
-        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
-                        and isinstance(vocab_size_or_config_json_file, unicode)):
-            with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
-                json_config = json.loads(reader.read())
-            for key, value in json_config.items():
-                self.__dict__[key] = value
-        elif isinstance(vocab_size_or_config_json_file, int):
-            self.vocab_size = vocab_size_or_config_json_file
-            self.hidden_size = hidden_size
-            self.num_hidden_layers = num_hidden_layers
-            self.num_attention_heads = num_attention_heads
-            self.hidden_act = hidden_act
-            self.intermediate_size = intermediate_size
-            self.hidden_dropout_prob = hidden_dropout_prob
-            self.attention_probs_dropout_prob = attention_probs_dropout_prob
-            self.max_position_embeddings = max_position_embeddings
-            self.type_vocab_size = type_vocab_size
-            self.initializer_range = initializer_range
-            self.layer_norm_eps = layer_norm_eps
-        else:
-            raise ValueError("First argument must be either a vocabulary size (int)"
-                             " or the path to a pretrained model config file (str)")
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.type_vocab_size = type_vocab_size
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
diff --git a/transformers/configuration_ctrl.py b/transformers/configuration_ctrl.py
index fcbd848dec..f9b9e409e1 100644
--- a/transformers/configuration_ctrl.py
+++ b/transformers/configuration_ctrl.py
@@ -31,7 +31,7 @@ class CTRLConfig(PretrainedConfig):
     """Configuration class to store the configuration of a `CTRLModel`.
 
     Args:
-        vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file.
+        vocab_size: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file.
         n_positions: Number of positional embeddings.
         n_ctx: Size of the causal mask (usually same as n_positions).
         dff: Size of the inner dimension of the FFN.
@@ -52,7 +52,7 @@ class CTRLConfig(PretrainedConfig):
 
     def __init__(
         self,
-        vocab_size_or_config_json_file=246534,
+        vocab_size=246534,
         n_positions=256,
         n_ctx=256,
         n_embd=1280,
@@ -64,8 +64,6 @@ class CTRLConfig(PretrainedConfig):
         attn_pdrop=0.1,
         layer_norm_epsilon=1e-6,
         initializer_range=0.02,
-
-        num_labels=1,
         summary_type='cls_index',
         summary_use_proj=True,
         summary_activation=None,
@@ -76,7 +74,7 @@ class CTRLConfig(PretrainedConfig):
         """Constructs CTRLConfig.
 
         Args:
-            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file.
+            vocab_size: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file.
             n_positions: Number of positional embeddings.
             n_ctx: Size of the causal mask (usually same as n_positions).
             dff: Size of the inner dimension of the FFN.
@@ -94,8 +92,7 @@ class CTRLConfig(PretrainedConfig):
                 initializing all weight matrices.
         """
         super(CTRLConfig, self).__init__(**kwargs)
-
-        self.vocab_size = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, int) else -1
+        self.vocab_size = vocab_size
         self.n_ctx = n_ctx
         self.n_positions = n_positions
         self.n_embd = n_embd
@@ -108,23 +105,11 @@ class CTRLConfig(PretrainedConfig):
         self.layer_norm_epsilon = layer_norm_epsilon
         self.initializer_range = initializer_range
 
-        self.num_labels = num_labels
         self.summary_type = summary_type
         self.summary_use_proj = summary_use_proj
         self.summary_activation = summary_activation
         self.summary_first_dropout = summary_first_dropout
         self.summary_proj_to_labels = summary_proj_to_labels
-        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
-                        and isinstance(vocab_size_or_config_json_file, unicode)):
-            with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
-                json_config = json.loads(reader.read())
-            for key, value in json_config.items():
-                self.__dict__[key] = value
-        elif not isinstance(vocab_size_or_config_json_file, int):
-            raise ValueError(
-                "First argument must be either a vocabulary size (int)"
-                "or the path to a pretrained model config file (str)"
-            )
 
     @property
     def max_position_embeddings(self):
diff --git a/transformers/configuration_distilbert.py b/transformers/configuration_distilbert.py
index d5d575be29..d9f7cc6348 100644
--- a/transformers/configuration_distilbert.py
+++ b/transformers/configuration_distilbert.py
@@ -37,7 +37,7 @@ class DistilBertConfig(PretrainedConfig):
     pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
 
     def __init__(self,
-                 vocab_size_or_config_json_file=30522,
+                 vocab_size=30522,
                  max_position_embeddings=512,
                  sinusoidal_pos_embds=False,
                  n_layers=6,
@@ -53,31 +53,21 @@ class DistilBertConfig(PretrainedConfig):
                  seq_classif_dropout=0.2,
                  **kwargs):
         super(DistilBertConfig, self).__init__(**kwargs)
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.sinusoidal_pos_embds = sinusoidal_pos_embds
+        self.n_layers = n_layers
+        self.n_heads = n_heads
+        self.dim = dim
+        self.hidden_dim = hidden_dim
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.activation = activation
+        self.initializer_range = initializer_range
+        self.tie_weights_ = tie_weights_
+        self.qa_dropout = qa_dropout
+        self.seq_classif_dropout = seq_classif_dropout
 
-        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
-                        and isinstance(vocab_size_or_config_json_file, unicode)):
-            with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
-                json_config = json.loads(reader.read())
-            for key, value in json_config.items():
-                self.__dict__[key] = value
-        elif isinstance(vocab_size_or_config_json_file, int):
-            self.vocab_size = vocab_size_or_config_json_file
-            self.max_position_embeddings = max_position_embeddings
-            self.sinusoidal_pos_embds = sinusoidal_pos_embds
-            self.n_layers = n_layers
-            self.n_heads = n_heads
-            self.dim = dim
-            self.hidden_dim = hidden_dim
-            self.dropout = dropout
-            self.attention_dropout = attention_dropout
-            self.activation = activation
-            self.initializer_range = initializer_range
-            self.tie_weights_ = tie_weights_
-            self.qa_dropout = qa_dropout
-            self.seq_classif_dropout = seq_classif_dropout
-        else:
-            raise ValueError("First argument must be either a vocabulary size (int)"
-                             " or the path to a pretrained model config file (str)")
     @property
     def hidden_size(self):
         return self.dim
diff --git a/transformers/configuration_gpt2.py b/transformers/configuration_gpt2.py
index c2fb4948d3..4c200c0760 100644
--- a/transformers/configuration_gpt2.py
+++ b/transformers/configuration_gpt2.py
@@ -36,7 +36,7 @@ class GPT2Config(PretrainedConfig):
     """Configuration class to store the configuration of a `GPT2Model`.
 
     Args:
-        vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
+        vocab_size: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
         n_positions: Number of positional embeddings.
         n_ctx: Size of the causal mask (usually same as n_positions).
         n_embd: Dimensionality of the embeddings and hidden states.
@@ -56,7 +56,7 @@ class GPT2Config(PretrainedConfig):
 
     def __init__(
         self,
-        vocab_size_or_config_json_file=50257,
+        vocab_size=50257,
         n_positions=1024,
         n_ctx=1024,
         n_embd=768,
@@ -67,8 +67,6 @@ class GPT2Config(PretrainedConfig):
         attn_pdrop=0.1,
         layer_norm_epsilon=1e-5,
         initializer_range=0.02,
-
-        num_labels=1,
         summary_type='cls_index',
         summary_use_proj=True,
         summary_activation=None,
@@ -79,7 +77,7 @@ class GPT2Config(PretrainedConfig):
         """Constructs GPT2Config.
 
         Args:
-            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
+            vocab_size: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
             n_positions: Number of positional embeddings.
             n_ctx: Size of the causal mask (usually same as n_positions).
             n_embd: Dimensionality of the embeddings and hidden states.
@@ -96,37 +94,22 @@ class GPT2Config(PretrainedConfig):
                 initializing all weight matrices.
         """
         super(GPT2Config, self).__init__(**kwargs)
-
-        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
-                        and isinstance(vocab_size_or_config_json_file, unicode)):
-            with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
-                json_config = json.loads(reader.read())
-            for key, value in json_config.items():
-                self.__dict__[key] = value
-        elif isinstance(vocab_size_or_config_json_file, int):
-            self.vocab_size = vocab_size_or_config_json_file
-            self.n_ctx = n_ctx
-            self.n_positions = n_positions
-            self.n_embd = n_embd
-            self.n_layer = n_layer
-            self.n_head = n_head
-            self.resid_pdrop = resid_pdrop
-            self.embd_pdrop = embd_pdrop
-            self.attn_pdrop = attn_pdrop
-            self.layer_norm_epsilon = layer_norm_epsilon
-            self.initializer_range = initializer_range
-
-            self.num_labels = num_labels
-            self.summary_type = summary_type
-            self.summary_use_proj = summary_use_proj
-            self.summary_activation = summary_activation
-            self.summary_first_dropout = summary_first_dropout
-            self.summary_proj_to_labels = summary_proj_to_labels
-        else:
-            raise ValueError(
-                "First argument must be either a vocabulary size (int)"
-                "or the path to a pretrained model config file (str)"
-            )
+        self.vocab_size = vocab_size
+        self.n_ctx = n_ctx
+        self.n_positions = n_positions
+        self.n_embd = n_embd
+        self.n_layer = n_layer
+        self.n_head = n_head
+        self.resid_pdrop = resid_pdrop
+        self.embd_pdrop = embd_pdrop
+        self.attn_pdrop = attn_pdrop
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_range = initializer_range
+        self.summary_type = summary_type
+        self.summary_use_proj = summary_use_proj
+        self.summary_activation = summary_activation
+        self.summary_first_dropout = summary_first_dropout
+        self.summary_proj_to_labels = summary_proj_to_labels
 
     @property
     def max_position_embeddings(self):
diff --git a/transformers/configuration_openai.py b/transformers/configuration_openai.py
index 886b7f5bc5..7776a0bb9f 100644
--- a/transformers/configuration_openai.py
+++ b/transformers/configuration_openai.py
@@ -35,7 +35,7 @@ class OpenAIGPTConfig(PretrainedConfig):
     Configuration class to store the configuration of a `OpenAIGPTModel`.
 
     Args:
-        vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file.
+        vocab_size: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file.
         n_positions: Number of positional embeddings.
         n_ctx: Size of the causal mask (usually same as n_positions).
         n_embd: Dimensionality of the embeddings and hidden states.
@@ -58,7 +58,7 @@ class OpenAIGPTConfig(PretrainedConfig):
 
     def __init__(
         self,
-        vocab_size_or_config_json_file=40478,
+        vocab_size=40478,
         n_positions=512,
         n_ctx=512,
         n_embd=768,
@@ -71,8 +71,6 @@ class OpenAIGPTConfig(PretrainedConfig):
         layer_norm_epsilon=1e-5,
         initializer_range=0.02,
         predict_special_tokens=True,
-
-        num_labels=1,
         summary_type='cls_index',
         summary_use_proj=True,
         summary_activation=None,
@@ -83,39 +81,24 @@ class OpenAIGPTConfig(PretrainedConfig):
         """Constructs OpenAIGPTConfig.
         """
         super(OpenAIGPTConfig, self).__init__(**kwargs)
-
-        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
-                        and isinstance(vocab_size_or_config_json_file, unicode)):
-            with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
-                json_config = json.loads(reader.read())
-            for key, value in json_config.items():
-                self.__dict__[key] = value
-        elif isinstance(vocab_size_or_config_json_file, int):
-            self.vocab_size = vocab_size_or_config_json_file
-            self.n_ctx = n_ctx
-            self.n_positions = n_positions
-            self.n_embd = n_embd
-            self.n_layer = n_layer
-            self.n_head = n_head
-            self.afn = afn
-            self.resid_pdrop = resid_pdrop
-            self.embd_pdrop = embd_pdrop
-            self.attn_pdrop = attn_pdrop
-            self.layer_norm_epsilon = layer_norm_epsilon
-            self.initializer_range = initializer_range
-            self.predict_special_tokens = predict_special_tokens
-
-            self.num_labels = num_labels
-            self.summary_type = summary_type
-            self.summary_use_proj = summary_use_proj
-            self.summary_activation = summary_activation
-            self.summary_first_dropout = summary_first_dropout
-            self.summary_proj_to_labels = summary_proj_to_labels
-        else:
-            raise ValueError(
-                "First argument must be either a vocabulary size (int)"
-                "or the path to a pretrained model config file (str)"
-            )
+        self.vocab_size = vocab_size
+        self.n_ctx = n_ctx
+        self.n_positions = n_positions
+        self.n_embd = n_embd
+        self.n_layer = n_layer
+        self.n_head = n_head
+        self.afn = afn
+        self.resid_pdrop = resid_pdrop
+        self.embd_pdrop = embd_pdrop
+        self.attn_pdrop = attn_pdrop
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_range = initializer_range
+        self.predict_special_tokens = predict_special_tokens
+        self.summary_type = summary_type
+        self.summary_use_proj = summary_use_proj
+        self.summary_activation = summary_activation
+        self.summary_first_dropout = summary_first_dropout
+        self.summary_proj_to_labels = summary_proj_to_labels
 
     @property
     def max_position_embeddings(self):
diff --git a/transformers/configuration_t5.py b/transformers/configuration_t5.py
new file mode 100644
index 0000000000..377a0919d9
--- /dev/null
+++ b/transformers/configuration_t5.py
@@ -0,0 +1,108 @@
+# coding=utf-8
+# Copyright 2010, The T5 Authors and HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" T5 model configuration """
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import json
+import logging
+import sys
+import six
+from io import open
+
+from .configuration_utils import PretrainedConfig
+
+logger = logging.getLogger(__name__)
+
+T5_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-config.json",
+    't5-base': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-config.json",
+    't5-large': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-config.json",
+    't5-3b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-config.json",
+    't5-11b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-config.json",
+}
+
+
+class T5Config(PretrainedConfig):
+    r"""
+        :class:`~transformers.T5Config` is the configuration class to store the configuration of a
+        `T5Model`.
+
+
+        Arguments:
+            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `T5Model`.
+            hidden_size: Size of the encoder layers and the pooler layer.
+            num_hidden_layers: Number of hidden layers in the Transformer encoder.
+            num_attention_heads: Number of attention heads for each attention layer in
+                the Transformer encoder.
+            intermediate_size: The size of the "intermediate" (i.e., feed-forward)
+                layer in the Transformer encoder.
+            hidden_act: The non-linear activation function (function or string) in the
+                encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported.
+            hidden_dropout_prob: The dropout probabilitiy for all fully connected
+                layers in the embeddings, encoder, and pooler.
+            attention_probs_dropout_prob: The dropout ratio for the attention
+                probabilities.
+            max_position_embeddings: The maximum sequence length that this model might
+                ever be used with. Typically set this to something large just in case
+                (e.g., 512 or 1024 or 2048).
+            type_vocab_size: The vocabulary size of the `token_type_ids` passed into
+                `T5Model`.
+            initializer_factor: A factor for initializing all weight matrices (should be kept to 1.0, used for initialization testing).
+            layer_norm_eps: The epsilon used by LayerNorm.
+    """
+    pretrained_config_archive_map = T5_PRETRAINED_CONFIG_ARCHIVE_MAP
+
+    def __init__(self,
+                 vocab_size=32128,
+                 n_positions=512,
+                 d_model=512,
+                 d_kv=64,
+                 d_ff=2048,
+                 num_layers=6,
+                 num_heads=8,
+                 relative_attention_num_buckets=32,
+                 dropout_rate=0.1,
+                 layer_norm_epsilon=1e-6,
+                 initializer_factor=1.0,
+                 **kwargs):
+        super(T5Config, self).__init__(**kwargs)
+        self.vocab_size = vocab_size
+        self.n_positions = n_positions
+        self.d_model = d_model
+        self.d_kv = d_kv
+        self.d_ff = d_ff
+        self.num_layers = num_layers
+        self.num_heads = num_heads
+        self.relative_attention_num_buckets = relative_attention_num_buckets
+        self.dropout_rate = dropout_rate
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_factor = initializer_factor
+
+    @property
+    def max_position_embeddings(self):
+        return self.n_positions
+
+    @property
+    def hidden_size(self):
+        return self.d_model
+
+    @property
+    def num_attention_heads(self):
+        return self.num_heads
+
+    @property
+    def num_hidden_layers(self):
+        return self.num_layers
diff --git a/transformers/configuration_transfo_xl.py b/transformers/configuration_transfo_xl.py
index d55a6adbe6..52f0f45a50 100644
--- a/transformers/configuration_transfo_xl.py
+++ b/transformers/configuration_transfo_xl.py
@@ -34,7 +34,7 @@ class TransfoXLConfig(PretrainedConfig):
     """Configuration class to store the configuration of a `TransfoXLModel`.
 
         Args:
-            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `TransfoXLModel` or a configuration json file.
+            vocab_size: Vocabulary size of `inputs_ids` in `TransfoXLModel` or a configuration json file.
             cutoffs: cutoffs for the adaptive softmax
             d_model: Dimensionality of the model's hidden states.
             d_embed: Dimensionality of the embeddings
@@ -68,7 +68,7 @@ class TransfoXLConfig(PretrainedConfig):
     pretrained_config_archive_map = TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
 
     def __init__(self,
-                 vocab_size_or_config_json_file=267735,
+                 vocab_size=267735,
                  cutoffs=[20000, 40000, 200000],
                  d_model=1024,
                  d_embed=1024,
@@ -100,7 +100,7 @@ class TransfoXLConfig(PretrainedConfig):
         """Constructs TransfoXLConfig.
         """
         super(TransfoXLConfig, self).__init__(**kwargs)
-        self.n_token = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, int) else -1
+        self.vocab_size = vocab_size
         self.cutoffs = []
         self.cutoffs.extend(cutoffs)
         self.tie_weight = tie_weight
@@ -133,27 +133,17 @@ class TransfoXLConfig(PretrainedConfig):
         self.init_std = init_std
         self.layer_norm_epsilon = layer_norm_epsilon
 
-        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
-                        and isinstance(vocab_size_or_config_json_file, unicode)):
-            with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
-                json_config = json.loads(reader.read())
-            for key, value in json_config.items():
-                self.__dict__[key] = value
-        elif not isinstance(vocab_size_or_config_json_file, int):
-            raise ValueError("First argument must be either a vocabulary size (int)"
-                             " or the path to a pretrained model config file (str)")
-
     @property
     def max_position_embeddings(self):
         return self.tgt_len + self.ext_len + self.mem_len
 
     @property
-    def vocab_size(self):
-        return self.n_token
+    def n_token(self):  # Backward compatibility
+        return self.vocab_size
 
-    @vocab_size.setter
-    def vocab_size(self, value):
-        self.n_token = value
+    @n_token.setter
+    def n_token(self, value):  # Backward compatibility
+        self.vocab_size = value
 
     @property
     def hidden_size(self):
diff --git a/transformers/configuration_utils.py b/transformers/configuration_utils.py
index ceb032a57c..d2d6ee5d80 100644
--- a/transformers/configuration_utils.py
+++ b/transformers/configuration_utils.py
@@ -24,7 +24,7 @@ import logging
 import os
 from io import open
 
-from .file_utils import cached_path, CONFIG_NAME
+from .file_utils import CONFIG_NAME, cached_path, is_remote_url, hf_bucket_url
 
 logger = logging.getLogger(__name__)
 
@@ -49,8 +49,7 @@ class PretrainedConfig(object):
     pretrained_config_archive_map = {}
 
     def __init__(self, **kwargs):
-        self.finetuning_task = kwargs.pop('finetuning_task', None)
-        self.num_labels = kwargs.pop('num_labels', 2)
+        # Attributes with defaults
         self.output_attentions = kwargs.pop('output_attentions', False)
         self.output_hidden_states = kwargs.pop('output_hidden_states', False)
         self.output_past = kwargs.pop('output_past', True)  # Not used by all models
@@ -75,6 +74,22 @@ class PretrainedConfig(object):
         self.length_penalty = kwargs.pop('length_penalty', 1.)
         self.num_return_sequences = kwargs.pop('num_return_sequences', 1)
 
+        # Fine-tuning task arguments
+        self.finetuning_task = kwargs.pop('finetuning_task', None)
+        self.num_labels = kwargs.pop('num_labels', 2)
+        self.id2label = kwargs.pop('id2label', {i: 'LABEL_{}'.format(i) for i in range(self.num_labels)})
+        self.id2label = dict((int(key), value) for key, value in self.id2label.items())
+        self.label2id = kwargs.pop('label2id', dict(zip(self.id2label.values(), self.id2label.keys())))
+        self.label2id = dict((key, int(value)) for key, value in self.label2id.items())
+
+        # Additional attributes without default values
+        for key, value in kwargs.items():
+            try:
+                setattr(self, key, value)
+            except AttributeError as err:
+                logger.error("Can't set {} with value {} for {}".format(key, value, self))
+                raise err
+
     def save_pretrained(self, save_directory):
         """ Save a configuration object to the directory `save_directory`, so that it
             can be re-loaded using the :func:`~transformers.PretrainedConfig.from_pretrained` class method.
@@ -95,6 +110,7 @@ class PretrainedConfig(object):
             pretrained_model_name_or_path: either:
 
                 - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a pre-trained model configuration that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                 - a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
                 - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
 
@@ -147,12 +163,18 @@ class PretrainedConfig(object):
             config_file = cls.pretrained_config_archive_map[pretrained_model_name_or_path]
         elif os.path.isdir(pretrained_model_name_or_path):
             config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
-        else:
+        elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
             config_file = pretrained_model_name_or_path
-        # redirect to the cache, if necessary
+        else:
+            config_file = hf_bucket_url(pretrained_model_name_or_path, postfix=CONFIG_NAME)
+
         try:
+            # Load from URL or cache if already cached
             resolved_config_file = cached_path(config_file, cache_dir=cache_dir, force_download=force_download,
                                                proxies=proxies, resume_download=resume_download)
+            # Load config
+            config = cls.from_json_file(resolved_config_file)
+
         except EnvironmentError:
             if pretrained_model_name_or_path in cls.pretrained_config_archive_map:
                 msg = "Couldn't reach server at '{}' to download pretrained model configuration file.".format(
@@ -166,15 +188,18 @@ class PretrainedConfig(object):
                         config_file, CONFIG_NAME)
             raise EnvironmentError(msg)
 
+        except json.JSONDecodeError:
+            msg = "Couldn't reach server at '{}' to download configuration file or " \
+                  "configuration file is not a valid JSON file. " \
+                  "Please check network or file content here: {}.".format(config_file, resolved_config_file)
+            raise EnvironmentError(msg)
+
         if resolved_config_file == config_file:
             logger.info("loading configuration file {}".format(config_file))
         else:
             logger.info("loading configuration file {} from cache at {}".format(
                 config_file, resolved_config_file))
 
-        # Load config
-        config = cls.from_json_file(resolved_config_file)
-
         if hasattr(config, 'pruned_heads'):
             config.pruned_heads = dict((int(key), value) for key, value in config.pruned_heads.items())
 
@@ -196,17 +221,15 @@ class PretrainedConfig(object):
     @classmethod
     def from_dict(cls, json_object):
         """Constructs a `Config` from a Python dictionary of parameters."""
-        config = cls(vocab_size_or_config_json_file=-1)
-        for key, value in json_object.items():
-            setattr(config, key, value)
-        return config
+        return cls(**json_object)
 
     @classmethod
     def from_json_file(cls, json_file):
-        """Constructs a `BertConfig` from a json file of parameters."""
+        """Constructs a `Config` from a json file of parameters."""
         with open(json_file, "r", encoding='utf-8') as reader:
             text = reader.read()
-        return cls.from_dict(json.loads(text))
+        dict_obj = json.loads(text)
+        return cls(**dict_obj)
 
     def __eq__(self, other):
         return self.__dict__ == other.__dict__
diff --git a/transformers/configuration_xlm.py b/transformers/configuration_xlm.py
index 1134c7ab61..727f319778 100644
--- a/transformers/configuration_xlm.py
+++ b/transformers/configuration_xlm.py
@@ -42,7 +42,7 @@ class XLMConfig(PretrainedConfig):
     """Configuration class to store the configuration of a `XLMModel`.
 
     Args:
-        vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `XLMModel`.
+        vocab_size: Vocabulary size of `inputs_ids` in `XLMModel`.
         d_model: Size of the encoder layers and the pooler layer.
         n_layer: Number of hidden layers in the Transformer encoder.
         n_head: Number of attention heads for each attention layer in
@@ -81,7 +81,7 @@ class XLMConfig(PretrainedConfig):
     pretrained_config_archive_map = XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
 
     def __init__(self,
-                 vocab_size_or_config_json_file=30145,
+                 vocab_size=30145,
                  emb_dim=2048,
                  n_layers=12,
                  n_heads=16,
@@ -103,9 +103,6 @@ class XLMConfig(PretrainedConfig):
                  unk_index=3,
                  mask_index=5,
                  is_encoder=True,
-
-                 finetuning_task=None,
-                 num_labels=2,
                  summary_type='first',
                  summary_use_proj=True,
                  summary_activation=None,
@@ -119,58 +116,48 @@ class XLMConfig(PretrainedConfig):
         """Constructs XLMConfig.
         """
         super(XLMConfig, self).__init__(**kwargs)
+        self.vocab_size = vocab_size
+        self.emb_dim = emb_dim
+        self.n_layers = n_layers
+        self.n_heads = n_heads
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.gelu_activation = gelu_activation
+        self.sinusoidal_embeddings = sinusoidal_embeddings
+        self.causal = causal
+        self.asm = asm
+        self.n_langs = n_langs
+        self.use_lang_emb = use_lang_emb
+        self.layer_norm_eps = layer_norm_eps
+        self.bos_index = bos_index
+        self.eos_index = eos_index
+        self.pad_index = pad_index
+        self.unk_index = unk_index
+        self.mask_index = mask_index
+        self.is_encoder = is_encoder
+        self.max_position_embeddings = max_position_embeddings
+        self.embed_init_std = embed_init_std
+        self.init_std = init_std
+        self.summary_type = summary_type
+        self.summary_use_proj = summary_use_proj
+        self.summary_activation = summary_activation
+        self.summary_proj_to_labels = summary_proj_to_labels
+        self.summary_first_dropout = summary_first_dropout
+        self.start_n_top = start_n_top
+        self.end_n_top = end_n_top
+        self.mask_token_id = mask_token_id
+        self.lang_id = lang_id
 
-        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
-                        and isinstance(vocab_size_or_config_json_file, unicode)):
-            with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
-                json_config = json.loads(reader.read())
-            for key, value in json_config.items():
-                self.__dict__[key] = value
-        elif isinstance(vocab_size_or_config_json_file, int):
-            self.n_words = vocab_size_or_config_json_file
-            self.emb_dim = emb_dim
-            self.n_layers = n_layers
-            self.n_heads = n_heads
-            self.dropout = dropout
-            self.attention_dropout = attention_dropout
-            self.gelu_activation = gelu_activation
-            self.sinusoidal_embeddings = sinusoidal_embeddings
-            self.causal = causal
-            self.asm = asm
-            self.n_langs = n_langs
-            self.use_lang_emb = use_lang_emb
-            self.layer_norm_eps = layer_norm_eps
-            self.bos_index = bos_index
-            self.eos_index = eos_index
-            self.pad_index = pad_index
-            self.unk_index = unk_index
-            self.mask_index = mask_index
-            self.is_encoder = is_encoder
-            self.max_position_embeddings = max_position_embeddings
-            self.embed_init_std = embed_init_std
-            self.init_std = init_std
-            self.finetuning_task = finetuning_task
-            self.num_labels = num_labels
-            self.summary_type = summary_type
-            self.summary_use_proj = summary_use_proj
-            self.summary_activation = summary_activation
-            self.summary_proj_to_labels = summary_proj_to_labels
-            self.summary_first_dropout = summary_first_dropout
-            self.start_n_top = start_n_top
-            self.end_n_top = end_n_top
-            self.mask_token_id = mask_token_id
-            self.lang_id = lang_id
-        else:
-            raise ValueError("First argument must be either a vocabulary size (int)"
-                             " or the path to a pretrained model config file (str)")
+        if "n_words" in kwargs:
+            self.n_words = kwargs["n_words"]
 
     @property
-    def vocab_size(self):
-        return self.n_words
+    def n_words(self):  # For backward compatibility
+        return self.vocab_size
 
-    @vocab_size.setter
-    def vocab_size(self, value):
-        self.n_words = value
+    @n_words.setter
+    def n_words(self, value):  # For backward compatibility
+        self.vocab_size = value
 
     @property
     def hidden_size(self):
diff --git a/transformers/configuration_xlm_roberta.py b/transformers/configuration_xlm_roberta.py
new file mode 100644
index 0000000000..5b6955f4f8
--- /dev/null
+++ b/transformers/configuration_xlm_roberta.py
@@ -0,0 +1,38 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" XLM-RoBERTa configuration """
+
+from __future__ import (absolute_import, division, print_function,
+                        unicode_literals)
+
+import logging
+
+from .configuration_roberta import RobertaConfig
+
+logger = logging.getLogger(__name__)
+
+XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    'xlm-roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-config.json",
+    'xlm-roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-config.json",
+    'xlm-roberta-large-finetuned-conll02-dutch': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-dutch-config.json",
+    'xlm-roberta-large-finetuned-conll02-spanish': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-spanish-config.json",
+    'xlm-roberta-large-finetuned-conll03-english': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-english-config.json",
+    'xlm-roberta-large-finetuned-conll03-german': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-german-config.json",
+}
+
+
+class XLMRobertaConfig(RobertaConfig):
+    pretrained_config_archive_map = XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
diff --git a/transformers/configuration_xlnet.py b/transformers/configuration_xlnet.py
index 0dbf518849..017c57cfd5 100644
--- a/transformers/configuration_xlnet.py
+++ b/transformers/configuration_xlnet.py
@@ -35,7 +35,7 @@ class XLNetConfig(PretrainedConfig):
     """Configuration class to store the configuration of a ``XLNetModel``.
 
     Args:
-        vocab_size_or_config_json_file: Vocabulary size of ``inputs_ids`` in ``XLNetModel``.
+        vocab_size: Vocabulary size of ``inputs_ids`` in ``XLNetModel``.
         d_model: Size of the encoder layers and the pooler layer.
         n_layer: Number of hidden layers in the Transformer encoder.
         n_head: Number of attention heads for each attention layer in
@@ -72,28 +72,22 @@ class XLNetConfig(PretrainedConfig):
     pretrained_config_archive_map = XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP
 
     def __init__(self,
-                 vocab_size_or_config_json_file=32000,
+                 vocab_size=32000,
                  d_model=1024,
                  n_layer=24,
                  n_head=16,
                  d_inner=4096,
-                 max_position_embeddings=512,
                  ff_activation="gelu",
                  untie_r=True,
                  attn_type="bi",
-
                  initializer_range=0.02,
                  layer_norm_eps=1e-12,
-
                  dropout=0.1,
                  mem_len=None,
                  reuse_len=None,
                  bi_data=False,
                  clamp_len=-1,
                  same_length=False,
-
-                 finetuning_task=None,
-                 num_labels=2,
                  summary_type='last',
                  summary_use_proj=True,
                  summary_activation='tanh',
@@ -104,58 +98,45 @@ class XLNetConfig(PretrainedConfig):
         """Constructs XLNetConfig.
         """
         super(XLNetConfig, self).__init__(**kwargs)
+        self.vocab_size = vocab_size
+        self.d_model = d_model
+        self.n_layer = n_layer
+        self.n_head = n_head
+        assert d_model % n_head == 0
+        self.d_head = d_model // n_head
+        self.ff_activation = ff_activation
+        self.d_inner = d_inner
+        self.untie_r = untie_r
+        self.attn_type = attn_type
 
-        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
-                        and isinstance(vocab_size_or_config_json_file, unicode)):
-            with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
-                json_config = json.loads(reader.read())
-            for key, value in json_config.items():
-                setattr(config, key, value)
-        elif isinstance(vocab_size_or_config_json_file, int):
-            self.n_token = vocab_size_or_config_json_file
-            self.d_model = d_model
-            self.n_layer = n_layer
-            self.n_head = n_head
-            assert d_model % n_head == 0
-            self.d_head = d_model // n_head
-            self.ff_activation = ff_activation
-            self.d_inner = d_inner
-            self.untie_r = untie_r
-            self.attn_type = attn_type
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
 
-            self.initializer_range = initializer_range
-            self.layer_norm_eps = layer_norm_eps
+        self.dropout = dropout
+        self.mem_len = mem_len
+        self.reuse_len = reuse_len
+        self.bi_data = bi_data
+        self.clamp_len = clamp_len
+        self.same_length = same_length
 
-            self.dropout = dropout
-            self.mem_len = mem_len
-            self.reuse_len = reuse_len
-            self.bi_data = bi_data
-            self.clamp_len = clamp_len
-            self.same_length = same_length
-
-            self.finetuning_task = finetuning_task
-            self.num_labels = num_labels
-            self.summary_type = summary_type
-            self.summary_use_proj = summary_use_proj
-            self.summary_activation = summary_activation
-            self.summary_last_dropout = summary_last_dropout
-            self.start_n_top = start_n_top
-            self.end_n_top = end_n_top
-        else:
-            raise ValueError("First argument must be either a vocabulary size (int)"
-                             " or the path to a pretrained model config file (str)")
+        self.summary_type = summary_type
+        self.summary_use_proj = summary_use_proj
+        self.summary_activation = summary_activation
+        self.summary_last_dropout = summary_last_dropout
+        self.start_n_top = start_n_top
+        self.end_n_top = end_n_top
 
     @property
     def max_position_embeddings(self):
         return -1
 
     @property
-    def vocab_size(self):
-        return self.n_token
+    def n_token(self):  # Backward compatibility
+        return self.vocab_size
 
-    @vocab_size.setter
-    def vocab_size(self, value):
-        self.n_token = value
+    @n_token.setter
+    def n_token(self, value):  # Backward compatibility
+        self.vocab_size = value
 
     @property
     def hidden_size(self):
diff --git a/transformers/convert_pytorch_checkpoint_to_tf2.py b/transformers/convert_pytorch_checkpoint_to_tf2.py
index d1776e9c14..0edac6fb7d 100644
--- a/transformers/convert_pytorch_checkpoint_to_tf2.py
+++ b/transformers/convert_pytorch_checkpoint_to_tf2.py
@@ -32,9 +32,10 @@ from transformers import (load_pytorch_checkpoint_in_tf2_model,
                                   TransfoXLConfig, TFTransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP,
                                   OpenAIGPTConfig, TFOpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP,
                                   RobertaConfig, TFRobertaForMaskedLM, TFRobertaForSequenceClassification, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
-                                  DistilBertConfig, TFDistilBertForMaskedLM, TFDistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+                                  DistilBertConfig, TFDistilBertForMaskedLM, TFDistilBertForQuestionAnswering, TFDistilBertForSequenceClassification, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
                                   CTRLConfig, TFCTRLLMHeadModel, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP,
-                                  AlbertConfig, TFAlbertForMaskedLM, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP)
+                                  AlbertConfig, TFAlbertForMaskedLM, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+                                  T5Config, TFT5WithLMHeadModel, T5_PRETRAINED_CONFIG_ARCHIVE_MAP)
 
 if is_torch_available():
     import torch
@@ -46,9 +47,10 @@ if is_torch_available():
                                       TransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP,
                                       OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
                                       RobertaForMaskedLM, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
-                                      DistilBertForMaskedLM, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+                                      DistilBertForMaskedLM, DistilBertForQuestionAnswering, DistilBertForSequenceClassification, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
                                       CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
-                                      AlbertForMaskedLM, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
+                                      AlbertForMaskedLM, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+                                      T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP)
 else:
     (BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
     GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
@@ -57,9 +59,10 @@ else:
     TransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP,
     OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
     RobertaForMaskedLM, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
-    DistilBertForMaskedLM, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+    DistilBertForMaskedLM, DistilBertForSequenceClassification, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
     CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
-    AlbertForMaskedLM, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP) = (
+    AlbertForMaskedLM, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+    T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP) = (
         None, None, None, None,
         None, None,
         None, None,
@@ -67,7 +70,8 @@ else:
         None, None,
         None, None,
         None, None, None,
-        None, None, None,
+        None, None, None, None,
+        None, None,
         None, None,
         None, None)
 
@@ -89,8 +93,10 @@ MODEL_CLASSES = {
     'roberta-large-mnli': (RobertaConfig, TFRobertaForSequenceClassification, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP),
     'distilbert': (DistilBertConfig, TFDistilBertForMaskedLM, DistilBertForMaskedLM, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
     'distilbert-base-uncased-distilled-squad': (DistilBertConfig, TFDistilBertForQuestionAnswering, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
+    'distilbert-base-uncased-distilled-squad': (DistilBertConfig, TFDistilBertForQuestionAnswering, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
     'ctrl': (CTRLConfig, TFCTRLLMHeadModel, CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP),
-    'albert': (AlbertConfig, TFAlbertForMaskedLM, AlbertForMaskedLM, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP)
+    'albert': (AlbertConfig, TFAlbertForMaskedLM, AlbertForMaskedLM, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
+    't5': (T5Config, TFT5WithLMHeadModel, T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP, T5_PRETRAINED_CONFIG_ARCHIVE_MAP),
 }
 
 def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file, tf_dump_path, compare_with_pt_model=False, use_cached_models=True):
@@ -115,23 +121,21 @@ def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file
     tf_model = load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path)
 
     if compare_with_pt_model:
-        inputs_list = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
-        tf_inputs = tf.constant(inputs_list)
-        tfo = tf_model(tf_inputs, training=False)  # build the network
+        tfo = tf_model(tf_model.dummy_inputs, training=False)  # build the network
 
-        pt_model = pt_model_class.from_pretrained(None,
+        state_dict = torch.load(pytorch_checkpoint_path, map_location='cpu')
+        pt_model = pt_model_class.from_pretrained(pretrained_model_name_or_path=None,
                                                   config=config,
-                                                  state_dict=torch.load(pytorch_checkpoint_path,
-                                                                        map_location='cpu'))
-        pt_inputs = torch.tensor(inputs_list)
-        with torch.no_grad():
-            pto = pt_model(pt_inputs)
+                                                  state_dict=state_dict)
 
-        np_pt = pto[0].detach().numpy()
+        with torch.no_grad():
+            pto = pt_model(**pt_model.dummy_inputs)
+
+        np_pt = pto[0].numpy()
         np_tf = tfo[0].numpy()
         diff = np.amax(np.abs(np_pt - np_tf))
         print("Max absolute difference between models outputs {}".format(diff))
-        assert diff <= 2e-2, "Error, model absolute difference is >2e-2"
+        assert diff <= 2e-2, "Error, model absolute difference is >2e-2: {}".format(diff)
 
     # Save pytorch-model
     print("Save TensorFlow model to {}".format(tf_dump_path))
@@ -139,7 +143,7 @@ def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file
 
 
 def convert_all_pt_checkpoints_to_tf(args_model_type, tf_dump_path, model_shortcut_names_or_path=None, config_shortcut_names_or_path=None,
-                                     compare_with_pt_model=False, use_cached_models=False, only_convert_finetuned_models=False):
+                                     compare_with_pt_model=False, use_cached_models=False, remove_cached_files=False, only_convert_finetuned_models=False):
     assert os.path.isdir(args.tf_dump_path), "--tf_dump_path should be a directory"
 
     if args_model_type is None:
@@ -187,13 +191,15 @@ def convert_all_pt_checkpoints_to_tf(args_model_type, tf_dump_path, model_shortc
 
             if os.path.isfile(model_shortcut_name):
                 model_shortcut_name = 'converted_model'
+
             convert_pt_checkpoint_to_tf(model_type=model_type,
                                         pytorch_checkpoint_path=model_file,
                                         config_file=config_file,
                                         tf_dump_path=os.path.join(tf_dump_path, model_shortcut_name + '-tf_model.h5'),
                                         compare_with_pt_model=compare_with_pt_model)
-            os.remove(config_file)
-            os.remove(model_file)
+            if remove_cached_files:
+                os.remove(config_file)
+                os.remove(model_file)
 
 
 if __name__ == "__main__":
@@ -226,6 +232,9 @@ if __name__ == "__main__":
     parser.add_argument("--use_cached_models",
                         action='store_true',
                         help = "Use cached models if possible instead of updating to latest checkpoint versions.")
+    parser.add_argument("--remove_cached_files",
+                        action='store_true',
+                        help = "Remove pytorch models after conversion (save memory when converting in batches).")
     parser.add_argument("--only_convert_finetuned_models",
                         action='store_true',
                         help = "Only convert finetuned models.")
@@ -245,4 +254,5 @@ if __name__ == "__main__":
                                         config_shortcut_names_or_path=[args.config_file] if args.config_file is not None else None,
                                         compare_with_pt_model=args.compare_with_pt_model,
                                         use_cached_models=args.use_cached_models,
+                                        remove_cached_files=args.remove_cached_files,
                                         only_convert_finetuned_models=args.only_convert_finetuned_models)
diff --git a/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py b/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py
index 60935add60..fedfc1ecb8 100644
--- a/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py
+++ b/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py
@@ -20,6 +20,13 @@ import argparse
 import logging
 import numpy as np
 import torch
+import pathlib
+
+import fairseq
+from packaging import version
+
+if version.parse(fairseq.__version__) < version.parse("0.9.0"):
+    raise Exception("requires fairseq >= 0.9.0")
 
 from fairseq.models.roberta import RobertaModel as FairseqRobertaModel
 from fairseq.modules import TransformerSentenceEncoderLayer
@@ -45,8 +52,9 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
     """
     roberta = FairseqRobertaModel.from_pretrained(roberta_checkpoint_path)
     roberta.eval()  # disable dropout
+    roberta_sent_encoder = roberta.model.decoder.sentence_encoder
     config = BertConfig(
-        vocab_size_or_config_json_file=50265,
+        vocab_size=roberta_sent_encoder.embed_tokens.num_embeddings,
         hidden_size=roberta.args.encoder_embed_dim,
         num_hidden_layers=roberta.args.encoder_layers,
         num_attention_heads=roberta.args.encoder_attention_heads,
@@ -64,7 +72,6 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
 
     # Now let's copy all the weights.
     # Embeddings
-    roberta_sent_encoder = roberta.model.decoder.sentence_encoder
     model.roberta.embeddings.word_embeddings.weight = roberta_sent_encoder.embed_tokens.weight
     model.roberta.embeddings.position_embeddings.weight = roberta_sent_encoder.embed_positions.weight
     model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like(model.roberta.embeddings.token_type_embeddings.weight)  # just zero them out b/c RoBERTa doesn't use them.
@@ -79,15 +86,18 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
         ### self attention
         self_attn: BertSelfAttention = layer.attention.self
         assert(
-            roberta_layer.self_attn.in_proj_weight.shape == torch.Size((3 * config.hidden_size, config.hidden_size))
+            roberta_layer.self_attn.k_proj.weight.data.shape == \
+            roberta_layer.self_attn.q_proj.weight.data.shape == \
+            roberta_layer.self_attn.v_proj.weight.data.shape == \
+            torch.Size((config.hidden_size, config.hidden_size))
         )
-        # we use three distinct linear layers so we split the source layer here.
-        self_attn.query.weight.data = roberta_layer.self_attn.in_proj_weight[:config.hidden_size, :]
-        self_attn.query.bias.data = roberta_layer.self_attn.in_proj_bias[:config.hidden_size]
-        self_attn.key.weight.data = roberta_layer.self_attn.in_proj_weight[config.hidden_size:2*config.hidden_size, :]
-        self_attn.key.bias.data = roberta_layer.self_attn.in_proj_bias[config.hidden_size:2*config.hidden_size]
-        self_attn.value.weight.data = roberta_layer.self_attn.in_proj_weight[2*config.hidden_size:, :]
-        self_attn.value.bias.data = roberta_layer.self_attn.in_proj_bias[2*config.hidden_size:]
+
+        self_attn.query.weight.data = roberta_layer.self_attn.q_proj.weight
+        self_attn.query.bias.data = roberta_layer.self_attn.q_proj.bias
+        self_attn.key.weight.data = roberta_layer.self_attn.k_proj.weight
+        self_attn.key.bias.data = roberta_layer.self_attn.k_proj.bias
+        self_attn.value.weight.data = roberta_layer.self_attn.v_proj.weight
+        self_attn.value.bias.data = roberta_layer.self_attn.v_proj.bias
 
         ### self-attention output
         self_output: BertSelfOutput = layer.attention.output
@@ -151,6 +161,7 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
     if not success:
         raise Exception("Something went wRoNg")
 
+    pathlib.Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True)
     print(f"Saving model to {pytorch_dump_folder_path}")
     model.save_pretrained(pytorch_dump_folder_path)
 
diff --git a/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py b/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py
new file mode 100755
index 0000000000..2b74d2dd93
--- /dev/null
+++ b/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py
@@ -0,0 +1,65 @@
+# coding=utf-8
+# Copyright 2018 The T5 authors and HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Convert T5 checkpoint."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import torch
+
+from transformers import T5Config, T5Model, load_tf_weights_in_t5
+
+import logging
+logging.basicConfig(level=logging.INFO)
+
+def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path):
+    # Initialise PyTorch model
+    config = T5Config.from_json_file(config_file)
+    print("Building PyTorch model from configuration: {}".format(str(config)))
+    model = T5Model(config)
+
+    # Load weights from tf checkpoint
+    load_tf_weights_in_t5(model, config, tf_checkpoint_path)
+
+    # Save pytorch-model
+    print("Save PyTorch model to {}".format(pytorch_dump_path))
+    torch.save(model.state_dict(), pytorch_dump_path)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    ## Required parameters
+    parser.add_argument("--tf_checkpoint_path",
+                        default = None,
+                        type = str,
+                        required = True,
+                        help = "Path to the TensorFlow checkpoint path.")
+    parser.add_argument("--config_file",
+                        default = None,
+                        type = str,
+                        required = True,
+                        help = "The config json file corresponding to the pre-trained T5 model. \n"
+                            "This specifies the model architecture.")
+    parser.add_argument("--pytorch_dump_path",
+                        default = None,
+                        type = str,
+                        required = True,
+                        help = "Path to the output PyTorch model.")
+    args = parser.parse_args()
+    convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path,
+                                     args.config_file,
+                                     args.pytorch_dump_path)
diff --git a/transformers/data/__init__.py b/transformers/data/__init__.py
index 270a053268..5567952fd2 100644
--- a/transformers/data/__init__.py
+++ b/transformers/data/__init__.py
@@ -1,4 +1,4 @@
-from .processors import InputExample, InputFeatures, DataProcessor, SquadFeatures
+from .processors import InputExample, InputFeatures, DataProcessor, SquadFeatures, SingleSentenceClassificationProcessor
 from .processors import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
 from .processors import squad_convert_examples_to_features, SquadExample, SquadV1Processor, SquadV2Processor
 from .processors import xnli_output_modes, xnli_processors, xnli_tasks_num_labels
diff --git a/transformers/data/metrics/squad_metrics.py b/transformers/data/metrics/squad_metrics.py
index 0755c0ab7a..7b03255f49 100644
--- a/transformers/data/metrics/squad_metrics.py
+++ b/transformers/data/metrics/squad_metrics.py
@@ -695,7 +695,12 @@ def compute_predictions_log_probs(
             tok_text = " ".join(tok_text.split())
             orig_text = " ".join(orig_tokens)
 
-            final_text = get_final_text(tok_text, orig_text, tokenizer.do_lower_case,
+            if hasattr(tokenizer, "do_lower_case"):
+                do_lower_case = tokenizer.do_lower_case
+            else:
+                do_lower_case = tokenizer.do_lowercase_and_remove_accent
+
+            final_text = get_final_text(tok_text, orig_text, do_lower_case,
                                         verbose_logging)
 
             if final_text in seen_predictions:
diff --git a/transformers/data/processors/__init__.py b/transformers/data/processors/__init__.py
index 0f1b24893a..4f7307bb7b 100644
--- a/transformers/data/processors/__init__.py
+++ b/transformers/data/processors/__init__.py
@@ -1,4 +1,4 @@
-from .utils import InputExample, InputFeatures, DataProcessor
+from .utils import InputExample, InputFeatures, DataProcessor, SingleSentenceClassificationProcessor
 from .glue import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
 from .squad import squad_convert_examples_to_features, SquadFeatures, SquadExample, SquadV1Processor, SquadV2Processor
 from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels
\ No newline at end of file
diff --git a/transformers/data/processors/glue.py b/transformers/data/processors/glue.py
index 518251b050..11ebd949de 100644
--- a/transformers/data/processors/glue.py
+++ b/transformers/data/processors/glue.py
@@ -133,7 +133,7 @@ def glue_convert_examples_to_features(examples, tokenizer,
     if is_tf_available() and is_tf_dataset:
         def gen():
             for ex in features:
-                yield  ({'input_ids': ex.input_ids,
+                yield ({'input_ids': ex.input_ids,
                          'attention_mask': ex.attention_mask,
                          'token_type_ids': ex.token_type_ids},
                         ex.label)
diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py
index 3d7f832540..8e72bbbd6d 100644
--- a/transformers/data/processors/squad.py
+++ b/transformers/data/processors/squad.py
@@ -18,19 +18,20 @@ if is_tf_available():
 
 logger = logging.getLogger(__name__)
 
-def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
-                        orig_answer_text):
+
+def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, orig_answer_text):
     """Returns tokenized answer spans that better match the annotated answer."""
     tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
 
     for new_start in range(input_start, input_end + 1):
         for new_end in range(input_end, new_start - 1, -1):
-            text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
+            text_span = " ".join(doc_tokens[new_start : (new_end + 1)])
             if text_span == tok_answer_text:
                 return (new_start, new_end)
 
     return (input_start, input_end)
 
+
 def _check_is_max_context(doc_spans, cur_span_index, position):
     """Check if this is the 'max context' doc span for the token."""
     best_score = None
@@ -50,10 +51,11 @@ def _check_is_max_context(doc_spans, cur_span_index, position):
 
     return cur_span_index == best_span_index
 
+
 def _new_check_is_max_context(doc_spans, cur_span_index, position):
     """Check if this is the 'max context' doc span for the token."""
     # if len(doc_spans) == 1:
-        # return True
+    # return True
     best_score = None
     best_span_index = None
     for (span_index, doc_span) in enumerate(doc_spans):
@@ -71,14 +73,16 @@ def _new_check_is_max_context(doc_spans, cur_span_index, position):
 
     return cur_span_index == best_span_index
 
+
 def _is_whitespace(c):
     if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
         return True
     return False
 
-def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
-                                       doc_stride, max_query_length, is_training, 
-                                       return_dataset=False):
+
+def squad_convert_examples_to_features(
+    examples, tokenizer, max_seq_length, doc_stride, max_query_length, is_training, return_dataset=False
+):
     """
     Converts a list of examples into a list of features that can be directly given as input to a model.
     It is model-dependant and takes advantage of many of the tokenizer's features to create the model's inputs.
@@ -112,24 +116,23 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
         )
     """
 
-    # Defining helper methods    
+    # Defining helper methods
     unique_id = 1000000000
 
     features = []
-    for (example_index, example) in enumerate(tqdm(examples)):
+    for (example_index, example) in enumerate(tqdm(examples, desc="Converting examples to features")):
         if is_training and not example.is_impossible:
             # Get start and end position
             start_position = example.start_position
             end_position = example.end_position
 
             # If the answer cannot be found in the text, then skip this example.
-            actual_text = " ".join(example.doc_tokens[start_position:(end_position + 1)])
+            actual_text = " ".join(example.doc_tokens[start_position : (end_position + 1)])
             cleaned_answer_text = " ".join(whitespace_tokenize(example.answer_text))
             if actual_text.find(cleaned_answer_text) == -1:
                 logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text)
                 continue
 
-
         tok_to_orig_index = []
         orig_to_tok_index = []
         all_doc_tokens = []
@@ -140,7 +143,6 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
                 tok_to_orig_index.append(i)
                 all_doc_tokens.append(sub_token)
 
-
         if is_training and not example.is_impossible:
             tok_start_position = orig_to_tok_index[example.start_position]
             if example.end_position < len(example.doc_tokens) - 1:
@@ -153,36 +155,41 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
             )
 
         spans = []
-        
-        truncated_query = tokenizer.encode(example.question_text, add_special_tokens=False, max_length=max_query_length)
-        sequence_added_tokens = tokenizer.max_len - tokenizer.max_len_single_sentence 
-        sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair 
+
+        truncated_query = tokenizer.encode(
+            example.question_text, add_special_tokens=False, max_length=max_query_length
+        )
+        sequence_added_tokens = tokenizer.max_len - tokenizer.max_len_single_sentence
+        sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair
 
         span_doc_tokens = all_doc_tokens
         while len(spans) * doc_stride < len(all_doc_tokens):
-            
+
             encoded_dict = tokenizer.encode_plus(
-                truncated_query if tokenizer.padding_side == "right" else span_doc_tokens, 
-                span_doc_tokens if tokenizer.padding_side == "right" else truncated_query, 
-                max_length=max_seq_length, 
-                return_overflowing_tokens=True, 
+                truncated_query if tokenizer.padding_side == "right" else span_doc_tokens,
+                span_doc_tokens if tokenizer.padding_side == "right" else truncated_query,
+                max_length=max_seq_length,
+                return_overflowing_tokens=True,
                 pad_to_max_length=True,
                 stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens,
-                truncation_strategy='only_second' if tokenizer.padding_side == "right" else 'only_first'
+                truncation_strategy="only_second" if tokenizer.padding_side == "right" else "only_first",
             )
 
-            paragraph_len = min(len(all_doc_tokens) - len(spans) * doc_stride, max_seq_length - len(truncated_query) - sequence_pair_added_tokens)
+            paragraph_len = min(
+                len(all_doc_tokens) - len(spans) * doc_stride,
+                max_seq_length - len(truncated_query) - sequence_pair_added_tokens,
+            )
 
-            if tokenizer.pad_token_id in encoded_dict['input_ids']: 
-                non_padded_ids = encoded_dict['input_ids'][:encoded_dict['input_ids'].index(tokenizer.pad_token_id)]
+            if tokenizer.pad_token_id in encoded_dict["input_ids"]:
+                non_padded_ids = encoded_dict["input_ids"][: encoded_dict["input_ids"].index(tokenizer.pad_token_id)]
             else:
-                non_padded_ids = encoded_dict['input_ids']
+                non_padded_ids = encoded_dict["input_ids"]
 
             tokens = tokenizer.convert_ids_to_tokens(non_padded_ids)
 
             token_to_orig_map = {}
             for i in range(paragraph_len):
-                index = len(truncated_query) + sequence_added_tokens + i if tokenizer.padding_side == "right" else i 
+                index = len(truncated_query) + sequence_added_tokens + i if tokenizer.padding_side == "right" else i
                 token_to_orig_map[index] = tok_to_orig_index[len(spans) * doc_stride + i]
 
             encoded_dict["paragraph_len"] = paragraph_len
@@ -202,16 +209,20 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
         for doc_span_index in range(len(spans)):
             for j in range(spans[doc_span_index]["paragraph_len"]):
                 is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j)
-                index = j if tokenizer.padding_side == "left" else spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j
+                index = (
+                    j
+                    if tokenizer.padding_side == "left"
+                    else spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j
+                )
                 spans[doc_span_index]["token_is_max_context"][index] = is_max_context
 
         for span in spans:
             # Identify the position of the CLS token
-            cls_index = span['input_ids'].index(tokenizer.cls_token_id)
+            cls_index = span["input_ids"].index(tokenizer.cls_token_id)
 
             # p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer)
             # Original TF implem also keep the classification token (set to 0) (not sure why...)
-            p_mask = np.array(span['token_type_ids'])
+            p_mask = np.array(span["token_type_ids"])
 
             p_mask = np.minimum(p_mask, 1)
 
@@ -224,7 +235,6 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
             # Set the CLS index to '0'
             p_mask[cls_index] = 0
 
-
             span_is_impossible = example.is_impossible
             start_position = 0
             end_position = 0
@@ -247,55 +257,99 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
                         doc_offset = 0
                     else:
                         doc_offset = len(truncated_query) + sequence_added_tokens
-                        
+
                     start_position = tok_start_position - doc_start + doc_offset
                     end_position = tok_end_position - doc_start + doc_offset
 
-
-            features.append(SquadFeatures(
-                span['input_ids'],
-                span['attention_mask'],
-                span['token_type_ids'],
-                cls_index,
-                p_mask.tolist(),
-
-                example_index=example_index,
-                unique_id=unique_id,
-                paragraph_len=span['paragraph_len'],
-                token_is_max_context=span["token_is_max_context"],
-                tokens=span["tokens"],
-                token_to_orig_map=span["token_to_orig_map"],
-                
-                start_position=start_position,
-                end_position=end_position
-            ))
+            features.append(
+                SquadFeatures(
+                    span["input_ids"],
+                    span["attention_mask"],
+                    span["token_type_ids"],
+                    cls_index,
+                    p_mask.tolist(),
+                    example_index=example_index,
+                    unique_id=unique_id,
+                    paragraph_len=span["paragraph_len"],
+                    token_is_max_context=span["token_is_max_context"],
+                    tokens=span["tokens"],
+                    token_to_orig_map=span["token_to_orig_map"],
+                    start_position=start_position,
+                    end_position=end_position,
+                )
+            )
 
             unique_id += 1
 
-    if return_dataset == 'pt':
+    if return_dataset == "pt":
         if not is_torch_available():
             raise ImportError("Pytorch must be installed to return a pytorch dataset.")
 
         # Convert to Tensors and build dataset
         all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
-        all_input_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
-        all_segment_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
+        all_attention_masks = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
+        all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
         all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
         all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
 
         if not is_training:
             all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
-            dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
-                                    all_example_index, all_cls_index, all_p_mask)
+            dataset = TensorDataset(
+                all_input_ids, all_attention_masks, all_token_type_ids, all_example_index, all_cls_index, all_p_mask
+            )
         else:
             all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
             all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
-            dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
-                                    all_start_positions, all_end_positions,
-                                    all_cls_index, all_p_mask)
+            dataset = TensorDataset(
+                all_input_ids,
+                all_attention_masks,
+                all_token_type_ids,
+                all_start_positions,
+                all_end_positions,
+                all_cls_index,
+                all_p_mask,
+            )
 
         return features, dataset
-        
+    elif return_dataset == "tf":
+        if not is_tf_available():
+            raise ImportError("TensorFlow must be installed to return a TensorFlow dataset.")
+
+        def gen():
+            for ex in features:
+                yield (
+                    {
+                        "input_ids": ex.input_ids,
+                        "attention_mask": ex.attention_mask,
+                        "token_type_ids": ex.token_type_ids,
+                    }, {
+                        "start_position": ex.start_position,
+                        "end_position": ex.end_position,
+                        "cls_index": ex.cls_index,
+                        "p_mask": ex.p_mask,
+                    }
+                )
+
+        return tf.data.Dataset.from_generator(
+            gen,
+            (
+                {"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32},
+                {"start_position": tf.int64, "end_position": tf.int64, "cls_index": tf.int64, "p_mask": tf.int32},
+            ),
+            (
+                {
+                    "input_ids": tf.TensorShape([None]),
+                    "attention_mask": tf.TensorShape([None]),
+                    "token_type_ids": tf.TensorShape([None]),
+                },
+                {
+                    "start_position": tf.TensorShape([]),
+                    "end_position": tf.TensorShape([]),
+                    "cls_index": tf.TensorShape([]),
+                    "p_mask": tf.TensorShape([None]),
+                },
+            ),
+        )
 
     return features
 
@@ -305,31 +359,32 @@ class SquadProcessor(DataProcessor):
     Processor for the SQuAD data set.
     Overriden by SquadV1Processor and SquadV2Processor, used by the version 1.1 and version 2.0 of SQuAD, respectively.
     """
+
     train_file = None
     dev_file = None
 
     def _get_example_from_tensor_dict(self, tensor_dict, evaluate=False):
         if not evaluate:
-            answer = tensor_dict['answers']['text'][0].numpy().decode('utf-8')
-            answer_start = tensor_dict['answers']['answer_start'][0].numpy()
+            answer = tensor_dict["answers"]["text"][0].numpy().decode("utf-8")
+            answer_start = tensor_dict["answers"]["answer_start"][0].numpy()
             answers = []
         else:
-            answers = [{
-                "answer_start": start.numpy(), 
-                "text": text.numpy().decode('utf-8')
-            } for start, text in zip(tensor_dict['answers']["answer_start"], tensor_dict['answers']["text"])]
+            answers = [
+                {"answer_start": start.numpy(), "text": text.numpy().decode("utf-8")}
+                for start, text in zip(tensor_dict["answers"]["answer_start"], tensor_dict["answers"]["text"])
+            ]
 
             answer = None
             answer_start = None
 
         return SquadExample(
-            qas_id=tensor_dict['id'].numpy().decode("utf-8"),
-            question_text=tensor_dict['question'].numpy().decode('utf-8'),
-            context_text=tensor_dict['context'].numpy().decode('utf-8'),
+            qas_id=tensor_dict["id"].numpy().decode("utf-8"),
+            question_text=tensor_dict["question"].numpy().decode("utf-8"),
+            context_text=tensor_dict["context"].numpy().decode("utf-8"),
             answer_text=answer,
             start_position_character=answer_start,
-            title=tensor_dict['title'].numpy().decode('utf-8'),
-            answers=answers
+            title=tensor_dict["title"].numpy().decode("utf-8"),
+            answers=answers,
         )
 
     def get_examples_from_dataset(self, dataset, evaluate=False):
@@ -359,7 +414,7 @@ class SquadProcessor(DataProcessor):
 
         examples = []
         for tensor_dict in tqdm(dataset):
-            examples.append(self._get_example_from_tensor_dict(tensor_dict, evaluate=evaluate)) 
+            examples.append(self._get_example_from_tensor_dict(tensor_dict, evaluate=evaluate))
 
         return examples
 
@@ -373,10 +428,15 @@ class SquadProcessor(DataProcessor):
                 which is `train-v1.1.json` and `train-v2.0.json` for squad versions 1.1 and 2.0 respectively.
 
         """
+        if data_dir is None:
+            data_dir = ""
+
         if self.train_file is None:
             raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor")
 
-        with open(os.path.join(data_dir, self.train_file if filename is None else filename), "r", encoding='utf-8') as reader:
+        with open(
+            os.path.join(data_dir, self.train_file if filename is None else filename), "r", encoding="utf-8"
+        ) as reader:
             input_data = json.load(reader)["data"]
         return self._create_examples(input_data, "train")
 
@@ -389,10 +449,15 @@ class SquadProcessor(DataProcessor):
             filename: None by default, specify this if the evaluation file has a different name than the original one
                 which is `train-v1.1.json` and `train-v2.0.json` for squad versions 1.1 and 2.0 respectively.
         """
+        if data_dir is None:
+            data_dir = ""
+
         if self.dev_file is None:
             raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor")
-        
-        with open(os.path.join(data_dir, self.dev_file if filename is None else filename), "r", encoding='utf-8') as reader:
+
+        with open(
+            os.path.join(data_dir, self.dev_file if filename is None else filename), "r", encoding="utf-8"
+        ) as reader:
             input_data = json.load(reader)["data"]
         return self._create_examples(input_data, "dev")
 
@@ -400,7 +465,7 @@ class SquadProcessor(DataProcessor):
         is_training = set_type == "train"
         examples = []
         for entry in tqdm(input_data):
-            title = entry['title']
+            title = entry["title"]
             for paragraph in entry["paragraphs"]:
                 context_text = paragraph["context"]
                 for qa in paragraph["qas"]:
@@ -409,7 +474,7 @@ class SquadProcessor(DataProcessor):
                     start_position_character = None
                     answer_text = None
                     answers = []
-                    
+
                     if "is_impossible" in qa:
                         is_impossible = qa["is_impossible"]
                     else:
@@ -418,8 +483,8 @@ class SquadProcessor(DataProcessor):
                     if not is_impossible:
                         if is_training:
                             answer = qa["answers"][0]
-                            answer_text = answer['text']
-                            start_position_character = answer['answer_start']
+                            answer_text = answer["text"]
+                            start_position_character = answer["answer_start"]
                         else:
                             answers = qa["answers"]
 
@@ -431,12 +496,13 @@ class SquadProcessor(DataProcessor):
                         start_position_character=start_position_character,
                         title=title,
                         is_impossible=is_impossible,
-                        answers=answers
+                        answers=answers,
                     )
 
                     examples.append(example)
         return examples
 
+
 class SquadV1Processor(SquadProcessor):
     train_file = "train-v1.1.json"
     dev_file = "dev-v1.1.json"
@@ -445,7 +511,7 @@ class SquadV1Processor(SquadProcessor):
 class SquadV2Processor(SquadProcessor):
     train_file = "train-v2.0.json"
     dev_file = "dev-v2.0.json"
-    
+
 
 class SquadExample(object):
     """
@@ -462,21 +528,23 @@ class SquadExample(object):
         is_impossible: False by default, set to True if the example has no possible answer.
     """
 
-    def __init__(self,
-                 qas_id,
-                 question_text,
-                 context_text,
-                 answer_text,
-                 start_position_character,
-                 title,
-                 answers=[],
-                 is_impossible=False):
+    def __init__(
+        self,
+        qas_id,
+        question_text,
+        context_text,
+        answer_text,
+        start_position_character,
+        title,
+        answers=[],
+        is_impossible=False,
+    ):
         self.qas_id = qas_id
         self.question_text = question_text
         self.context_text = context_text
         self.answer_text = answer_text
         self.title = title
-        self.is_impossible = is_impossible 
+        self.is_impossible = is_impossible
         self.answers = answers
 
         self.start_position, self.end_position = 0, 0
@@ -503,7 +571,9 @@ class SquadExample(object):
         # Start end end positions only has a value during evaluation.
         if start_position_character is not None and not is_impossible:
             self.start_position = char_to_word_offset[start_position_character]
-            self.end_position = char_to_word_offset[start_position_character + len(answer_text) - 1]
+            self.end_position = char_to_word_offset[
+                min(start_position_character + len(answer_text) - 1, len(char_to_word_offset) - 1)
+            ]
 
 
 class SquadFeatures(object):
@@ -531,24 +601,23 @@ class SquadFeatures(object):
         end_position: end of the answer token index 
     """
 
-    def __init__(self,
-                 input_ids,
-                 attention_mask,
-                 token_type_ids,
-                 cls_index,
-                 p_mask,
-                 
-                 example_index,
-                 unique_id,
-                 paragraph_len,
-                 token_is_max_context,
-                 tokens,
-                 token_to_orig_map,
-
-                 start_position,
-                 end_position
-        ):
-        self.input_ids = input_ids 
+    def __init__(
+        self,
+        input_ids,
+        attention_mask,
+        token_type_ids,
+        cls_index,
+        p_mask,
+        example_index,
+        unique_id,
+        paragraph_len,
+        token_is_max_context,
+        tokens,
+        token_to_orig_map,
+        start_position,
+        end_position,
+    ):
+        self.input_ids = input_ids
         self.attention_mask = attention_mask
         self.token_type_ids = token_type_ids
         self.cls_index = cls_index
@@ -574,12 +643,13 @@ class SquadResult(object):
         start_logits: The logits corresponding to the start of the answer
         end_logits: The logits corresponding to the end of the answer
     """
+
     def __init__(self, unique_id, start_logits, end_logits, start_top_index=None, end_top_index=None, cls_logits=None):
         self.start_logits = start_logits
         self.end_logits = end_logits
         self.unique_id = unique_id
-        
+
         if start_top_index:
             self.start_top_index = start_top_index
             self.end_top_index = end_top_index
-            self.cls_logits = cls_logits
\ No newline at end of file
+            self.cls_logits = cls_logits
diff --git a/transformers/data/processors/utils.py b/transformers/data/processors/utils.py
index 07bdf3150c..ee234e6e90 100644
--- a/transformers/data/processors/utils.py
+++ b/transformers/data/processors/utils.py
@@ -18,6 +18,11 @@ import csv
 import sys
 import copy
 import json
+import logging
+
+from ...file_utils import is_tf_available, is_torch_available
+
+logger = logging.getLogger(__name__)
 
 class InputExample(object):
     """
@@ -64,7 +69,7 @@ class InputFeatures(object):
         label: Label corresponding to the input
     """
 
-    def __init__(self, input_ids, attention_mask, token_type_ids, label):
+    def __init__(self, input_ids, attention_mask=None, token_type_ids=None, label=None):
         self.input_ids = input_ids
         self.attention_mask = attention_mask
         self.token_type_ids = token_type_ids
@@ -86,34 +91,6 @@ class InputFeatures(object):
 class DataProcessor(object):
     """Base class for data converters for sequence classification data sets."""
 
-    def get_example_from_tensor_dict(self, tensor_dict):
-        """Gets an example from a dict with tensorflow tensors
-
-        Args:
-            tensor_dict: Keys and values should match the corresponding Glue
-                tensorflow_dataset examples.
-        """
-        raise NotImplementedError()
-
-    def get_train_examples(self, data_dir):
-        """Gets a collection of `InputExample`s for the train set."""
-        raise NotImplementedError()
-
-    def get_dev_examples(self, data_dir):
-        """Gets a collection of `InputExample`s for the dev set."""
-        raise NotImplementedError()
-
-    def get_labels(self):
-        """Gets the list of labels for this data set."""
-        raise NotImplementedError()
-
-    def tfds_map(self, example):
-        """Some tensorflow_datasets datasets are not formatted the same way the GLUE datasets are. 
-        This method converts examples to the correct format."""
-        if len(self.get_labels()) > 1:
-            example.label = self.get_labels()[int(example.label)]
-        return example
-
     @classmethod
     def _read_tsv(cls, input_file, quotechar=None):
         """Reads a tab separated value file."""
@@ -125,3 +102,215 @@ class DataProcessor(object):
                     line = list(unicode(cell, 'utf-8') for cell in line)
                 lines.append(line)
             return lines
+
+
+class SingleSentenceClassificationProcessor(DataProcessor):
+    """ Generic processor for a single sentence classification data set."""
+    def __init__(self, labels=None, examples=None, mode='classification', verbose=False):
+        self.labels = [] if labels is None else labels
+        self.examples = [] if examples is None else examples
+        self.mode = mode
+        self.verbose = verbose
+
+    def __len__(self):
+        return len(self.examples)
+
+    def __getitem__(self, idx):
+        if isinstance(idx, slice):
+            return SingleSentenceClassificationProcessor(labels=self.labels,
+                                                         examples=self.examples[idx])
+        return self.examples[idx]
+
+    @classmethod
+    def create_from_csv(cls, file_name, split_name='', column_label=0, column_text=1,
+                        column_id=None, skip_first_row=False, **kwargs):
+        processor = cls(**kwargs)
+        processor.add_examples_from_csv(file_name,
+                                        split_name=split_name,
+                                        column_label=column_label,
+                                        column_text=column_text,
+                                        column_id=column_id,
+                                        skip_first_row=skip_first_row,
+                                        overwrite_labels=True,
+                                        overwrite_examples=True)
+        return processor
+
+    @classmethod
+    def create_from_examples(cls, texts_or_text_and_labels, labels=None, **kwargs):
+        processor = cls(**kwargs)
+        processor.add_examples(texts_or_text_and_labels, labels=labels)
+        return processor
+
+    def add_examples_from_csv(self, file_name, split_name='', column_label=0, column_text=1, column_id=None,
+                              skip_first_row=False, overwrite_labels=False, overwrite_examples=False):
+        lines = self._read_tsv(file_name)
+        if skip_first_row:
+            lines = lines[1:]
+        texts = []
+        labels = []
+        ids = []
+        for (i, line) in enumerate(lines):
+            texts.append(line[column_text])
+            labels.append(line[column_label])
+            if column_id is not None:
+                ids.append(line[column_id])
+            else:
+                guid = "%s-%s" % (split_name, i) if split_name else "%s" % i
+                ids.append(guid)
+
+        return self.add_examples(texts, labels, ids, overwrite_labels=overwrite_labels, overwrite_examples=overwrite_examples)
+
+    def add_examples(self, texts_or_text_and_labels, labels=None, ids=None,
+                     overwrite_labels=False, overwrite_examples=False):
+        assert labels is None or len(texts_or_text_and_labels) == len(labels)
+        assert ids is None or len(texts_or_text_and_labels) == len(ids)
+        if ids is None:
+            ids = [None] * len(texts_or_text_and_labels)
+        if labels is None:
+            labels = [None] * len(texts_or_text_and_labels)
+        examples = []
+        added_labels = set()
+        for (text_or_text_and_label, label, guid) in zip(texts_or_text_and_labels, labels, ids):
+            if isinstance(text_or_text_and_label, (tuple, list)) and label is None:
+                text, label = text_or_text_and_label
+            else:
+                text = text_or_text_and_label
+            added_labels.add(label)
+            examples.append(InputExample(guid=guid, text_a=text, text_b=None, label=label))
+
+        # Update examples
+        if overwrite_examples:
+            self.examples = examples
+        else:
+            self.examples.extend(examples)
+
+        # Update labels
+        if overwrite_labels:
+            self.labels = list(added_labels)
+        else:
+            self.labels = list(set(self.labels).union(added_labels))
+
+        return self.examples
+
+    def get_features(self,
+                     tokenizer,
+                     max_length=None,
+                     pad_on_left=False,
+                     pad_token=0,
+                     mask_padding_with_zero=True,
+                     return_tensors=None):
+        """
+        Convert examples in a list of ``InputFeatures``
+
+        Args:
+            tokenizer: Instance of a tokenizer that will tokenize the examples
+            max_length: Maximum example length
+            task: GLUE task
+            label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method
+            output_mode: String indicating the output mode. Either ``regression`` or ``classification``
+            pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default)
+            pad_token: Padding token
+            mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values
+                and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for
+                actual values)
+
+        Returns:
+            If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset``
+            containing the task-specific features. If the input is a list of ``InputExamples``, will return
+            a list of task-specific ``InputFeatures`` which can be fed to the model.
+
+        """
+        if max_length is None:
+            max_length = tokenizer.max_len
+
+        label_map = {label: i for i, label in enumerate(self.labels)}
+
+        all_input_ids = []
+        for (ex_index, example) in enumerate(self.examples):
+            if ex_index % 10000 == 0:
+                logger.info("Tokenizing example %d", ex_index)
+
+            input_ids = tokenizer.encode(
+                example.text_a,
+                add_special_tokens=True,
+                max_length=min(max_length, tokenizer.max_len),
+            )
+            all_input_ids.append(input_ids)
+
+        batch_length = max(len(input_ids) for input_ids in all_input_ids)
+
+        features = []
+        for (ex_index, (input_ids, example)) in enumerate(zip(all_input_ids, self.examples)):
+            if ex_index % 10000 == 0:
+                logger.info("Writing example %d", ex_index)
+            # The mask has 1 for real tokens and 0 for padding tokens. Only real
+            # tokens are attended to.
+            attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
+
+            # Zero-pad up to the sequence length.
+            padding_length = batch_length - len(input_ids)
+            if pad_on_left:
+                input_ids = ([pad_token] * padding_length) + input_ids
+                attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
+            else:
+                input_ids = input_ids + ([pad_token] * padding_length)
+                attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
+
+            assert len(input_ids) == batch_length, "Error with input length {} vs {}".format(len(input_ids), batch_length)
+            assert len(attention_mask) == batch_length, "Error with input length {} vs {}".format(len(attention_mask), batch_length)
+
+            if self.mode == "classification":
+                label = label_map[example.label]
+            elif self.mode == "regression":
+                label = float(example.label)
+            else:
+                raise ValueError(self.mode)
+
+            if ex_index < 5 and self.verbose:
+                logger.info("*** Example ***")
+                logger.info("guid: %s" % (example.guid))
+                logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
+                logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
+                logger.info("label: %s (id = %d)" % (example.label, label))
+
+            features.append(
+                    InputFeatures(input_ids=input_ids,
+                                  attention_mask=attention_mask,
+                                  label=label))
+
+        if return_tensors is None:
+            return features
+        elif return_tensors == 'tf':
+            if not is_tf_available():
+                raise ImportError("return_tensors set to 'tf' but TensorFlow 2.0 can't be imported")
+            import tensorflow as tf
+            def gen():
+                for ex in features:
+                    yield  ({'input_ids': ex.input_ids,
+                            'attention_mask': ex.attention_mask},
+                            ex.label)
+
+            dataset = tf.data.Dataset.from_generator(gen,
+                    ({'input_ids': tf.int32,
+                    'attention_mask': tf.int32},
+                    tf.int64),
+                    ({'input_ids': tf.TensorShape([None]),
+                    'attention_mask': tf.TensorShape([None])},
+                    tf.TensorShape([])))
+            return dataset
+        elif return_tensors == 'pt':
+            if not is_torch_available():
+                raise ImportError("return_tensors set to 'pt' but PyTorch can't be imported")
+            import torch
+            from torch.utils.data import TensorDataset
+            all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
+            all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
+            if self.mode == "classification":
+                all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
+            elif self.mode == "regression":
+                all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
+
+            dataset = TensorDataset(all_input_ids, all_attention_mask, all_labels)
+            return dataset
+        else:
+            raise ValueError("return_tensors should be one of 'tf' or 'pt'")
diff --git a/transformers/file_utils.py b/transformers/file_utils.py
index 24abd60781..ec925c6160 100644
--- a/transformers/file_utils.py
+++ b/transformers/file_utils.py
@@ -10,10 +10,9 @@ import json
 import logging
 import os
 import six
-import shutil
 import tempfile
 import fnmatch
-from functools import wraps
+from functools import partial, wraps
 from hashlib import sha256
 from io import open
 
@@ -21,26 +20,38 @@ import boto3
 from botocore.config import Config
 from botocore.exceptions import ClientError
 import requests
-from tqdm import tqdm
+from tqdm.auto import tqdm
 from contextlib import contextmanager
+from . import __version__
+
+from filelock import FileLock
 
 logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
 
 try:
-    import tensorflow as tf
-    assert hasattr(tf, '__version__') and int(tf.__version__[0]) >= 2
-    _tf_available = True  # pylint: disable=invalid-name
-    logger.info("TensorFlow version {} available.".format(tf.__version__))
-except (ImportError, AssertionError):
-    _tf_available = False  # pylint: disable=invalid-name
-
-try:
-    import torch
-    _torch_available = True  # pylint: disable=invalid-name
-    logger.info("PyTorch version {} available.".format(torch.__version__))
+    os.environ.setdefault('USE_TORCH', 'YES')
+    if os.environ['USE_TORCH'].upper() in ('1', 'ON', 'YES'):
+        import torch
+        _torch_available = True  # pylint: disable=invalid-name
+        logger.info("PyTorch version {} available.".format(torch.__version__))
+    else:
+        logger.info("USE_TORCH override through env variable, disabling PyTorch")
+        _torch_available = False
 except ImportError:
     _torch_available = False  # pylint: disable=invalid-name
 
+try:
+    os.environ.setdefault('USE_TF', 'YES')
+    if os.environ['USE_TF'].upper() in ('1', 'ON', 'YES'):
+        import tensorflow as tf
+        assert hasattr(tf, '__version__') and int(tf.__version__[0]) >= 2
+        _tf_available = True  # pylint: disable=invalid-name
+        logger.info("TensorFlow version {} available.".format(tf.__version__))
+    else:
+        logger.info("USE_TF override through env variable, disabling Tensorflow")
+        _tf_available = False
+except (ImportError, AssertionError):
+    _tf_available = False  # pylint: disable=invalid-name
 
 try:
     from torch.hub import _get_torch_home
@@ -72,11 +83,20 @@ WEIGHTS_NAME = "pytorch_model.bin"
 TF2_WEIGHTS_NAME = 'tf_model.h5'
 TF_WEIGHTS_NAME = 'model.ckpt'
 CONFIG_NAME = "config.json"
+MODEL_CARD_NAME = "modelcard.json"
+
+DUMMY_INPUTS = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
+DUMMY_MASK = [[1, 1, 1, 1, 1], [1, 1, 1, 0, 0], [0, 0, 0, 1, 1]]
+
+S3_BUCKET_PREFIX = "https://s3.amazonaws.com/models.huggingface.co/bert"
+CLOUDFRONT_DISTRIB_PREFIX = "https://d2ws9o8vfrpkyk.cloudfront.net"
+
 
 def is_torch_available():
     return _torch_available
 
 def is_tf_available():
+
     return _tf_available
 
 if not six.PY2:
@@ -103,12 +123,25 @@ else:
             return fn
         return docstring_decorator
 
+
+def is_remote_url(url_or_filename):
+    parsed = urlparse(url_or_filename)
+    return parsed.scheme in ('http', 'https', 's3')
+
+def hf_bucket_url(identifier, postfix=None, cdn=False):
+    endpoint = CLOUDFRONT_DISTRIB_PREFIX if cdn else S3_BUCKET_PREFIX
+    if postfix is None:
+        return "/".join((endpoint, identifier))
+    else:
+        return "/".join((endpoint, identifier, postfix))
+
+
 def url_to_filename(url, etag=None):
     """
     Convert `url` into a hashed filename in a repeatable way.
     If `etag` is specified, append its hash to the url's, delimited
     by a period.
-    If the url ends with .h5 (Keras HDF5 weights) ands '.h5' to the name
+    If the url ends with .h5 (Keras HDF5 weights) adds '.h5' to the name
     so that TF 2.0 can identify it as a HDF5 file
     (see https://github.com/tensorflow/tensorflow/blob/00fad90125b18b80fe054de1055770cfb8fe4ba3/tensorflow/python/keras/engine/network.py#L1380)
     """
@@ -153,7 +186,7 @@ def filename_to_url(filename, cache_dir=None):
     return url, etag
 
 
-def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=None, resume_download=False):
+def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=None, resume_download=False, user_agent=None):
     """
     Given something that might be a URL (or might be a local path),
     determine which. If it's a URL, download the file and cache it, and
@@ -163,6 +196,7 @@ def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=N
         cache_dir: specify a cache directory to save the file to (overwrite the default cache dir).
         force_download: if True, re-dowload the file even if it's already cached in the cache dir.
         resume_download: if True, resume the download if incompletly recieved file is found.
+        user_agent: Optional string or dict that will be appended to the user-agent on remote requests.
     """
     if cache_dir is None:
         cache_dir = TRANSFORMERS_CACHE
@@ -171,17 +205,15 @@ def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=N
     if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
         cache_dir = str(cache_dir)
 
-    parsed = urlparse(url_or_filename)
-
-    if parsed.scheme in ('http', 'https', 's3'):
+    if is_remote_url(url_or_filename):
         # URL, so get it from the cache (downloading if necessary)
         return get_from_cache(url_or_filename, cache_dir=cache_dir,
             force_download=force_download, proxies=proxies,
-            resume_download=resume_download)
+            resume_download=resume_download, user_agent=user_agent)
     elif os.path.exists(url_or_filename):
         # File, and it exists.
         return url_or_filename
-    elif parsed.scheme == '':
+    elif urlparse(url_or_filename).scheme == '':
         # File, but it doesn't exist.
         raise EnvironmentError("file {} not found".format(url_or_filename))
     else:
@@ -238,14 +270,26 @@ def s3_get(url, temp_file, proxies=None):
     s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file)
 
 
-def http_get(url, temp_file, proxies=None, resume_size=0):
-    headers={'Range':'bytes=%d-'%(resume_size,)} if resume_size > 0 else None
+def http_get(url, temp_file, proxies=None, resume_size=0, user_agent=None):
+    ua = "transformers/{}; python/{}".format(__version__, sys.version.split()[0])
+    if isinstance(user_agent, dict):
+        ua += "; " + "; ".join(
+            "{}/{}".format(k, v) for k, v in user_agent.items()
+        )
+    elif isinstance(user_agent, six.string_types):
+        ua += "; "+ user_agent
+    headers = {
+        "user-agent": ua
+    }
+    if resume_size > 0:
+        headers['Range'] = 'bytes=%d-' % (resume_size,)
     response = requests.get(url, stream=True, proxies=proxies, headers=headers)
     if response.status_code == 416:  # Range not satisfiable
         return
     content_length = response.headers.get('Content-Length')
     total = resume_size + int(content_length) if content_length is not None else None
-    progress = tqdm(unit="B", total=total, initial=resume_size)
+    progress = tqdm(unit="B", unit_scale=True, total=total, initial=resume_size,
+                    desc="Downloading", disable=bool(logger.level<=logging.INFO))
     for chunk in response.iter_content(chunk_size=1024):
         if chunk: # filter out keep-alive new chunks
             progress.update(len(chunk))
@@ -253,7 +297,7 @@ def http_get(url, temp_file, proxies=None, resume_size=0):
     progress.close()
 
 
-def get_from_cache(url, cache_dir=None, force_download=False, proxies=None, etag_timeout=10, resume_download=False):
+def get_from_cache(url, cache_dir=None, force_download=False, proxies=None, etag_timeout=10, resume_download=False, user_agent=None):
     """
     Given a URL, look for the corresponding dataset in the local cache.
     If it's not there, download it. Then return the path to the cached file.
@@ -291,59 +335,60 @@ def get_from_cache(url, cache_dir=None, force_download=False, proxies=None, etag
     # If we don't have a connection (etag is None) and can't identify the file
     # try to get the last downloaded one
     if not os.path.exists(cache_path) and etag is None:
-        matching_files = fnmatch.filter(os.listdir(cache_dir), filename + '.*')
-        matching_files = list(filter(lambda s: not s.endswith('.json'), matching_files))
+        matching_files = [
+            file
+            for file in fnmatch.filter(os.listdir(cache_dir), filename + '.*')
+            if not file.endswith('.json') and not file.endswith('.lock')
+        ]
         if matching_files:
             cache_path = os.path.join(cache_dir, matching_files[-1])
 
-    if resume_download:
-        incomplete_path = cache_path + '.incomplete'
-        @contextmanager
-        def _resumable_file_manager():
-            with open(incomplete_path,'a+b') as f:
-                yield f
-            os.remove(incomplete_path)
-        temp_file_manager = _resumable_file_manager
-        if os.path.exists(incomplete_path):
-            resume_size = os.stat(incomplete_path).st_size
-        else:
-            resume_size = 0
-    else:
-        temp_file_manager = tempfile.NamedTemporaryFile
-        resume_size = 0
+    # Prevent parallel downloads of the same file with a lock.
+    lock_path = cache_path + '.lock'
+    with FileLock(lock_path):
 
-    if not os.path.exists(cache_path) or force_download:
-        # Download to temporary file, then copy to cache dir once finished.
-        # Otherwise you get corrupt cache entries if the download gets interrupted.
-        with temp_file_manager() as temp_file:
-            logger.info("%s not found in cache or force_download set to True, downloading to %s", url, temp_file.name)
-
-            # GET file object
-            if url.startswith("s3://"):
-                if resume_download:
-                    logger.warn('Warning: resumable downloads are not implemented for "s3://" urls')
-                s3_get(url, temp_file, proxies=proxies)
+        if resume_download:
+            incomplete_path = cache_path + '.incomplete'
+            @contextmanager
+            def _resumable_file_manager():
+                with open(incomplete_path,'a+b') as f:
+                    yield f
+            temp_file_manager = _resumable_file_manager
+            if os.path.exists(incomplete_path):
+                resume_size = os.stat(incomplete_path).st_size
             else:
-                http_get(url, temp_file, proxies=proxies, resume_size=resume_size)
+                resume_size = 0
+        else:
+            temp_file_manager = partial(tempfile.NamedTemporaryFile, dir=cache_dir, delete=False)
+            resume_size = 0
 
-            # we are copying the file before closing it, so flush to avoid truncation
-            temp_file.flush()
-            # shutil.copyfileobj() starts at the current position, so go to the start
-            temp_file.seek(0)
+        if etag is not None and (not os.path.exists(cache_path) or force_download):
+            # Download to temporary file, then copy to cache dir once finished.
+            # Otherwise you get corrupt cache entries if the download gets interrupted.
+            with temp_file_manager() as temp_file:
+                logger.info("%s not found in cache or force_download set to True, downloading to %s", url, temp_file.name)
 
-            logger.info("copying %s to cache at %s", temp_file.name, cache_path)
-            with open(cache_path, 'wb') as cache_file:
-                shutil.copyfileobj(temp_file, cache_file)
+                # GET file object
+                if url.startswith("s3://"):
+                    if resume_download:
+                        logger.warn('Warning: resumable downloads are not implemented for "s3://" urls')
+                    s3_get(url, temp_file, proxies=proxies)
+                else:
+                    http_get(url, temp_file, proxies=proxies, resume_size=resume_size, user_agent=user_agent)
 
-            logger.info("creating metadata file for %s", cache_path)
-            meta = {'url': url, 'etag': etag}
-            meta_path = cache_path + '.json'
-            with open(meta_path, 'w') as meta_file:
-                output_string = json.dumps(meta)
-                if sys.version_info[0] == 2 and isinstance(output_string, str):
-                    output_string = unicode(output_string, 'utf-8')  # The beauty of python 2
-                meta_file.write(output_string)
+                # we are copying the file before closing it, so flush to avoid truncation
+                temp_file.flush()
 
-            logger.info("removing temp file %s", temp_file.name)
+                logger.info("storing %s in cache at %s", url, cache_path)
+                os.rename(temp_file.name, cache_path)
+
+                logger.info("creating metadata file for %s", cache_path)
+                meta = {'url': url, 'etag': etag}
+                meta_path = cache_path + '.json'
+                with open(meta_path, 'w') as meta_file:
+                    output_string = json.dumps(meta)
+                    if sys.version_info[0] == 2 and isinstance(output_string, str):
+                        output_string = unicode(output_string, 'utf-8')  # The beauty of python 2
+                    meta_file.write(output_string)
 
     return cache_path
diff --git a/transformers/hf_api.py b/transformers/hf_api.py
index 3bbb6c567a..170732339a 100644
--- a/transformers/hf_api.py
+++ b/transformers/hf_api.py
@@ -131,8 +131,9 @@ class HfApi:
         # the client still has to specify it when uploading the file.
         with open(filepath, "rb") as f:
             pf = TqdmProgressFileReader(f)
+            data = f if pf.total_size > 0 else ""
 
-            r = requests.put(urls.write, data=f, headers={
+            r = requests.put(urls.write, data=data, headers={
                 "content-type": urls.type,
             })
             r.raise_for_status()
diff --git a/transformers/modelcard.py b/transformers/modelcard.py
new file mode 100644
index 0000000000..4a879235ae
--- /dev/null
+++ b/transformers/modelcard.py
@@ -0,0 +1,229 @@
+# coding=utf-8
+# Copyright 2018 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Configuration base class and utilities."""
+
+from __future__ import (absolute_import, division, print_function,
+                        unicode_literals)
+
+import copy
+import json
+import logging
+import os
+from io import open
+
+from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP
+
+from .file_utils import CONFIG_NAME, MODEL_CARD_NAME, WEIGHTS_NAME, TF2_WEIGHTS_NAME, \
+                        cached_path, is_remote_url, hf_bucket_url
+
+
+logger = logging.getLogger(__name__)
+
+
+class ModelCard(object):
+    r""" Model Card class.
+        Store model card as well as methods for loading/downloading/saving model cards.
+
+        Please read the following paper for details and explanation on the sections:
+            "Model Cards for Model Reporting"
+                by Margaret Mitchell, Simone Wu,
+                Andrew Zaldivar, Parker Barnes, Lucy Vasserman, Ben Hutchinson, Elena Spitzer,
+                Inioluwa Deborah Raji and Timnit Gebru for the proposal behind model cards.
+            Link: https://arxiv.org/abs/1810.03993
+
+        Note:
+            A model card can be loaded and saved to disk.
+
+        Parameters:
+    """
+    def __init__(self, **kwargs):
+        # Recomended attributes from https://arxiv.org/abs/1810.03993 (see papers)
+        self.model_details = kwargs.pop('model_details', {})
+        self.intended_use = kwargs.pop('intended_use', {})
+        self.factors = kwargs.pop('factors', {})
+        self.metrics = kwargs.pop('metrics', {})
+        self.evaluation_data = kwargs.pop('evaluation_data', {})
+        self.training_data = kwargs.pop('training_data', {})
+        self.quantitative_analyses = kwargs.pop('quantitative_analyses', {})
+        self.ethical_considerations = kwargs.pop('ethical_considerations', {})
+        self.caveats_and_recommendations = kwargs.pop('caveats_and_recommendations', {})
+
+        # Open additional attributes
+        for key, value in kwargs.items():
+            try:
+                setattr(self, key, value)
+            except AttributeError as err:
+                logger.error("Can't set {} with value {} for {}".format(key, value, self))
+                raise err
+
+    def save_pretrained(self, save_directory_or_file):
+        """ Save a model card object to the directory or file `save_directory_or_file`.
+        """
+        if os.path.isdir(save_directory_or_file):
+            # If we save using the predefined names, we can load using `from_pretrained`
+            output_model_card_file = os.path.join(save_directory_or_file, MODEL_CARD_NAME)
+        else:
+            output_model_card_file = save_directory_or_file
+
+        self.to_json_file(output_model_card_file)
+        logger.info("Model card saved in {}".format(output_model_card_file))
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+        r""" Instantiate a :class:`~transformers.ModelCard` from a pre-trained model model card.
+
+        Parameters:
+            pretrained_model_name_or_path: either:
+
+                - a string with the `shortcut name` of a pre-trained model card to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a pre-trained model card that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
+                - a path to a `directory` containing a mode card file saved using the :func:`~transformers.ModelCard.save_pretrained` method, e.g.: ``./my_model_directory/``.
+                - a path or url to a saved model card JSON `file`, e.g.: ``./my_model_directory/modelcard.json``.
+
+            cache_dir: (`optional`) string:
+                Path to a directory in which a downloaded pre-trained model
+                card should be cached if the standard cache should not be used.
+
+            kwargs: (`optional`) dict: key/value pairs with which to update the ModelCard object after loading.
+
+                - The values in kwargs of any keys which are model card attributes will be used to override the loaded values.
+                - Behavior concerning key/value pairs whose keys are *not* model card attributes is controlled by the `return_unused_kwargs` keyword parameter.
+
+            proxies: (`optional`) dict, default None:
+                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
+                The proxies are used on each request.
+
+            find_from_standard_name: (`optional`) boolean, default True:
+                If the pretrained_model_name_or_path ends with our standard model or config filenames, replace them with our standard modelcard filename.
+                Can be used to directly feed a model/config url and access the colocated modelcard.
+
+            return_unused_kwargs: (`optional`) bool:
+
+                - If False, then this function returns just the final model card object.
+                - If True, then this functions returns a tuple `(model card, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not model card attributes: ie the part of kwargs which has not been used to update `ModelCard` and is otherwise ignored.
+
+        Examples::
+
+            modelcard = ModelCard.from_pretrained('bert-base-uncased')    # Download model card from S3 and cache.
+            modelcard = ModelCard.from_pretrained('./test/saved_model/')  # E.g. model card was saved using `save_pretrained('./test/saved_model/')`
+            modelcard = ModelCard.from_pretrained('./test/saved_model/modelcard.json')
+            modelcard = ModelCard.from_pretrained('bert-base-uncased', output_attention=True, foo=False)
+
+        """
+        cache_dir = kwargs.pop('cache_dir', None)
+        proxies = kwargs.pop('proxies', None)
+        find_from_standard_name = kwargs.pop('find_from_standard_name', True)
+        return_unused_kwargs = kwargs.pop('return_unused_kwargs', False)
+
+        if pretrained_model_name_or_path in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP:
+            # For simplicity we use the same pretrained url than the configuration files
+            # but with a different suffix (modelcard.json). This suffix is replaced below.
+            model_card_file = ALL_PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path]
+        elif os.path.isdir(pretrained_model_name_or_path):
+            model_card_file = os.path.join(pretrained_model_name_or_path, MODEL_CARD_NAME)
+        elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
+            model_card_file = pretrained_model_name_or_path
+        else:
+            model_card_file = hf_bucket_url(pretrained_model_name_or_path, postfix=MODEL_CARD_NAME)
+
+        if find_from_standard_name or pretrained_model_name_or_path in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP:
+            model_card_file = model_card_file.replace(CONFIG_NAME, MODEL_CARD_NAME)
+            model_card_file = model_card_file.replace(WEIGHTS_NAME, MODEL_CARD_NAME)
+            model_card_file = model_card_file.replace(TF2_WEIGHTS_NAME, MODEL_CARD_NAME)
+
+        try:
+            # Load from URL or cache if already cached
+            resolved_model_card_file = cached_path(model_card_file, cache_dir=cache_dir, force_download=True,
+                                               proxies=proxies, resume_download=False)
+            if resolved_model_card_file == model_card_file:
+                logger.info("loading model card file {}".format(model_card_file))
+            else:
+                logger.info("loading model card file {} from cache at {}".format(
+                    model_card_file, resolved_model_card_file))
+            # Load model card
+            modelcard = cls.from_json_file(resolved_model_card_file)
+
+        except EnvironmentError:
+            if pretrained_model_name_or_path in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP:
+                logger.warning("Couldn't reach server at '{}' to download model card file.".format(
+                        model_card_file))
+            else:
+                logger.warning("Model name '{}' was not found in model name list ({}). " \
+                      "We assumed '{}' was a path or url to a model card file named {} or " \
+                      "a directory containing such a file but couldn't find any such file at this path or url.".format(
+                        pretrained_model_name_or_path,
+                        ', '.join(ALL_PRETRAINED_CONFIG_ARCHIVE_MAP.keys()),
+                        model_card_file, MODEL_CARD_NAME))
+            logger.warning("Creating an empty model card.")
+
+            # We fall back on creating an empty model card
+            modelcard = cls()
+
+        except json.JSONDecodeError:
+            logger.warning("Couldn't reach server at '{}' to download model card file or "
+                           "model card file is not a valid JSON file. "
+                           "Please check network or file content here: {}.".format(model_card_file, resolved_model_card_file))
+            logger.warning("Creating an empty model card.")
+
+            # We fall back on creating an empty model card
+            modelcard = cls()
+
+        # Update model card with kwargs if needed
+        to_remove = []
+        for key, value in kwargs.items():
+            if hasattr(modelcard, key):
+                setattr(modelcard, key, value)
+                to_remove.append(key)
+        for key in to_remove:
+            kwargs.pop(key, None)
+
+        logger.info("Model card: %s", str(modelcard))
+        if return_unused_kwargs:
+            return modelcard, kwargs
+        else:
+            return modelcard
+
+    @classmethod
+    def from_dict(cls, json_object):
+        """Constructs a `ModelCard` from a Python dictionary of parameters."""
+        return cls(**json_object)
+
+    @classmethod
+    def from_json_file(cls, json_file):
+        """Constructs a `ModelCard` from a json file of parameters."""
+        with open(json_file, "r", encoding='utf-8') as reader:
+            text = reader.read()
+        dict_obj = json.loads(text)
+        return cls(**dict_obj)
+
+    def __eq__(self, other):
+        return self.__dict__ == other.__dict__
+
+    def __repr__(self):
+        return str(self.to_json_string())
+
+    def to_dict(self):
+        """Serializes this instance to a Python dictionary."""
+        output = copy.deepcopy(self.__dict__)
+        return output
+
+    def to_json_string(self):
+        """Serializes this instance to a JSON string."""
+        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
+
+    def to_json_file(self, json_file_path):
+        """ Save this instance to a json file."""
+        with open(json_file_path, "w", encoding='utf-8') as writer:
+            writer.write(self.to_json_string())
diff --git a/transformers/modeling_auto.py b/transformers/modeling_auto.py
index b63e43d73b..6b49efd378 100644
--- a/transformers/modeling_auto.py
+++ b/transformers/modeling_auto.py
@@ -18,18 +18,31 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 
 import logging
 
-from .modeling_bert import BertModel, BertForMaskedLM, BertForSequenceClassification, BertForQuestionAnswering
-from .modeling_openai import OpenAIGPTModel, OpenAIGPTLMHeadModel
-from .modeling_gpt2 import GPT2Model, GPT2LMHeadModel
-from .modeling_ctrl import CTRLModel, CTRLLMHeadModel
-from .modeling_transfo_xl import TransfoXLModel, TransfoXLLMHeadModel
-from .modeling_xlnet import XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering
-from .modeling_xlm import XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, XLMForQuestionAnswering
-from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification
-from .modeling_distilbert import DistilBertModel, DistilBertForQuestionAnswering, DistilBertForMaskedLM, DistilBertForSequenceClassification
-from .modeling_camembert import CamembertModel, CamembertForMaskedLM, CamembertForSequenceClassification, CamembertForMultipleChoice
-from .modeling_camembert import CamembertModel, CamembertForMaskedLM, CamembertForSequenceClassification, CamembertForMultipleChoice
-from .modeling_albert import AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification, AlbertForQuestionAnswering
+from .configuration_auto import (AlbertConfig, BertConfig, CamembertConfig, CTRLConfig,
+                                 DistilBertConfig, GPT2Config, OpenAIGPTConfig, RobertaConfig,
+                                 TransfoXLConfig, XLMConfig, XLNetConfig, XLMRobertaConfig)
+
+from .modeling_bert import BertModel, BertForMaskedLM, BertForSequenceClassification, BertForQuestionAnswering, \
+    BertForTokenClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_openai import OpenAIGPTModel, OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_gpt2 import GPT2Model, GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_ctrl import CTRLModel, CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_transfo_xl import TransfoXLModel, TransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_xlnet import XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering, \
+    XLNetForTokenClassification, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_xlm import XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, XLMForQuestionAnswering, \
+    XLM_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification, \
+    RobertaForTokenClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_distilbert import DistilBertModel, DistilBertForQuestionAnswering, DistilBertForMaskedLM, \
+    DistilBertForSequenceClassification, DistilBertForTokenClassification, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_camembert import CamembertModel, CamembertForMaskedLM, CamembertForSequenceClassification, \
+    CamembertForMultipleChoice, CamembertForTokenClassification, CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_albert import AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification, \
+    AlbertForQuestionAnswering, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_t5 import T5Model, T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_xlm_roberta import XLMRobertaModel, XLMRobertaForMaskedLM, XLMRobertaForSequenceClassification, \
+    XLMRobertaForMultipleChoice, XLMRobertaForTokenClassification, XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
 
 from .modeling_utils import PreTrainedModel, SequenceSummary
 
@@ -38,21 +51,42 @@ from .file_utils import add_start_docstrings
 logger = logging.getLogger(__name__)
 
 
+ALL_PRETRAINED_MODEL_ARCHIVE_MAP = dict((key, value)
+    for pretrained_map in [
+        BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP,
+        GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
+        CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
+        XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
+        XLM_PRETRAINED_MODEL_ARCHIVE_MAP,
+        ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
+        DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        T5_PRETRAINED_MODEL_ARCHIVE_MAP,
+        XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
+        ]
+    for key, value, in pretrained_map.items())
+
+
 class AutoModel(object):
     r"""
         :class:`~transformers.AutoModel` is a generic model class
         that will be instantiated as one of the base model classes of the library
         when created with the `AutoModel.from_pretrained(pretrained_model_name_or_path)`
-        class method.
+        or the `AutoModel.from_config(config)` class methods.
 
         The `from_pretrained()` method takes care of returning the correct model class instance
         using pattern matching on the `pretrained_model_name_or_path` string.
 
         The base model class to instantiate is selected as the first pattern matching
         in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `t5`: T5Model (T5 model)
             - contains `distilbert`: DistilBertModel (DistilBERT model)
             - contains `albert`: AlbertModel (ALBERT model)
             - contains `camembert`: CamembertModel (CamemBERT model)
+            - contains `xlm-roberta`: XLMRobertaModel (XLM-RoBERTa model)
             - contains `roberta`: RobertaModel (RoBERTa model)
             - contains `bert`: BertModel (Bert model)
             - contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model)
@@ -66,7 +100,56 @@ class AutoModel(object):
     """
     def __init__(self):
         raise EnvironmentError("AutoModel is designed to be instantiated "
-            "using the `AutoModel.from_pretrained(pretrained_model_name_or_path)` method.")
+            "using the `AutoModel.from_pretrained(pretrained_model_name_or_path)` or "
+            "`AutoModel.from_config(config)` methods.")
+
+    @classmethod
+    def from_config(cls, config):
+        r""" Instantiates one of the base model classes of the library
+        from a configuration.
+
+            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
+                The model class to instantiate is selected based on the configuration class:
+                    - isInstance of `distilbert` configuration class: DistilBertModel (DistilBERT model)
+                    - isInstance of `roberta` configuration class: RobertaModel (RoBERTa model)
+                    - isInstance of `bert` configuration class: BertModel (Bert model)
+                    - isInstance of `openai-gpt` configuration class: OpenAIGPTModel (OpenAI GPT model)
+                    - isInstance of `gpt2` configuration class: GPT2Model (OpenAI GPT-2 model)
+                    - isInstance of `ctrl` configuration class: CTRLModel (Salesforce CTRL  model)
+                    - isInstance of `transfo-xl` configuration class: TransfoXLModel (Transformer-XL model)
+                    - isInstance of `xlnet` configuration class: XLNetModel (XLNet model)
+                    - isInstance of `xlm` configuration class: XLMModel (XLM model)
+
+        Examples::
+
+            config = BertConfig.from_pretrained('bert-base-uncased')    # Download configuration from S3 and cache.
+            model = AutoModel.from_config(config)  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+        """
+        if isinstance(config, DistilBertConfig):
+            return DistilBertModel(config)
+        elif isinstance(config, RobertaConfig):
+            return RobertaModel(config)
+        elif isinstance(config, BertConfig):
+            return BertModel(config)
+        elif isinstance(config, OpenAIGPTConfig):
+            return OpenAIGPTModel(config)
+        elif isinstance(config, GPT2Config):
+            return GPT2Model(config)
+        elif isinstance(config, TransfoXLConfig):
+            return TransfoXLModel(config)
+        elif isinstance(config, XLNetConfig):
+            return XLNetModel(config)
+        elif isinstance(config, XLMConfig):
+            return XLMModel(config)
+        elif isinstance(config, CTRLConfig):
+            return CTRLModel(config)
+        elif isinstance(config, AlbertConfig):
+            return AlbertModel(config)
+        elif isinstance(config, CamembertConfig):
+            return CamembertModel(config)
+        elif isinstance(config, XLMRobertaConfig):
+            return XLMRobertaModel(config)
+        raise ValueError("Unrecognized configuration class {}".format(config))
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
@@ -75,9 +158,11 @@ class AutoModel(object):
 
         The model class to instantiate is selected as the first pattern matching
         in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `t5`: T5Model (T5 model)
             - contains `distilbert`: DistilBertModel (DistilBERT model)
             - contains `albert`: AlbertModel (ALBERT model)
             - contains `camembert`: CamembertModel (CamemBERT model)
+            - contains `xlm-roberta`: XLMRobertaModel (XLM-RoBERTa model)
             - contains `roberta`: RobertaModel (RoBERTa model)
             - contains `bert`: BertModel (Bert model)
             - contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model)
@@ -94,6 +179,7 @@ class AutoModel(object):
             pretrained_model_name_or_path: either:
 
                 - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                 - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                 - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
 
@@ -146,12 +232,16 @@ class AutoModel(object):
             model = AutoModel.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
 
         """
-        if 'distilbert' in pretrained_model_name_or_path:
+        if 't5' in pretrained_model_name_or_path:
+            return T5Model.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'distilbert' in pretrained_model_name_or_path:
             return DistilBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'albert' in pretrained_model_name_or_path:
             return AlbertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'camembert' in pretrained_model_name_or_path:
             return CamembertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'xlm-roberta' in pretrained_model_name_or_path:
+            return XLMRobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'roberta' in pretrained_model_name_or_path:
             return RobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'bert' in pretrained_model_name_or_path:
@@ -170,7 +260,7 @@ class AutoModel(object):
             return CTRLModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         raise ValueError("Unrecognized model identifier in {}. Should contains one of "
                          "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
-                         "'xlm', 'roberta, 'ctrl', 'distilbert', 'camembert', 'albert'".format(pretrained_model_name_or_path))
+                         "'xlm-roberta', 'xlm', 'roberta, 'ctrl', 'distilbert', 'camembert', 'albert'".format(pretrained_model_name_or_path))
 
 
 class AutoModelWithLMHead(object):
@@ -185,9 +275,11 @@ class AutoModelWithLMHead(object):
 
         The model class to instantiate is selected as the first pattern matching
         in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `t5`: T5ModelWithLMHead (T5 model)
             - contains `distilbert`: DistilBertForMaskedLM (DistilBERT model)
             - contains `albert`: AlbertForMaskedLM (ALBERT model)
             - contains `camembert`: CamembertForMaskedLM (CamemBERT model)
+            - contains `xlm-roberta`: XLMRobertaForMaskedLM (XLM-RoBERTa model)
             - contains `roberta`: RobertaForMaskedLM (RoBERTa model)
             - contains `bert`: BertForMaskedLM (Bert model)
             - contains `openai-gpt`: OpenAIGPTLMHeadModel (OpenAI GPT model)
@@ -201,7 +293,52 @@ class AutoModelWithLMHead(object):
     """
     def __init__(self):
         raise EnvironmentError("AutoModelWithLMHead is designed to be instantiated "
-            "using the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` method.")
+            "using the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` or "
+            "`AutoModelWithLMHead.from_config(config)` methods.")
+
+    @classmethod
+    def from_config(cls, config):
+        r""" Instantiates one of the base model classes of the library
+        from a configuration.
+
+            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
+                The model class to instantiate is selected based on the configuration class:
+                    - isInstance of `distilbert` configuration class: DistilBertModel (DistilBERT model)
+                    - isInstance of `roberta` configuration class: RobertaModel (RoBERTa model)
+                    - isInstance of `bert` configuration class: BertModel (Bert model)
+                    - isInstance of `openai-gpt` configuration class: OpenAIGPTModel (OpenAI GPT model)
+                    - isInstance of `gpt2` configuration class: GPT2Model (OpenAI GPT-2 model)
+                    - isInstance of `ctrl` configuration class: CTRLModel (Salesforce CTRL  model)
+                    - isInstance of `transfo-xl` configuration class: TransfoXLModel (Transformer-XL model)
+                    - isInstance of `xlnet` configuration class: XLNetModel (XLNet model)
+                    - isInstance of `xlm` configuration class: XLMModel (XLM model)
+
+        Examples::
+
+            config = BertConfig.from_pretrained('bert-base-uncased')    # Download configuration from S3 and cache.
+            model = AutoModelWithLMHead.from_config(config)  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+        """
+        if isinstance(config, DistilBertConfig):
+            return DistilBertForMaskedLM(config)
+        elif isinstance(config, RobertaConfig):
+            return RobertaForMaskedLM(config)
+        elif isinstance(config, BertConfig):
+            return BertForMaskedLM(config)
+        elif isinstance(config, OpenAIGPTConfig):
+            return OpenAIGPTLMHeadModel(config)
+        elif isinstance(config, GPT2Config):
+            return GPT2LMHeadModel(config)
+        elif isinstance(config, TransfoXLConfig):
+            return TransfoXLLMHeadModel(config)
+        elif isinstance(config, XLNetConfig):
+            return XLNetLMHeadModel(config)
+        elif isinstance(config, XLMConfig):
+            return XLMWithLMHeadModel(config)
+        elif isinstance(config, CTRLConfig):
+            return CTRLLMHeadModel(config)
+        elif isinstance(config, XLMRobertaConfig):
+            return XLMRobertaForMaskedLM(config)
+        raise ValueError("Unrecognized configuration class {}".format(config))
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
@@ -213,9 +350,11 @@ class AutoModelWithLMHead(object):
 
         The model class to instantiate is selected as the first pattern matching
         in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `t5`: T5ModelWithLMHead (T5 model)
             - contains `distilbert`: DistilBertForMaskedLM (DistilBERT model)
             - contains `albert`: AlbertForMaskedLM (ALBERT model)
             - contains `camembert`: CamembertForMaskedLM (CamemBERT model)
+            - contains `xlm-roberta`: XLMRobertaForMaskedLM (XLM-RoBERTa model)
             - contains `roberta`: RobertaForMaskedLM (RoBERTa model)
             - contains `bert`: BertForMaskedLM (Bert model)
             - contains `openai-gpt`: OpenAIGPTLMHeadModel (OpenAI GPT model)
@@ -232,6 +371,7 @@ class AutoModelWithLMHead(object):
             pretrained_model_name_or_path: either:
 
                 - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                 - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                 - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
 
@@ -283,12 +423,16 @@ class AutoModelWithLMHead(object):
             model = AutoModelWithLMHead.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
 
         """
-        if 'distilbert' in pretrained_model_name_or_path:
+        if 't5' in pretrained_model_name_or_path:
+            return T5WithLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'distilbert' in pretrained_model_name_or_path:
             return DistilBertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'albert' in pretrained_model_name_or_path:
             return AlbertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'camembert' in pretrained_model_name_or_path:
             return CamembertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'xlm-roberta' in pretrained_model_name_or_path:
+            return XLMRobertaForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'roberta' in pretrained_model_name_or_path:
             return RobertaForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'bert' in pretrained_model_name_or_path:
@@ -307,7 +451,7 @@ class AutoModelWithLMHead(object):
             return CTRLLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         raise ValueError("Unrecognized model identifier in {}. Should contains one of "
                          "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
-                         "'xlm', 'roberta','ctrl', 'distilbert', 'camembert', 'albert'".format(pretrained_model_name_or_path))
+                         "'xlm-roberta', 'xlm', 'roberta','ctrl', 'distilbert', 'camembert', 'albert'".format(pretrained_model_name_or_path))
 
 
 class AutoModelForSequenceClassification(object):
@@ -325,6 +469,7 @@ class AutoModelForSequenceClassification(object):
             - contains `distilbert`: DistilBertForSequenceClassification (DistilBERT model)
             - contains `albert`: AlbertForSequenceClassification (ALBERT model)
             - contains `camembert`: CamembertForSequenceClassification (CamemBERT model)
+            - contains `xlm-roberta`: XLMRobertaForSequenceClassification (XLM-RoBERTa model)
             - contains `roberta`: RobertaForSequenceClassification (RoBERTa model)
             - contains `bert`: BertForSequenceClassification (Bert model)
             - contains `xlnet`: XLNetForSequenceClassification (XLNet model)
@@ -333,8 +478,45 @@ class AutoModelForSequenceClassification(object):
         This class cannot be instantiated using `__init__()` (throws an error).
     """
     def __init__(self):
-        raise EnvironmentError("AutoModelWithLMHead is designed to be instantiated "
-            "using the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` method.")
+        raise EnvironmentError("AutoModelForSequenceClassification is designed to be instantiated "
+            "using the `AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)` or "
+            "`AutoModelForSequenceClassification.from_config(config)` methods.")
+
+    @classmethod
+    def from_config(cls, config):
+        r""" Instantiates one of the base model classes of the library
+        from a configuration.
+
+            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
+                The model class to instantiate is selected based on the configuration class:
+                    - isInstance of `distilbert` configuration class: DistilBertModel (DistilBERT model)
+                    - isInstance of `roberta` configuration class: RobertaModel (RoBERTa model)
+                    - isInstance of `bert` configuration class: BertModel (Bert model)
+                    - isInstance of `xlnet` configuration class: XLNetModel (XLNet model)
+                    - isInstance of `xlm` configuration class: XLMModel (XLM model)
+
+        Examples::
+
+            config = BertConfig.from_pretrained('bert-base-uncased')    # Download configuration from S3 and cache.
+            model = AutoModelForSequenceClassification.from_config(config)  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+        """
+        if isinstance(config, AlbertConfig):
+            return AlbertForSequenceClassification(config)
+        elif isinstance(config, CamembertConfig):
+            return CamembertForSequenceClassification(config)
+        elif isinstance(config, DistilBertConfig):
+            return DistilBertForSequenceClassification(config)
+        elif isinstance(config, RobertaConfig):
+            return RobertaForSequenceClassification(config)
+        elif isinstance(config, BertConfig):
+            return BertForSequenceClassification(config)
+        elif isinstance(config, XLNetConfig):
+            return XLNetForSequenceClassification(config)
+        elif isinstance(config, XLMConfig):
+            return XLMForSequenceClassification(config)
+        elif isinstance(config, XLMRobertaConfig):
+            return XLMRobertaForSequenceClassification(config)
+        raise ValueError("Unrecognized configuration class {}".format(config))
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
@@ -349,6 +531,7 @@ class AutoModelForSequenceClassification(object):
             - contains `distilbert`: DistilBertForSequenceClassification (DistilBERT model)
             - contains `albert`: AlbertForSequenceClassification (ALBERT model)
             - contains `camembert`: CamembertForSequenceClassification (CamemBERT model)
+            - contains `xlm-roberta`: XLMRobertaForSequenceClassification (XLM-RoBERTa model)
             - contains `roberta`: RobertaForSequenceClassification (RoBERTa model)
             - contains `bert`: BertForSequenceClassification (Bert model)
             - contains `xlnet`: XLNetForSequenceClassification (XLNet model)
@@ -361,6 +544,7 @@ class AutoModelForSequenceClassification(object):
             pretrained_model_name_or_path: either:
 
                 - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                 - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                 - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
 
@@ -419,6 +603,8 @@ class AutoModelForSequenceClassification(object):
             return AlbertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'camembert' in pretrained_model_name_or_path:
             return CamembertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'xlm-roberta' in pretrained_model_name_or_path:
+            return XLMRobertaForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'roberta' in pretrained_model_name_or_path:
             return RobertaForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'bert' in pretrained_model_name_or_path:
@@ -429,7 +615,7 @@ class AutoModelForSequenceClassification(object):
             return XLMForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
 
         raise ValueError("Unrecognized model identifier in {}. Should contains one of "
-                         "'bert', 'xlnet', 'xlm', 'roberta', 'distilbert', 'camembert', 'albert'".format(pretrained_model_name_or_path))
+                         "'bert', 'xlnet', 'xlm-roberta', 'xlm', 'roberta', 'distilbert', 'camembert', 'albert'".format(pretrained_model_name_or_path))
 
 
 class AutoModelForQuestionAnswering(object):
@@ -453,8 +639,38 @@ class AutoModelForQuestionAnswering(object):
         This class cannot be instantiated using `__init__()` (throws an error).
     """
     def __init__(self):
-        raise EnvironmentError("AutoModelWithLMHead is designed to be instantiated "
-            "using the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` method.")
+        raise EnvironmentError("AutoModelForQuestionAnswering is designed to be instantiated "
+            "using the `AutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)` or "
+            "`AutoModelForQuestionAnswering.from_config(config)` methods.")
+
+    @classmethod
+    def from_config(cls, config):
+        r""" Instantiates one of the base model classes of the library
+        from a configuration.
+
+            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
+                The model class to instantiate is selected based on the configuration class:
+                    - isInstance of `distilbert` configuration class: DistilBertModel (DistilBERT model)
+                    - isInstance of `bert` configuration class: BertModel (Bert model)
+                    - isInstance of `xlnet` configuration class: XLNetModel (XLNet model)
+                    - isInstance of `xlm` configuration class: XLMModel (XLM model)
+
+        Examples::
+
+            config = BertConfig.from_pretrained('bert-base-uncased')    # Download configuration from S3 and cache.
+            model = AutoModelForSequenceClassification.from_config(config)  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+        """
+        if isinstance(config, AlbertConfig):
+            return AlbertForQuestionAnswering(config)
+        elif isinstance(config, DistilBertConfig):
+            return DistilBertForQuestionAnswering(config)
+        elif isinstance(config, BertConfig):
+            return BertForQuestionAnswering(config)
+        elif isinstance(config, XLNetConfig):
+            return XLNetForQuestionAnswering(config)
+        elif isinstance(config, XLMConfig):
+            return XLMForQuestionAnswering(config)
+        raise ValueError("Unrecognized configuration class {}".format(config))
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
@@ -479,6 +695,7 @@ class AutoModelForQuestionAnswering(object):
             pretrained_model_name_or_path: either:
 
                 - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                 - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                 - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
 
@@ -541,3 +758,130 @@ class AutoModelForQuestionAnswering(object):
 
         raise ValueError("Unrecognized model identifier in {}. Should contains one of "
                          "'bert', 'xlnet', 'xlm', 'distilbert', 'albert'".format(pretrained_model_name_or_path))
+
+
+class AutoModelForTokenClassification:
+    def __init__(self):
+        raise EnvironmentError("AutoModelForTokenClassification is designed to be instantiated "
+                               "using the `AutoModelForTokenClassification.from_pretrained(pretrained_model_name_or_path)` or "
+                               "`AutoModelForTokenClassification.from_config(config)` methods.")
+
+    @classmethod
+    def from_config(cls, config):
+        r""" Instantiates one of the base model classes of the library
+        from a configuration.
+    
+            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
+                The model class to instantiate is selected based on the configuration class:
+                    - isInstance of `distilbert` configuration class: DistilBertModel (DistilBERT model)
+                    - isInstance of `bert` configuration class: BertModel (Bert model)
+                    - isInstance of `xlnet` configuration class: XLNetModel (XLNet model)
+                    - isInstance of `camembert` configuration class: CamembertModel (Camembert model)
+                    - isInstance of `roberta` configuration class: RobertaModel (Roberta model)
+
+        Examples::
+    
+            config = BertConfig.from_pretrained('bert-base-uncased')    # Download configuration from S3 and cache.
+            model = AutoModelForTokenClassification.from_config(config)  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+        """
+        if isinstance(config, CamembertConfig):
+            return CamembertForTokenClassification(config)
+        elif isinstance(config, DistilBertConfig):
+            return DistilBertForTokenClassification(config)
+        elif isinstance(config, BertConfig):
+            return BertForTokenClassification(config)
+        elif isinstance(config, XLNetConfig):
+            return XLNetForTokenClassification(config)
+        elif isinstance(config, RobertaConfig):
+            return RobertaForTokenClassification(config)
+        elif isinstance(config, XLMRobertaConfig):
+            return XLMRobertaForTokenClassification(config)
+        raise ValueError("Unrecognized configuration class {}".format(config))
+    
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
+        r""" Instantiates one of the question answering model classes of the library
+        from a pre-trained model configuration.
+
+        The `from_pretrained()` method takes care of returning the correct model class instance
+        using pattern matching on the `pretrained_model_name_or_path` string.
+
+        The model class to instantiate is selected as the first pattern matching
+        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `distilbert`: DistilBertForTokenClassification (DistilBERT model)
+            - contains `camembert`: CamembertForTokenClassification (Camembert model)
+            - contains `bert`: BertForTokenClassification (Bert model)
+            - contains `xlnet`: XLNetForTokenClassification (XLNet model)
+            - contains `roberta`: RobertaForTokenClassification (Roberta model)
+
+        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
+        To train the model, you should first set it back in training mode with `model.train()`
+
+        Params:
+            pretrained_model_name_or_path: either:
+
+                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
+                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
+
+            model_args: (`optional`) Sequence of positional arguments:
+                All remaning positional arguments will be passed to the underlying model's ``__init__`` method
+
+            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
+                Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
+
+                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
+                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
+                - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
+
+            state_dict: (`optional`) dict:
+                an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
+                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
+                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
+
+            cache_dir: (`optional`) string:
+                Path to a directory in which a downloaded pre-trained model
+                configuration should be cached if the standard cache should not be used.
+
+            force_download: (`optional`) boolean, default False:
+                Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
+
+            proxies: (`optional`) dict, default None:
+                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
+                The proxies are used on each request.
+
+            output_loading_info: (`optional`) boolean:
+                Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages.
+
+            kwargs: (`optional`) Remaining dictionary of keyword arguments:
+                Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
+
+                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
+                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
+
+        Examples::
+
+            model = AutoModelForTokenClassification.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
+            model = AutoModelForTokenClassification.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+            model = AutoModelForTokenClassification.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
+            assert model.config.output_attention == True
+            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
+            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
+            model = AutoModelForTokenClassification.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+
+        """
+        if 'camembert' in pretrained_model_name_or_path:
+            return CamembertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'distilbert' in pretrained_model_name_or_path:
+            return DistilBertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'xlm-roberta' in pretrained_model_name_or_path:
+            return XLMRobertaForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'roberta' in pretrained_model_name_or_path:
+            return RobertaForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'bert' in pretrained_model_name_or_path:
+            return BertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'xlnet' in pretrained_model_name_or_path:
+            return XLNetForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+
+        raise ValueError("Unrecognized model identifier in {}. Should contains one of "
+                         "'bert', 'xlnet', 'camembert', 'distilbert', 'xlm-roberta', 'roberta'".format(pretrained_model_name_or_path))
diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py
index d84b0a1a7c..afeb9d8e21 100644
--- a/transformers/modeling_bert.py
+++ b/transformers/modeling_bert.py
@@ -48,6 +48,12 @@ BERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
     'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin",
     'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-pytorch_model.bin",
     'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-pytorch_model.bin",
+    'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-pytorch_model.bin",
+    'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-pytorch_model.bin",
+    'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-pytorch_model.bin",
+    'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-pytorch_model.bin",
+    'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/pytorch_model.bin",
+    'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/pytorch_model.bin",
 }
 
 
@@ -1233,9 +1239,9 @@ class BertForQuestionAnswering(BertPreTrainedModel):
         question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
         input_text = "[CLS] " + question + " [SEP] " + text + " [SEP]"
         input_ids = tokenizer.encode(input_text)
-        token_type_ids = [0 if i <= input_ids.index(102) else 1 for i in range(len(input_ids))] 
+        token_type_ids = [0 if i <= input_ids.index(102) else 1 for i in range(len(input_ids))]
         start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids]))
-        all_tokens = tokenizer.convert_ids_to_tokens(input_ids)  
+        all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
         print(' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1]))
         # a nice puppet
 
diff --git a/transformers/modeling_ctrl.py b/transformers/modeling_ctrl.py
index 97bcb14434..af7f2c3a8b 100644
--- a/transformers/modeling_ctrl.py
+++ b/transformers/modeling_ctrl.py
@@ -268,7 +268,7 @@ class CTRLModel(CTRLPreTrainedModel):
 
         tokenizer = CTRLTokenizer.from_pretrained('ctrl')
         model = CTRLModel.from_pretrained('ctrl')
-        input_ids = torch.tensor(tokenizer.encode("Links Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        input_ids = torch.tensor(tokenizer.encode("Links Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
         outputs = model(input_ids)
         last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
@@ -458,7 +458,7 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
         tokenizer = CTRLTokenizer.from_pretrained('ctrl')
         model = CTRLLMHeadModel.from_pretrained('ctrl')
 
-        input_ids = torch.tensor(tokenizer.encode("Links Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        input_ids = torch.tensor(tokenizer.encode("Links Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
         outputs = model(input_ids, labels=input_ids)
         loss, logits = outputs[:2]
 
diff --git a/transformers/modeling_distilbert.py b/transformers/modeling_distilbert.py
index 6faeafa15e..eb75efa90f 100644
--- a/transformers/modeling_distilbert.py
+++ b/transformers/modeling_distilbert.py
@@ -415,7 +415,7 @@ class DistilBertModel(DistilBertPreTrainedModel):
 
         tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
         model = DistilBertModel.from_pretrained('distilbert-base-uncased')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
         outputs = model(input_ids)
         last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
@@ -511,7 +511,7 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
 
         tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
         model = DistilBertForMaskedLM.from_pretrained('distilbert-base-uncased')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
         outputs = model(input_ids, masked_lm_labels=input_ids)
         loss, prediction_scores = outputs[:2]
 
@@ -581,7 +581,7 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
 
         tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
         model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
         labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
         outputs = model(input_ids, labels=labels)
         loss, logits = outputs[:2]
@@ -656,7 +656,7 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
 
         tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
         model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
         start_positions = torch.tensor([1])
         end_positions = torch.tensor([3])
         outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
diff --git a/transformers/modeling_encoder_decoder.py b/transformers/modeling_encoder_decoder.py
index d69a75cc75..ddfebdc393 100644
--- a/transformers/modeling_encoder_decoder.py
+++ b/transformers/modeling_encoder_decoder.py
@@ -61,12 +61,14 @@ class PreTrainedEncoderDecoder(nn.Module):
             encoder_pretrained_model_name_or_path: information necessary to initiate the encoder. Either:
 
                 - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                 - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/encoder``.
                 - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
 
             decoder_pretrained_model_name_or_path: information necessary to initiate the decoder. Either:
 
                 - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                 - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/decoder``.
                 - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
 
@@ -165,7 +167,39 @@ class PreTrainedEncoderDecoder(nn.Module):
 
         We save the encoder' and decoder's parameters in two separate directories.
         """
+
+        # If the root output directory does not exist, create it 
+        if not os.path.exists(save_directory):
+            os.mkdir(save_directory)
+
+        # Check whether the output directory is empty or not
+        sub_directories = [directory for directory in os.listdir(save_directory)
+            if os.path.isdir(os.path.join(save_directory, directory))]
+
+        if len(sub_directories) > 0:
+            if "encoder" in sub_directories and "decoder" in sub_directories:
+                print("WARNING: there is an older version of encoder-decoder saved in" +\
+                    " the output directory. The default behaviour is to overwrite them.")
+
+            # Empty the output directory
+            for directory_to_remove in sub_directories:
+                # Remove all files into the subdirectory
+                files_to_remove = os.listdir(os.path.join(save_directory, directory_to_remove))
+                for file_to_remove in files_to_remove:
+                    os.remove(os.path.join(save_directory, directory_to_remove, file_to_remove))
+                # Remove the subdirectory itself
+                os.rmdir(os.path.join(save_directory, directory_to_remove))
+
+            assert(len(os.listdir(save_directory)) == 0) # sanity check
+
+        # Create the "encoder" directory inside the output directory and save the encoder into it
+        if not os.path.exists(os.path.join(save_directory, "encoder")):
+            os.mkdir(os.path.join(save_directory, "encoder"))
         self.encoder.save_pretrained(os.path.join(save_directory, "encoder"))
+
+        # Create the "encoder" directory inside the output directory and save the decoder into it
+        if not os.path.exists(os.path.join(save_directory, "decoder")):
+            os.mkdir(os.path.join(save_directory, "decoder"))
         self.decoder.save_pretrained(os.path.join(save_directory, "decoder"))
 
     def forward(self, encoder_input_ids, decoder_input_ids, **kwargs):
@@ -236,7 +270,6 @@ class PreTrainedEncoderDecoder(nn.Module):
             }
         )
         decoder_kwargs["encoder_attention_mask"] = encoder_kwargs.get("attention_mask", None)
-
         return encoder_kwargs, decoder_kwargs
 
 
diff --git a/transformers/modeling_gpt2.py b/transformers/modeling_gpt2.py
index 96fd1c0607..e0dc60177e 100644
--- a/transformers/modeling_gpt2.py
+++ b/transformers/modeling_gpt2.py
@@ -345,7 +345,7 @@ class GPT2Model(GPT2PreTrainedModel):
 
         tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
         model = GPT2Model.from_pretrained('gpt2')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
         outputs = model(input_ids)
         last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
@@ -523,7 +523,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
         tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
         model = GPT2LMHeadModel.from_pretrained('gpt2')
 
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
         outputs = model(input_ids, labels=input_ids)
         loss, logits = outputs[:2]
 
@@ -634,6 +634,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
     """
     def __init__(self, config):
         super(GPT2DoubleHeadsModel, self).__init__(config)
+        config.num_labels = 1
         self.transformer = GPT2Model(config)
         self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
         self.multiple_choice_head = SequenceSummary(config)
diff --git a/transformers/modeling_openai.py b/transformers/modeling_openai.py
index 4fe7ffee8b..4b726bcae1 100644
--- a/transformers/modeling_openai.py
+++ b/transformers/modeling_openai.py
@@ -349,7 +349,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
 
         tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
         model = OpenAIGPTModel.from_pretrained('openai-gpt')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
         outputs = model(input_ids)
         last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
@@ -491,7 +491,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
 
         tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
         model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
         outputs = model(input_ids, labels=input_ids)
         loss, logits = outputs[:2]
 
@@ -590,6 +590,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
     def __init__(self, config):
         super(OpenAIGPTDoubleHeadsModel, self).__init__(config)
 
+        config.num_labels = 1
         self.transformer = OpenAIGPTModel(config)
         self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
         self.multiple_choice_head = SequenceSummary(config)
diff --git a/transformers/modeling_roberta.py b/transformers/modeling_roberta.py
index fc27353d37..5ee646e692 100644
--- a/transformers/modeling_roberta.py
+++ b/transformers/modeling_roberta.py
@@ -51,24 +51,44 @@ class RobertaEmbeddings(BertEmbeddings):
                                                 padding_idx=self.padding_idx)
 
     def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
-        if input_ids is not None:
-            input_shape = input_ids.size()
-        else:
-            input_shape = inputs_embeds.size()[:-1]
-
-        seq_length = input_shape[1]
-        device = input_ids.device if input_ids is not None else inputs_embeds.device
-
         if position_ids is None:
-            # Position numbers begin at padding_idx+1. Padding symbols are ignored.
-            # cf. fairseq's `utils.make_positions`
-            position_ids = torch.arange(self.padding_idx+1, seq_length+self.padding_idx+1, dtype=torch.long, device=device)
-            position_ids = position_ids.unsqueeze(0).expand(input_shape)
+            if input_ids is not None:
+                # Create the position ids from the input token ids. Any padded tokens remain padded.
+                position_ids = self.create_position_ids_from_input_ids(input_ids).to(input_ids.device)
+            else:
+                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)
+
         return super(RobertaEmbeddings, self).forward(input_ids,
                                                       token_type_ids=token_type_ids,
                                                       position_ids=position_ids,
                                                       inputs_embeds=inputs_embeds)
 
+    def create_position_ids_from_input_ids(self, x):
+        """ Replace non-padding symbols with their position numbers. Position numbers begin at
+        padding_idx+1. Padding symbols are ignored. This is modified from fairseq's
+        `utils.make_positions`.
+
+        :param torch.Tensor x:
+        :return torch.Tensor:
+        """
+        mask = x.ne(self.padding_idx).long()
+        incremental_indicies = torch.cumsum(mask, dim=1) * mask
+        return incremental_indicies + self.padding_idx
+
+    def create_position_ids_from_inputs_embeds(self, inputs_embeds):
+        """ We are provided embeddings directly. We cannot infer which are padded so just generate
+        sequential position ids.
+
+        :param torch.Tensor inputs_embeds:
+        :return torch.Tensor:
+        """
+        input_shape = inputs_embeds.size()[:-1]
+        sequence_length = input_shape[1]
+
+        position_ids = torch.arange(self.padding_idx+1, sequence_length+self.padding_idx+1, dtype=torch.long,
+                                    device=inputs_embeds.device)
+        return position_ids.unsqueeze(0).expand(input_shape)
+
 
 ROBERTA_START_DOCSTRING = r"""    The RoBERTa model was proposed in
     `RoBERTa: A Robustly Optimized BERT Pretraining Approach`_
@@ -168,7 +188,7 @@ class RobertaModel(BertModel):
 
         tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
         model = RobertaModel.from_pretrained('roberta-base')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
         outputs = model(input_ids)
         last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
@@ -216,7 +236,7 @@ class RobertaForMaskedLM(BertPreTrainedModel):
 
         tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
         model = RobertaForMaskedLM.from_pretrained('roberta-base')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
         outputs = model(input_ids, masked_lm_labels=input_ids)
         loss, prediction_scores = outputs[:2]
 
@@ -307,7 +327,7 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
 
         tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
         model = RobertaForSequenceClassification.from_pretrained('roberta-base')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
         labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
         outputs = model(input_ids, labels=labels)
         loss, logits = outputs[:2]
diff --git a/transformers/modeling_t5.py b/transformers/modeling_t5.py
new file mode 100644
index 0000000000..9baf69d02b
--- /dev/null
+++ b/transformers/modeling_t5.py
@@ -0,0 +1,886 @@
+# coding=utf-8
+# Copyright 2018 Mesh TensorFlow authors, T5 Authors and HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch T5 model. """
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import json
+import logging
+import math
+import os
+import sys
+import copy
+import itertools
+from io import open
+
+import torch
+from torch import nn
+import torch.nn.functional as F
+from torch.nn import CrossEntropyLoss, MSELoss
+
+from .modeling_utils import PreTrainedModel, prune_linear_layer
+from .configuration_t5 import T5Config
+from .file_utils import add_start_docstrings, DUMMY_INPUTS, DUMMY_MASK
+
+logger = logging.getLogger(__name__)
+
+####################################################
+# This dict contrains shortcut names and associated url
+# for the pretrained weights provided with the models
+####################################################
+T5_PRETRAINED_MODEL_ARCHIVE_MAP = {
+    't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-pytorch_model.bin",
+    't5-base': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-pytorch_model.bin",
+    't5-large': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-pytorch_model.bin",
+    't5-3b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-pytorch_model.bin",
+    't5-11b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-pytorch_model.bin",
+}
+
+####################################################
+# This is a conversion method from TF 1.0 to PyTorch
+# More details: https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28
+####################################################
+def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
+    """ Load tf checkpoints in a pytorch model.
+    """
+    try:
+        import re
+        import numpy as np
+        import tensorflow as tf
+    except ImportError:
+        logger.error("Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
+            "https://www.tensorflow.org/install/ for installation instructions.")
+        raise
+    tf_path = os.path.abspath(tf_checkpoint_path)
+    logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
+    # Load weights from TF model
+    init_vars = tf.train.list_variables(tf_path)
+    names = []
+    tf_weights = {}
+    for name, shape in init_vars:
+        logger.info("Loading TF weight {} with shape {}".format(name, shape))
+        array = tf.train.load_variable(tf_path, name)
+        names.append(name)
+        tf_weights[name] = array
+
+    for txt_name in names:
+        name = txt_name.split('/')
+        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
+        # which are not required for using pretrained model
+        if any(n in ["adam_v", "adam_m", "global_step"] for n in name):
+            logger.info("Skipping {}".format("/".join(name)))
+            tf_weights.pop(txt_name, None)
+            continue
+        if '_slot_' in name[-1]:
+            logger.info("Skipping {}".format("/".join(name)))
+            tf_weights.pop(txt_name, None)
+            continue
+        pointer = model
+        array = tf_weights[txt_name]
+        for m_name in name:
+            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
+                l = re.split(r'_(\d+)', m_name)
+            else:
+                l = [m_name]
+            if l[0] in ['kernel', 'scale', 'embedding']:
+                pointer = getattr(pointer, 'weight')
+            # elif l[0] == 'scale':
+            #     pointer = getattr(pointer, 'weight')
+            # elif l[0] == 'output_bias' or l[0] == 'beta':
+            #     pointer = getattr(pointer, 'bias')
+            # elif l[0] == 'squad':
+            #     pointer = getattr(pointer, 'classifier')
+            else:
+                try:
+                    pointer = getattr(pointer, l[0])
+                except AttributeError:
+                    logger.info("Skipping {}".format("/".join(name)))
+                    continue
+            if len(l) >= 2:
+                num = int(l[1])
+                pointer = pointer[num]
+        if l[0] not in ['kernel', 'scale', 'embedding']:
+            pointer = getattr(pointer, 'weight')
+        if l[0] != 'embedding':
+            logger.info("Transposing numpy weight of shape {} for {}".format(array.shape, name))
+            array = np.transpose(array)
+        try:
+            assert pointer.shape == array.shape
+        except AssertionError as e:
+            e.args += (pointer.shape, array.shape)
+            raise
+        logger.info("Initialize PyTorch weight {}".format(name))
+        pointer.data = torch.from_numpy(array.astype(np.float32))
+        tf_weights.pop(txt_name, None)
+
+    logger.info("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys())))
+    # logger.info("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys())))
+    return model
+
+
+####################################################
+# PyTorch Models are constructed by sub-classing
+# - torch.nn.Module for the layers and
+# - PreTrainedModel for the models (it-self a sub-class of torch.nn.Module)
+####################################################
+
+class T5LayerNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """ Construct a layernorm module in the T5 style
+            No bias and no substraction of mean.
+        """
+        super(T5LayerNorm, self).__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, x):
+        variance = x.pow(2).mean(-1, keepdim=True)
+        x = x / torch.sqrt(variance + self.variance_epsilon)
+        return self.weight * x
+
+
+class T5DenseReluDense(nn.Module):
+    def __init__(self, config):
+        super(T5DenseReluDense, self).__init__()
+        self.wi = nn.Linear(config.d_model, config.d_ff, bias=False)
+        self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
+        self.dropout = nn.Dropout(config.dropout_rate)
+
+    def forward(self, hidden_states):
+        h = self.wi(hidden_states)
+        h = F.relu(h)
+        h = self.dropout(h)
+        h = self.wo(h)
+        return h
+
+
+class T5LayerFF(nn.Module):
+    def __init__(self, config):
+        super(T5LayerFF, self).__init__()
+        self.DenseReluDense = T5DenseReluDense(config)
+        self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
+        self.dropout = nn.Dropout(config.dropout_rate)
+
+    def forward(self, hidden_states):
+        norm_x = self.layer_norm(hidden_states)
+        y = self.DenseReluDense(norm_x)
+        layer_output = hidden_states + self.dropout(y)
+        return layer_output
+
+
+class T5Attention(nn.Module):
+    NEW_ID = itertools.count()
+
+    def __init__(self, config, has_relative_attention_bias=False):
+        super(T5Attention, self).__init__()
+        self.layer_id = next(T5Attention.NEW_ID)
+        self.is_decoder = config.is_decoder
+        self.has_relative_attention_bias = has_relative_attention_bias
+
+        self.output_attentions = config.output_attentions
+        self.relative_attention_num_buckets = config.relative_attention_num_buckets
+        self.d_model = config.d_model
+        self.d_kv = config.d_kv
+        self.n_heads = config.num_heads
+        self.dropout = config.dropout_rate
+        self.inner_dim = self.n_heads * self.d_kv
+
+        # Mesh TensorFlow initialization to avoid scaling before softmax
+        self.q = nn.Linear(self.d_model, self.inner_dim, bias=False)
+        self.k = nn.Linear(self.d_model, self.inner_dim, bias=False)
+        self.v = nn.Linear(self.d_model, self.inner_dim, bias=False)
+        self.o = nn.Linear(self.inner_dim, self.d_model, bias=False)
+
+        if self.has_relative_attention_bias:
+            self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads)
+        self.pruned_heads = set()
+
+    def prune_heads(self, heads):
+        if len(heads) == 0:
+            return
+        mask = torch.ones(self.n_heads, self.d_kv)
+        heads = set(heads) - self.pruned_heads
+        for head in heads:
+            head -= sum(1 if h < head else 0 for h in self.pruned_heads)
+            mask[head] = 0
+        mask = mask.view(-1).contiguous().eq(1)
+        index = torch.arange(len(mask))[mask].long()
+        # Prune linear layers
+        self.q = prune_linear_layer(self.q, index)
+        self.k = prune_linear_layer(self.k, index)
+        self.v = prune_linear_layer(self.v, index)
+        self.o = prune_linear_layer(self.o, index, dim=1)
+        # Update hyper params
+        self.n_heads = self.n_heads - len(heads)
+        self.inner_dim = self.d_kv * self.n_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    @staticmethod
+    def _relative_position_bucket(relative_position,
+                                  bidirectional=True,
+                                  num_buckets=32,
+                                  max_distance=128):
+        """
+        Adapted from Mesh Tensorflow:
+        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593
+
+        Translate relative position to a bucket number for relative attention.
+        The relative position is defined as memory_position - query_position, i.e.
+        the distance in tokens from the attending position to the attended-to
+        position.  If bidirectional=False, then positive relative positions are
+        invalid.
+        We use smaller buckets for small absolute relative_position and larger buckets
+        for larger absolute relative_positions.  All relative positions >=max_distance
+        map to the same bucket.  All relative positions <=-max_distance map to the
+        same bucket.  This should allow for more graceful generalization to longer
+        sequences than the model has been trained on.
+        Args:
+            relative_position: an int32 Tensor
+            bidirectional: a boolean - whether the attention is bidirectional
+            num_buckets: an integer
+            max_distance: an integer
+        Returns:
+            a Tensor with the same shape as relative_position, containing int32
+            values in the range [0, num_buckets)
+        """
+        ret = 0
+        n = -relative_position
+        if bidirectional:
+            num_buckets //= 2
+            ret += (n < 0).to(torch.long) * num_buckets  # mtf.to_int32(mtf.less(n, 0)) * num_buckets
+            n = torch.abs(n)
+        else:
+            n = torch.max(n, torch.zeros_like(n))
+        # now n is in the range [0, inf)
+
+        # half of the buckets are for exact increments in positions
+        max_exact = num_buckets // 2
+        is_small = (n < max_exact)
+
+        # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance
+        val_if_large = max_exact + (
+            torch.log(n.float() / max_exact)
+            / math.log(max_distance / max_exact) * (num_buckets - max_exact)).to(torch.long)
+        val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1))
+
+        ret += torch.where(is_small, n, val_if_large)
+        return ret
+
+    def compute_bias(self, qlen, klen):
+        """ Compute binned relative position bias """
+        context_position = torch.arange(qlen, dtype=torch.long)[:, None]
+        memory_position = torch.arange(klen, dtype=torch.long)[None, :]
+        relative_position = memory_position - context_position  # shape (qlen, klen)
+        rp_bucket = self._relative_position_bucket(relative_position,  # shape (qlen, klen)
+                                                   bidirectional=not self.is_decoder,
+                                                   num_buckets=self.relative_attention_num_buckets)
+        values = self.relative_attention_bias(rp_bucket)  # shape (qlen, klen, num_heads)
+        values = values.permute([2, 0, 1]).unsqueeze(0) # shape (1, num_heads, qlen, klen)
+        return values
+
+    def forward(self, input, mask=None, kv=None, position_bias=None, cache=None, head_mask=None):
+        """
+        Self-attention (if kv is None) or attention over source sentence (provided by kv).
+        """
+        # Input is (bs, qlen, dim)
+        # Mask is (bs, klen) (non-causal) or (bs, klen, klen)
+        bs, qlen, dim = input.size()
+        if kv is None:
+            klen = qlen if cache is None else cache['slen'] + qlen
+        else:
+            klen = kv.size(1)
+
+        def shape(x):
+            """  projection """
+            return x.view(bs, -1, self.n_heads, self.d_kv).transpose(1, 2)
+
+        def unshape(x):
+            """  compute context """
+            return x.transpose(1, 2).contiguous().view(bs, -1, self.inner_dim)
+
+        q = shape(self.q(input))                                          # (bs, n_heads, qlen, dim_per_head)
+        if kv is None:
+            k = shape(self.k(input))                                      # (bs, n_heads, qlen, dim_per_head)
+            v = shape(self.v(input))                                      # (bs, n_heads, qlen, dim_per_head)
+        elif cache is None or self.layer_id not in cache:
+            k = v = kv
+            k = shape(self.k(k))                                          # (bs, n_heads, qlen, dim_per_head)
+            v = shape(self.v(v))                                          # (bs, n_heads, qlen, dim_per_head)
+
+        if cache is not None:
+            if self.layer_id in cache:
+                if kv is None:
+                    k_, v_ = cache[self.layer_id]
+                    k = torch.cat([k_, k], dim=2)                             # (bs, n_heads, klen, dim_per_head)
+                    v = torch.cat([v_, v], dim=2)                             # (bs, n_heads, klen, dim_per_head)
+                else:
+                    k, v = cache[self.layer_id]
+            cache[self.layer_id] = (k, v)
+
+        # q = q / math.sqrt(dim_per_head)                                     # No scaling in T5
+        scores = torch.einsum('bnqd,bnkd->bnqk', q, k)                        # (bs, n_heads, qlen, klen)
+
+        if position_bias is None:
+            if not self.has_relative_attention_bias:
+                raise ValueError("No position_bias provided and no weights to compute position_bias")
+            position_bias = self.compute_bias(qlen, klen)
+            if mask is not None:
+                position_bias = position_bias + mask                          # (bs, n_heads, qlen, klen)
+
+        scores += position_bias
+        weights = F.softmax(scores.float(), dim=-1).type_as(scores)           # (bs, n_heads, qlen, klen)
+        weights = F.dropout(weights, p=self.dropout, training=self.training)  # (bs, n_heads, qlen, klen)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            weights = weights * head_mask
+
+        context = torch.matmul(weights, v)                                    # (bs, n_heads, qlen, dim_per_head)
+        context = unshape(context)                                            # (bs, qlen, dim)
+
+        context = self.o(context)
+
+        outputs = (context,)
+        if self.output_attentions:
+            outputs = outputs + (weights,)
+        if self.has_relative_attention_bias:
+            outputs = outputs + (position_bias,)
+        return outputs
+
+
+class T5LayerSelfAttention(nn.Module):
+    def __init__(self, config, has_relative_attention_bias=False):
+        super(T5LayerSelfAttention, self).__init__()
+        self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias)
+        self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
+        self.dropout = nn.Dropout(config.dropout_rate)
+
+    def forward(self, hidden_states, attention_mask=None, position_bias=None, head_mask=None):
+        norm_x = self.layer_norm(hidden_states)
+        attention_output = self.SelfAttention(norm_x,
+                                              mask=attention_mask,
+                                              position_bias=position_bias,
+                                              head_mask=head_mask)
+        y = attention_output[0]
+        layer_output = hidden_states + self.dropout(y)
+        outputs = (layer_output,) + attention_output[1:]  # add attentions if we output them
+        return outputs
+
+
+class T5LayerCrossAttention(nn.Module):
+    def __init__(self, config, has_relative_attention_bias=False):
+        super(T5LayerCrossAttention, self).__init__()
+        self.EncDecAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias)
+        self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
+        self.dropout = nn.Dropout(config.dropout_rate)
+
+    def forward(self, hidden_states, kv, attention_mask=None, position_bias=None, head_mask=None):
+        norm_x = self.layer_norm(hidden_states)
+        attention_output = self.EncDecAttention(norm_x,
+                                                mask=attention_mask,
+                                                kv=kv,
+                                                position_bias=position_bias,
+                                                head_mask=head_mask)
+        y = attention_output[0]
+        layer_output = hidden_states + self.dropout(y)
+        outputs = (layer_output,) + attention_output[1:]  # add attentions if we output them
+        return outputs
+
+
+class T5Block(nn.Module):
+    def __init__(self, config, has_relative_attention_bias=False):
+        super(T5Block, self).__init__()
+        self.is_decoder = config.is_decoder
+        self.layer = nn.ModuleList()
+        self.layer.append(T5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias))
+        if self.is_decoder:
+            self.layer.append(T5LayerCrossAttention(config, has_relative_attention_bias=has_relative_attention_bias))
+            self.layer.append(T5LayerFF(config))
+        else:
+            self.layer.append(T5LayerFF(config))
+
+    def forward(self, hidden_states, attention_mask=None, position_bias=None,
+                encoder_hidden_states=None, encoder_attention_mask=None, encoder_decoder_position_bias=None,
+                head_mask=None):
+        self_attention_outputs = self.layer[0](hidden_states,
+                                                attention_mask=attention_mask,
+                                                position_bias=position_bias,
+                                                head_mask=head_mask)
+        hidden_states = self_attention_outputs[0]
+        outputs = self_attention_outputs[1:]  # Keep self-attention outputs and relative position weights
+
+        if not self.is_decoder:
+            hidden_states = self.layer[1](hidden_states)
+        else:
+            cross_attention_outputs = self.layer[1](hidden_states,
+                                                    kv=encoder_hidden_states,
+                                                    attention_mask=encoder_attention_mask,
+                                                    position_bias=encoder_decoder_position_bias,
+                                                    head_mask=head_mask)
+            hidden_states = cross_attention_outputs[0]
+            outputs = outputs + cross_attention_outputs[1:]  # Keep cross-attention outputs and relative position weights
+            hidden_states = self.layer[2](hidden_states)
+
+        outputs = (hidden_states,) + outputs  # add attentions if we output them
+        return outputs  # hidden-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias)
+
+
+class T5PreTrainedModel(PreTrainedModel):
+    """ An abstract class to handle weights initialization and
+        a simple interface for dowloading and loading pretrained models.
+    """
+    config_class = T5Config
+    pretrained_model_archive_map = T5_PRETRAINED_MODEL_ARCHIVE_MAP
+    load_tf_weights = load_tf_weights_in_t5
+    base_model_prefix = "transformer"
+
+    @property
+    def dummy_inputs(self):
+        input_ids = torch.tensor(DUMMY_INPUTS)
+        input_mask = torch.tensor(DUMMY_MASK)
+        dummy_inputs = {'decoder_input_ids': input_ids,
+                        'encoder_input_ids': input_ids,
+                        'decoder_attention_mask': input_mask}
+        return dummy_inputs
+
+    def _init_weights(self, module):
+        """ Initialize the weights """
+        factor = self.config.initializer_factor  # Used for testing weights initialization
+        if isinstance(module, T5LayerNorm):
+            module.weight.data.fill_(factor*1.0)
+        elif isinstance(module, (T5Model, T5WithLMHeadModel)):
+            # Mesh TensorFlow embeddings initialization
+            # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L1624
+            module.shared.weight.data.normal_(mean=0.0, std=factor*1.0)
+        elif isinstance(module, T5DenseReluDense):
+            # Mesh TensorFlow FF initialization
+            # See https://github.com/tensorflow/mesh/blob/master/mesh_tensorflow/transformer/transformer_layers.py#L56
+            # and https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L89
+            module.wi.weight.data.normal_(mean=0.0, std=factor*((self.config.d_model) ** -0.5))
+            if hasattr(module.wi, 'bias') and module.wi.bias is not None:
+                module.wi.bias.data.zero_()
+            module.wo.weight.data.normal_(mean=0.0, std=factor*((self.config.d_ff) ** -0.5))
+            if hasattr(module.wo, 'bias') and module.wo.bias is not None:
+                module.wo.bias.data.zero_()
+        elif isinstance(module, T5Attention):
+            # Mesh TensorFlow attention initialization to avoid scaling before softmax
+            # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/attention.py#L136
+            d_model = self.config.d_model
+            d_kv = self.config.d_kv
+            n_heads = self.config.num_heads
+            module.q.weight.data.normal_(mean=0.0, std=factor*((d_model * d_kv) ** -0.5))
+            module.k.weight.data.normal_(mean=0.0, std=factor*(d_model ** -0.5))
+            module.v.weight.data.normal_(mean=0.0, std=factor*(d_model ** -0.5))
+            module.o.weight.data.normal_(mean=0.0, std=factor*((n_heads * d_kv) ** -0.5))
+            if module.has_relative_attention_bias:
+                module.relative_attention_bias.weight.data.normal_(mean=0.0, std=factor*((d_model) ** -0.5))
+
+
+class T5Stack(T5PreTrainedModel):
+    def __init__(self, config):
+        super(T5Stack, self).__init__(config)
+        self.output_attentions = config.output_attentions
+        self.output_hidden_states = config.output_hidden_states
+        self.is_decoder = config.is_decoder
+
+        self.block = nn.ModuleList([T5Block(config, has_relative_attention_bias=bool(i == 0))
+                                    for i in range(config.num_layers)])
+        self.final_layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
+        self.dropout = nn.Dropout(config.dropout_rate)
+
+        self.init_weights()
+
+    def forward(self,
+                hidden_states,
+                attention_mask=None,
+                encoder_hidden_states=None,
+                encoder_attention_mask=None,
+                head_mask=None):
+
+        batch_size, seq_length = hidden_states.shape[0], hidden_states.shape[1]
+        if attention_mask is None:
+            attention_mask = torch.ones(batch_size, seq_length).to(hidden_states.device)
+        if self.is_decoder and encoder_attention_mask is None:
+            encoder_seq_length = encoder_hidden_states.shape[1]
+            encoder_attention_mask = torch.ones(batch_size, encoder_seq_length).to(hidden_states.device)
+
+        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+        # ourselves in which case we just need to make it broadcastable to all heads.
+        if attention_mask.dim() == 3:
+            extended_attention_mask = attention_mask[:, None, :, :]
+        elif attention_mask.dim() == 2:
+        # Provided a padding mask of dimensions [batch_size, seq_length]
+        # - if the model is a decoder, apply a causal mask in addition to the padding mask
+        # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
+            if self.config.is_decoder:
+                seq_ids = torch.arange(seq_length, device=hidden_states.device)
+                causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]
+                causal_mask = causal_mask.to(attention_mask)
+                extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
+            else:
+                extended_attention_mask = attention_mask[:, None, None, :]
+
+        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+        # masked positions, this operation will create a tensor which is 0.0 for
+        # positions we want to attend and -1e9 for masked positions.
+        # Since we are adding it to the raw scores before the softmax, this is
+        # effectively the same as removing these entirely.
+
+        # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition
+        # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270
+        # extended_attention_mask = (extended_attention_mask == extended_attention_mask.transpose(-1, -2))
+
+        extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
+        extended_attention_mask = (1.0 - extended_attention_mask) * -1e9
+
+        if self.is_decoder:
+            # If a 2D ou 3D attention mask is provided for the cross-attention
+            # we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length]
+            if encoder_attention_mask.dim() == 3:
+                encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
+            if encoder_attention_mask.dim() == 2:
+                encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
+
+            # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition
+            # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270
+            # encoder_extended_attention_mask = (encoder_extended_attention_mask == encoder_extended_attention_mask.transpose(-1, -2))
+
+            encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
+            encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e9
+        else:
+            encoder_extended_attention_mask = None
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        if head_mask is not None:
+            if head_mask.dim() == 1:
+                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
+                head_mask = head_mask.expand(self.config.num_layers, -1, -1, -1, -1)
+            elif head_mask.dim() == 2:
+                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer
+            head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility
+        else:
+            head_mask = [None] * self.config.num_layers
+
+        all_hidden_states = ()
+        all_attentions = ()
+        position_bias = None
+        encoder_decoder_position_bias = None
+
+        hidden_states = self.dropout(hidden_states)
+        for i, layer_module in enumerate(self.block):
+            if self.output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            layer_outputs = layer_module(hidden_states,
+                                         attention_mask=extended_attention_mask,
+                                         position_bias=position_bias,
+                                         encoder_hidden_states=encoder_hidden_states,
+                                         encoder_attention_mask=encoder_extended_attention_mask,
+                                         encoder_decoder_position_bias=encoder_decoder_position_bias,
+                                         head_mask=head_mask[i])
+            # layer_outputs is a tuple with:
+            # hidden-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias)
+            hidden_states = layer_outputs[0]
+            if i == 0:
+                # We share the position biases between the layers - the first layer store them
+                # layer_outputs = hidden-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias)
+                position_bias = layer_outputs[2 if self.output_attentions else 1]
+                if self.is_decoder:
+                    encoder_decoder_position_bias = layer_outputs[4 if self.output_attentions else 2]
+
+            if self.output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)  # We keep only self-attention weights for now
+
+        hidden_states = self.final_layer_norm(hidden_states)
+        layer_output = self.dropout(hidden_states)
+
+        # Add last layer
+        if self.output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        outputs = (hidden_states,)
+        if self.output_hidden_states:
+            outputs = outputs + (all_hidden_states,)
+        if self.output_attentions:
+            outputs = outputs + (all_attentions,)
+        return outputs  # last-layer hidden state, (all hidden states), (all attentions)
+
+
+T5_START_DOCSTRING = r"""    The T5 model was proposed in
+    `Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`_
+    by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu.
+    It's an encoder decoder transformer pre-trained in a text-to-text denoising generative setting.
+
+    This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and
+    refer to the PyTorch documentation for all matter related to general usage and behavior.
+
+    .. _`Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`:
+        https://arxiv.org/abs/1910.10683
+
+    .. _`torch.nn.Module`:
+        https://pytorch.org/docs/stable/nn.html#module
+
+    Parameters:
+        config (:class:`~transformers.T5Config`): Model configuration class with all the parameters of the model. 
+            Initializing with a config file does not load the weights associated with the model, only the configuration.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+"""
+
+T5_INPUTS_DOCSTRING = r"""
+    Inputs:
+        **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of input sequence tokens in the vocabulary.
+            To match pre-training, T5 input sequence should be formatted with [CLS] and [SEP] tokens as follows:
+
+            (a) For sequence pairs:
+
+                ``tokens:         [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]``
+
+            (b) For single sequences:
+
+                ``tokens:         [CLS] the dog is hairy . [SEP]``
+
+            T5 is a model with relative position embeddings so you should be able to pad the inputs on
+            the right or the left.
+
+            Indices can be obtained using :class:`transformers.T5Tokenizer`.
+            See :func:`transformers.PreTrainedTokenizer.encode` and
+            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
+            Mask to avoid performing attention on padding token indices.
+            Mask values selected in ``[0, 1]``:
+            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
+        **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+            Mask to nullify selected heads of the self-attention modules.
+            Mask values selected in ``[0, 1]``:
+            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+"""
+
+@add_start_docstrings("The bare T5 Model transformer outputting raw hidden-states"
+                      "without any specific head on top.",
+                      T5_START_DOCSTRING, T5_INPUTS_DOCSTRING)
+class T5Model(T5PreTrainedModel):
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
+            Sequence of hidden-states at the output of the last layer of the model.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        tokenizer = T5Tokenizer.from_pretrained('t5-small')
+        model = T5Model.from_pretrained('t5-small')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids=input_ids)
+        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+
+    """
+    def __init__(self, config):
+        super(T5Model, self).__init__(config)
+        self.shared = nn.Embedding(config.vocab_size, config.d_model)
+
+        encoder_config = copy.deepcopy(config)
+        self.encoder = T5Stack(encoder_config)
+
+        decoder_config = copy.deepcopy(config)
+        decoder_config.is_decoder = True
+        self.decoder = T5Stack(decoder_config)
+
+        self.init_weights()
+
+    def get_input_embeddings(self):
+        return self.shared
+
+    def set_input_embeddings(self, new_embeddings):
+        self.shared = new_embeddings
+
+    def _prune_heads(self, heads_to_prune):
+        """ Prunes heads of the model.
+            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+            See base class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+    def forward(self, **kwargs):
+        # keyword arguments come in 3 flavors: encoder-specific (prefixed by
+        # `encoder_`), decoder-specific (prefixed by `decoder_`) and those
+        # that apply to the model as whole.
+        # We let the specific kwargs override the common ones in case of conflict.
+        kwargs_common = dict((k, v) for k, v in kwargs.items()
+                             if not k.startswith("encoder_") and not k.startswith("decoder_"))
+        kwargs_encoder = kwargs_common.copy()
+        kwargs_decoder = kwargs_common.copy()
+        kwargs_encoder.update(dict((k[len("encoder_"):], v) for k, v in kwargs.items() if k.startswith("encoder_")))
+        kwargs_decoder.update(dict((k[len("decoder_"):], v) for k, v in kwargs.items() if k.startswith("decoder_")))
+
+        # Encode if needed (training, first prediction pass)
+        encoder_hidden_states = kwargs_encoder.pop("hidden_states", None)
+        encoder_attention_mask = kwargs_encoder.get("attention_mask", None)
+        if encoder_hidden_states is None:
+            # Convert encoder inputs in embeddings if needed
+            hidden_states = kwargs_encoder.pop("inputs_embeds", None)
+            if hidden_states is None:
+                encoder_inputs_ids = kwargs_encoder.pop("input_ids")
+                hidden_states = self.shared(encoder_inputs_ids)  # Convert inputs in embeddings
+
+            if encoder_attention_mask is not None:
+                # Apply masking
+                encoder_attention_mask = (encoder_attention_mask != 0).to(hidden_states)
+                hidden_states = hidden_states * encoder_attention_mask.unsqueeze(-1)
+
+            encoder_outputs = self.encoder(hidden_states, **kwargs_encoder)
+            encoder_hidden_states = encoder_outputs[0]
+        else:
+            encoder_outputs = ()
+
+        # Decode
+        # Convert decoder inputs in embeddings if needed
+        hidden_states = kwargs_decoder.pop("inputs_embeds", None)
+        if hidden_states is None:
+            decoder_inputs_ids = kwargs_decoder.pop("input_ids")
+            hidden_states = self.shared(decoder_inputs_ids)
+
+        kwargs_decoder["encoder_hidden_states"] = encoder_hidden_states
+        kwargs_decoder["encoder_attention_mask"] = encoder_attention_mask
+        decoder_outputs = self.decoder(hidden_states, **kwargs_decoder)
+
+        return decoder_outputs + encoder_outputs
+
+
+@add_start_docstrings("""T5 Model with a `language modeling` head on top. """,
+    T5_START_DOCSTRING, T5_INPUTS_DOCSTRING)
+class T5WithLMHeadModel(T5PreTrainedModel):
+    r"""
+        **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Labels for computing the masked language modeling loss.
+            Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
+            Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
+            in ``[0, ..., config.vocab_size]``
+
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Masked language modeling loss.
+        **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        tokenizer = T5Tokenizer.from_pretrained('t5-small')
+        model = T5WithLMHeadModel.from_pretrained('t5-small')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids=input_ids, lm_labels=input_ids)
+        loss, prediction_scores = outputs[:2]
+
+    """
+    def __init__(self, config):
+        super(T5WithLMHeadModel, self).__init__(config)
+        self.model_dim = config.d_model
+
+        self.shared = nn.Embedding(config.vocab_size, config.d_model)
+
+        encoder_config = copy.deepcopy(config)
+        self.encoder = T5Stack(encoder_config)
+
+        decoder_config = copy.deepcopy(config)
+        decoder_config.is_decoder = True
+        self.decoder = T5Stack(decoder_config)
+
+        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
+
+        self.init_weights()
+
+    def get_input_embeddings(self):
+        return self.shared
+
+    def set_input_embeddings(self, new_embeddings):
+        self.shared = new_embeddings
+
+    def get_output_embeddings(self):
+        return self.lm_head
+
+    def forward(self, **kwargs):
+        # keyword arguments come in 3 flavors: encoder-specific (prefixed by
+        # `encoder_`), decoder-specific (prefixed by `decoder_`) and those
+        # that apply to the model as whole.
+        # We let the specific kwargs override the common ones in case of conflict.
+
+        lm_labels = kwargs.pop('decoder_lm_labels', None)
+
+        kwargs_common = dict((k, v) for k, v in kwargs.items()
+                             if not k.startswith("encoder_") and not k.startswith("decoder_"))
+        kwargs_encoder = kwargs_common.copy()
+        kwargs_decoder = kwargs_common.copy()
+        kwargs_encoder.update(dict((k[len("encoder_"):], v) for k, v in kwargs.items() if k.startswith("encoder_")))
+        kwargs_decoder.update(dict((k[len("decoder_"):], v) for k, v in kwargs.items() if k.startswith("decoder_")))
+
+        # Encode if needed (training, first prediction pass)
+        encoder_hidden_states = kwargs_encoder.pop("hidden_states", None)
+        if encoder_hidden_states is None:
+            # Convert encoder inputs in embeddings if needed
+            hidden_states = kwargs_encoder.pop("inputs_embeds", None)
+            if hidden_states is None:
+                encoder_inputs_ids = kwargs_encoder.pop("input_ids")
+                hidden_states = self.shared(encoder_inputs_ids)  # Convert inputs in embeddings
+
+            encoder_outputs = self.encoder(hidden_states, **kwargs_encoder)
+            encoder_hidden_states = encoder_outputs[0]
+        else:
+            encoder_outputs = ()
+
+        # Decode
+        # Convert decoder inputs in embeddings if needed
+        hidden_states = kwargs_decoder.pop("inputs_embeds", None)
+        if hidden_states is None:
+            decoder_inputs_ids = kwargs_decoder.pop("input_ids")
+            hidden_states = self.shared(decoder_inputs_ids)
+
+        kwargs_decoder["encoder_hidden_states"] = encoder_hidden_states
+        kwargs_decoder["encoder_attention_mask"] = kwargs_encoder.get("attention_mask", None)
+        decoder_outputs = self.decoder(hidden_states, **kwargs_decoder)
+
+        sequence_output = decoder_outputs[0]
+        # Rescale output before projecting on vocab
+        # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586
+        sequence_output = sequence_output * (self.model_dim ** -0.5)
+        lm_logits = self.lm_head(sequence_output)
+
+        decoder_outputs = (lm_logits,) + decoder_outputs[1:]  # Add hidden states and attention if they are here
+        if lm_labels is not None:
+            shift_logits = lm_logits[..., :-1, :].contiguous()
+            shift_labels = lm_labels[..., 1:].contiguous()
+            loss_fct = CrossEntropyLoss(ignore_index=-1)
+            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
+                            shift_labels.view(-1))
+            decoder_outputs = (loss,) + decoder_outputs  # TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666
+
+        return decoder_outputs + encoder_outputs
diff --git a/transformers/modeling_tf_albert.py b/transformers/modeling_tf_albert.py
index d1650d41a8..ac55a73fa3 100644
--- a/transformers/modeling_tf_albert.py
+++ b/transformers/modeling_tf_albert.py
@@ -587,8 +587,8 @@ class TFAlbertModel(TFAlbertPreTrainedModel):
         import tensorflow as tf
         from transformers import AlbertTokenizer, TFAlbertModel
 
-        tokenizer = AlbertTokenizer.from_pretrained('bert-base-uncased')
-        model = TFAlbertModel.from_pretrained('bert-base-uncased')
+        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v1')
+        model = TFAlbertModel.from_pretrained('albert-base-v1')
         input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
         outputs = model(input_ids)
         last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
diff --git a/transformers/modeling_tf_auto.py b/transformers/modeling_tf_auto.py
index cfe19ead2a..031ffea17e 100644
--- a/transformers/modeling_tf_auto.py
+++ b/transformers/modeling_tf_auto.py
@@ -18,21 +18,48 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 
 import logging
 
-from .modeling_tf_bert import TFBertModel, TFBertForMaskedLM, TFBertForSequenceClassification, TFBertForQuestionAnswering
-from .modeling_tf_openai import TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel
-from .modeling_tf_gpt2 import TFGPT2Model, TFGPT2LMHeadModel
-from .modeling_tf_transfo_xl import TFTransfoXLModel, TFTransfoXLLMHeadModel
-from .modeling_tf_xlnet import TFXLNetModel, TFXLNetLMHeadModel, TFXLNetForSequenceClassification, TFXLNetForQuestionAnsweringSimple
-from .modeling_tf_xlm import TFXLMModel, TFXLMWithLMHeadModel, TFXLMForSequenceClassification, TFXLMForQuestionAnsweringSimple
-from .modeling_tf_roberta import TFRobertaModel, TFRobertaForMaskedLM, TFRobertaForSequenceClassification
-from .modeling_tf_distilbert import TFDistilBertModel, TFDistilBertForQuestionAnswering, TFDistilBertForMaskedLM, TFDistilBertForSequenceClassification
-from .modeling_tf_ctrl import TFCTRLModel, TFCTRLLMHeadModel
+from .configuration_auto import (BertConfig, CTRLConfig, DistilBertConfig,
+                                 GPT2Config, OpenAIGPTConfig, RobertaConfig,
+                                 TransfoXLConfig, XLMConfig, XLNetConfig)
+
+from .modeling_tf_bert import TFBertModel, TFBertForMaskedLM, TFBertForSequenceClassification, \
+    TFBertForQuestionAnswering, TFBertForTokenClassification, TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_tf_openai import TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel, TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_tf_gpt2 import TFGPT2Model, TFGPT2LMHeadModel, TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_tf_transfo_xl import TFTransfoXLModel, TFTransfoXLLMHeadModel, TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_tf_xlnet import TFXLNetModel, TFXLNetLMHeadModel, TFXLNetForSequenceClassification, \
+    TFXLNetForQuestionAnsweringSimple, TFXLNetForTokenClassification, TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_tf_xlm import TFXLMModel, TFXLMWithLMHeadModel, TFXLMForSequenceClassification, \
+    TFXLMForQuestionAnsweringSimple, TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_tf_roberta import TFRobertaModel, TFRobertaForMaskedLM, TFRobertaForSequenceClassification, \
+    TFRobertaForTokenClassification, TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_tf_distilbert import TFDistilBertModel, TFDistilBertForQuestionAnswering, TFDistilBertForMaskedLM, TFDistilBertForSequenceClassification, TFDistilBertForTokenClassification, TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_tf_ctrl import TFCTRLModel, TFCTRLLMHeadModel, TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_tf_albert import TFAlbertModel, TFAlbertForMaskedLM, TFAlbertForSequenceClassification, TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_tf_t5 import TFT5Model, TFT5WithLMHeadModel, TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP
 
 from .file_utils import add_start_docstrings
 
 logger = logging.getLogger(__name__)
 
 
+TF_ALL_PRETRAINED_MODEL_ARCHIVE_MAP = dict((key, value)
+    for pretrained_map in [
+        TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP,
+        TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
+        TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
+        TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
+        TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP,
+        TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
+        TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP,
+        ]
+    for key, value, in pretrained_map.items())
+
+
 class TFAutoModel(object):
     r"""
         :class:`~transformers.TFAutoModel` is a generic model class
@@ -45,6 +72,7 @@ class TFAutoModel(object):
 
         The base model class to instantiate is selected as the first pattern matching
         in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `t5`: TFT5Model (T5 model)
             - contains `distilbert`: TFDistilBertModel (DistilBERT model)
             - contains `roberta`: TFRobertaModel (RoBERTa model)
             - contains `bert`: TFBertModel (Bert model)
@@ -59,7 +87,50 @@ class TFAutoModel(object):
     """
     def __init__(self):
         raise EnvironmentError("TFAutoModel is designed to be instantiated "
-            "using the `TFAutoModel.from_pretrained(pretrained_model_name_or_path)` method.")
+            "using the `TFAutoModel.from_pretrained(pretrained_model_name_or_path)` or "
+            "`TFAutoModel.from_config(config)` methods.")
+
+    @classmethod
+    def from_config(cls, config):
+        r""" Instantiates one of the base model classes of the library
+        from a configuration.
+
+            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
+                The model class to instantiate is selected based on the configuration class:
+                    - isInstance of `distilbert` configuration class: TFDistilBertModel (DistilBERT model)
+                    - isInstance of `roberta` configuration class: TFRobertaModel (RoBERTa model)
+                    - isInstance of `bert` configuration class: TFBertModel (Bert model)
+                    - isInstance of `openai-gpt` configuration class: TFOpenAIGPTModel (OpenAI GPT model)
+                    - isInstance of `gpt2` configuration class: TFGPT2Model (OpenAI GPT-2 model)
+                    - isInstance of `ctrl` configuration class: TFCTRLModel (Salesforce CTRL  model)
+                    - isInstance of `transfo-xl` configuration class: TFTransfoXLModel (Transformer-XL model)
+                    - isInstance of `xlnet` configuration class: TFXLNetModel (XLNet model)
+                    - isInstance of `xlm` configuration class: TFXLMModel (XLM model)
+
+        Examples::
+
+            config = BertConfig.from_pretrained('bert-base-uncased')    # Download configuration from S3 and cache.
+            model = TFAutoModel.from_config(config)  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+        """
+        if isinstance(config, DistilBertConfig):
+            return TFDistilBertModel(config)
+        elif isinstance(config, RobertaConfig):
+            return TFRobertaModel(config)
+        elif isinstance(config, BertConfig):
+            return TFBertModel(config)
+        elif isinstance(config, OpenAIGPTConfig):
+            return TFOpenAIGPTModel(config)
+        elif isinstance(config, GPT2Config):
+            return TFGPT2Model(config)
+        elif isinstance(config, TransfoXLConfig):
+            return TFTransfoXLModel(config)
+        elif isinstance(config, XLNetConfig):
+            return TFXLNetModel(config)
+        elif isinstance(config, XLMConfig):
+            return TFXLMModel(config)
+        elif isinstance(config, CTRLConfig):
+            return TFCTRLModel(config)
+        raise ValueError("Unrecognized configuration class {}".format(config))
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
@@ -68,6 +139,7 @@ class TFAutoModel(object):
 
         The model class to instantiate is selected as the first pattern matching
         in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `t5`: TFT5Model (T5 model)
             - contains `distilbert`: TFDistilBertModel (DistilBERT model)
             - contains `roberta`: TFRobertaModel (RoBERTa model)
             - contains `bert`: TFTFBertModel (Bert model)
@@ -81,6 +153,7 @@ class TFAutoModel(object):
             pretrained_model_name_or_path: either:
 
                 - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                 - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                 - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument.
 
@@ -136,8 +209,12 @@ class TFAutoModel(object):
             model = TFAutoModel.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config)
 
         """
-        if 'distilbert' in pretrained_model_name_or_path:
+        if 't5' in pretrained_model_name_or_path:
+            return TFT5Model.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'distilbert' in pretrained_model_name_or_path:
             return TFDistilBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'albert' in pretrained_model_name_or_path:
+            return TFAlbertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'roberta' in pretrained_model_name_or_path:
             return TFRobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'bert' in pretrained_model_name_or_path:
@@ -156,7 +233,7 @@ class TFAutoModel(object):
             return TFCTRLModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
 
         raise ValueError("Unrecognized model identifier in {}. Should contains one of "
-                         "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
+                         "'distilbert', 'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
                          "'xlm', 'roberta', 'ctrl'".format(pretrained_model_name_or_path))
 
 
@@ -172,6 +249,7 @@ class TFAutoModelWithLMHead(object):
 
         The model class to instantiate is selected as the first pattern matching
         in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `t5`: TFT5WithLMHeadModel (T5 model)
             - contains `distilbert`: TFDistilBertForMaskedLM (DistilBERT model)
             - contains `roberta`: TFRobertaForMaskedLM (RoBERTa model)
             - contains `bert`: TFBertForMaskedLM (Bert model)
@@ -186,7 +264,50 @@ class TFAutoModelWithLMHead(object):
     """
     def __init__(self):
         raise EnvironmentError("TFAutoModelWithLMHead is designed to be instantiated "
-            "using the `TFAutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` method.")
+            "using the `TFAutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` or "
+            "`TFAutoModelWithLMHead.from_config(config)` methods.")
+
+    @classmethod
+    def from_config(cls, config):
+        r""" Instantiates one of the base model classes of the library
+        from a configuration.
+
+            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
+                The model class to instantiate is selected based on the configuration class:
+                    - isInstance of `distilbert` configuration class: DistilBertModel (DistilBERT model)
+                    - isInstance of `roberta` configuration class: RobertaModel (RoBERTa model)
+                    - isInstance of `bert` configuration class: BertModel (Bert model)
+                    - isInstance of `openai-gpt` configuration class: OpenAIGPTModel (OpenAI GPT model)
+                    - isInstance of `gpt2` configuration class: GPT2Model (OpenAI GPT-2 model)
+                    - isInstance of `ctrl` configuration class: CTRLModel (Salesforce CTRL  model)
+                    - isInstance of `transfo-xl` configuration class: TransfoXLModel (Transformer-XL model)
+                    - isInstance of `xlnet` configuration class: XLNetModel (XLNet model)
+                    - isInstance of `xlm` configuration class: XLMModel (XLM model)
+
+        Examples::
+
+            config = BertConfig.from_pretrained('bert-base-uncased')    # Download configuration from S3 and cache.
+            model = AutoModelWithLMHead.from_config(config)  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+        """
+        if isinstance(config, DistilBertConfig):
+            return TFDistilBertForMaskedLM(config)
+        elif isinstance(config, RobertaConfig):
+            return TFRobertaForMaskedLM(config)
+        elif isinstance(config, BertConfig):
+            return TFBertForMaskedLM(config)
+        elif isinstance(config, OpenAIGPTConfig):
+            return TFOpenAIGPTLMHeadModel(config)
+        elif isinstance(config, GPT2Config):
+            return TFGPT2LMHeadModel(config)
+        elif isinstance(config, TransfoXLConfig):
+            return TFTransfoXLLMHeadModel(config)
+        elif isinstance(config, XLNetConfig):
+            return TFXLNetLMHeadModel(config)
+        elif isinstance(config, XLMConfig):
+            return TFXLMWithLMHeadModel(config)
+        elif isinstance(config, CTRLConfig):
+            return TFCTRLLMHeadModel(config)
+        raise ValueError("Unrecognized configuration class {}".format(config))
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
@@ -198,6 +319,7 @@ class TFAutoModelWithLMHead(object):
 
         The model class to instantiate is selected as the first pattern matching
         in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `t5`: TFT5WithLMHeadModel (T5 model)
             - contains `distilbert`: TFDistilBertForMaskedLM (DistilBERT model)
             - contains `roberta`: TFRobertaForMaskedLM (RoBERTa model)
             - contains `bert`: TFBertForMaskedLM (Bert model)
@@ -212,6 +334,7 @@ class TFAutoModelWithLMHead(object):
             pretrained_model_name_or_path: either:
 
                 - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                 - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                 - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument.
 
@@ -267,8 +390,12 @@ class TFAutoModelWithLMHead(object):
             model = TFAutoModelWithLMHead.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config)
 
         """
-        if 'distilbert' in pretrained_model_name_or_path:
+        if 't5' in pretrained_model_name_or_path:
+            return TFT5WithLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'distilbert' in pretrained_model_name_or_path:
             return TFDistilBertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'albert' in pretrained_model_name_or_path:
+            return TFAlbertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'roberta' in pretrained_model_name_or_path:
             return TFRobertaForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'bert' in pretrained_model_name_or_path:
@@ -287,7 +414,7 @@ class TFAutoModelWithLMHead(object):
             return TFCTRLLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
 
         raise ValueError("Unrecognized model identifier in {}. Should contains one of "
-                         "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
+                         "'distilbert', 'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
                          "'xlm', 'roberta', 'ctrl'".format(pretrained_model_name_or_path))
 
 
@@ -312,8 +439,39 @@ class TFAutoModelForSequenceClassification(object):
         This class cannot be instantiated using `__init__()` (throws an error).
     """
     def __init__(self):
-        raise EnvironmentError("TFAutoModelWithLMHead is designed to be instantiated "
-            "using the `TFAutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` method.")
+        raise EnvironmentError("TFAutoModelForSequenceClassification is designed to be instantiated "
+            "using the `TFAutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)` or "
+            "`TFAutoModelForSequenceClassification.from_config(config)` methods.")
+
+    @classmethod
+    def from_config(cls, config):
+        r""" Instantiates one of the base model classes of the library
+        from a configuration.
+
+            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
+                The model class to instantiate is selected based on the configuration class:
+                    - isInstance of `distilbert` configuration class: DistilBertModel (DistilBERT model)
+                    - isInstance of `roberta` configuration class: RobertaModel (RoBERTa model)
+                    - isInstance of `bert` configuration class: BertModel (Bert model)
+                    - isInstance of `xlnet` configuration class: XLNetModel (XLNet model)
+                    - isInstance of `xlm` configuration class: XLMModel (XLM model)
+
+        Examples::
+
+            config = BertConfig.from_pretrained('bert-base-uncased')    # Download configuration from S3 and cache.
+            model = AutoModelForSequenceClassification.from_config(config)  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+        """
+        if isinstance(config, DistilBertConfig):
+            return TFDistilBertForSequenceClassification(config)
+        elif isinstance(config, RobertaConfig):
+            return TFRobertaForSequenceClassification(config)
+        elif isinstance(config, BertConfig):
+            return TFBertForSequenceClassification(config)
+        elif isinstance(config, XLNetConfig):
+            return TFXLNetForSequenceClassification(config)
+        elif isinstance(config, XLMConfig):
+            return TFXLMForSequenceClassification(config)
+        raise ValueError("Unrecognized configuration class {}".format(config))
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
@@ -338,6 +496,7 @@ class TFAutoModelForSequenceClassification(object):
             pretrained_model_name_or_path: either:
 
                 - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                 - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                 - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument.
 
@@ -395,6 +554,8 @@ class TFAutoModelForSequenceClassification(object):
         """
         if 'distilbert' in pretrained_model_name_or_path:
             return TFDistilBertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'albert' in pretrained_model_name_or_path:
+            return TFAlbertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'roberta' in pretrained_model_name_or_path:
             return TFRobertaForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'bert' in pretrained_model_name_or_path:
@@ -405,7 +566,7 @@ class TFAutoModelForSequenceClassification(object):
             return TFXLMForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
 
         raise ValueError("Unrecognized model identifier in {}. Should contains one of "
-                         "'bert', 'xlnet', 'xlm', 'roberta'".format(pretrained_model_name_or_path))
+                         "'distilbert', 'bert', 'xlnet', 'xlm', 'roberta'".format(pretrained_model_name_or_path))
 
 
 class TFAutoModelForQuestionAnswering(object):
@@ -428,8 +589,36 @@ class TFAutoModelForQuestionAnswering(object):
         This class cannot be instantiated using `__init__()` (throws an error).
     """
     def __init__(self):
-        raise EnvironmentError("TFAutoModelWithLMHead is designed to be instantiated "
-            "using the `TFAutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` method.")
+        raise EnvironmentError("TFAutoModelForQuestionAnswering is designed to be instantiated "
+            "using the `TFAutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)` or "
+            "`TFAutoModelForQuestionAnswering.from_config(config)` methods.")
+
+    @classmethod
+    def from_config(cls, config):
+        r""" Instantiates one of the base model classes of the library
+        from a configuration.
+
+            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
+                The model class to instantiate is selected based on the configuration class:
+                    - isInstance of `distilbert` configuration class: DistilBertModel (DistilBERT model)
+                    - isInstance of `bert` configuration class: BertModel (Bert model)
+                    - isInstance of `xlnet` configuration class: XLNetModel (XLNet model)
+                    - isInstance of `xlm` configuration class: XLMModel (XLM model)
+
+        Examples::
+
+            config = BertConfig.from_pretrained('bert-base-uncased')    # Download configuration from S3 and cache.
+            model = AutoModelForSequenceClassification.from_config(config)  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+        """
+        if isinstance(config, DistilBertConfig):
+            return TFDistilBertForQuestionAnswering(config)
+        elif isinstance(config, BertConfig):
+            return TFBertForQuestionAnswering(config)
+        elif isinstance(config, XLNetConfig):
+            return TFXLNetForQuestionAnswering(config)
+        elif isinstance(config, XLMConfig):
+            return TFXLMForQuestionAnswering(config)
+        raise ValueError("Unrecognized configuration class {}".format(config))
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
@@ -453,6 +642,7 @@ class TFAutoModelForQuestionAnswering(object):
             pretrained_model_name_or_path: either:
 
                 - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                 - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                 - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument.
 
@@ -518,4 +708,121 @@ class TFAutoModelForQuestionAnswering(object):
             return TFXLMForQuestionAnsweringSimple.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
 
         raise ValueError("Unrecognized model identifier in {}. Should contains one of "
-                         "'bert', 'xlnet', 'xlm'".format(pretrained_model_name_or_path))
+                         "'distilbert', 'bert', 'xlnet', 'xlm'".format(pretrained_model_name_or_path))
+
+
+class TFAutoModelForTokenClassification:
+    def __init__(self):
+        raise EnvironmentError("TFAutoModelForTokenClassification is designed to be instantiated "
+                               "using the `TFAutoModelForTokenClassification.from_pretrained(pretrained_model_name_or_path)` or "
+                               "`AutoModelForTokenClassification.from_config(config)` methods.")
+
+    @classmethod
+    def from_config(cls, config):
+        r""" Instantiates one of the base model classes of the library
+        from a configuration.
+
+            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
+                The model class to instantiate is selected based on the configuration class:
+                    - isInstance of `bert` configuration class: BertModel (Bert model)
+                    - isInstance of `xlnet` configuration class: XLNetModel (XLNet model)
+                    - isInstance of `distilbert` configuration class: DistilBertModel (DistilBert model)
+                    - isInstance of `roberta` configuration class: RobteraModel (Roberta model)
+
+        Examples::
+
+            config = BertConfig.from_pretrained('bert-base-uncased')    # Download configuration from S3 and cache.
+            model = TFAutoModelForTokenClassification.from_config(config)  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+        """
+        if isinstance(config, BertConfig):
+            return TFBertForTokenClassification(config)
+        elif isinstance(config, XLNetConfig):
+            return TFXLNetForTokenClassification(config)
+        elif isinstance(config, DistilBertConfig):
+            return TFDistilBertForTokenClassification(config)
+        elif isinstance(config, RobertaConfig):
+            return TFRobertaForTokenClassification(config)
+        raise ValueError("Unrecognized configuration class {}".format(config))
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
+        r""" Instantiates one of the question answering model classes of the library
+        from a pre-trained model configuration.
+
+        The `from_pretrained()` method takes care of returning the correct model class instance
+        using pattern matching on the `pretrained_model_name_or_path` string.
+
+        The model class to instantiate is selected as the first pattern matching
+        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `bert`: BertForTokenClassification (Bert model)
+            - contains `xlnet`: XLNetForTokenClassification (XLNet model)
+            - contains `distilbert`: DistilBertForTokenClassification (DistilBert model)
+            - contains `roberta`: RobertaForTokenClassification (Roberta model)
+
+        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
+        To train the model, you should first set it back in training mode with `model.train()`
+
+        Params:
+            pretrained_model_name_or_path: either:
+
+                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
+                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
+
+            model_args: (`optional`) Sequence of positional arguments:
+                All remaning positional arguments will be passed to the underlying model's ``__init__`` method
+
+            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
+                Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
+
+                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
+                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
+                - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
+
+            state_dict: (`optional`) dict:
+                an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
+                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
+                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
+
+            cache_dir: (`optional`) string:
+                Path to a directory in which a downloaded pre-trained model
+                configuration should be cached if the standard cache should not be used.
+
+            force_download: (`optional`) boolean, default False:
+                Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
+
+            proxies: (`optional`) dict, default None:
+                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
+                The proxies are used on each request.
+
+            output_loading_info: (`optional`) boolean:
+                Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages.
+
+            kwargs: (`optional`) Remaining dictionary of keyword arguments:
+                Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
+
+                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
+                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
+
+        Examples::
+
+            model = TFAutoModelForTokenClassification.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
+            model = TFAutoModelForTokenClassification.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+            model = TFAutoModelForTokenClassification.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
+            assert model.config.output_attention == True
+            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
+            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
+            model = TFAutoModelForTokenClassification.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+
+        """
+        if 'bert' in pretrained_model_name_or_path:
+            return TFBertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'xlnet' in pretrained_model_name_or_path:
+            return TFXLNetForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'distilbert' in pretrained_model_name_or_path:
+            return TFDistilBertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'roberta' in pretrained_model_name_or_path:
+            return TFRobertaForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+
+        raise ValueError("Unrecognized model identifier in {}. Should contains one of "
+                         "'bert', 'xlnet', 'distilbert', 'roberta'".format(pretrained_model_name_or_path))
diff --git a/transformers/modeling_tf_bert.py b/transformers/modeling_tf_bert.py
index 5aa7bb3da2..9caad53a5f 100644
--- a/transformers/modeling_tf_bert.py
+++ b/transformers/modeling_tf_bert.py
@@ -48,6 +48,12 @@ TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
     'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-tf_model.h5",
     'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-tf_model.h5",
     'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-tf_model.h5",
+    'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-tf_model.h5",
+    'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-tf_model.h5",
+    'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-tf_model.h5",
+    'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-tf_model.h5",
+    'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/tf_model.h5",
+    'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/tf_model.h5",
 }
 
 
@@ -129,7 +135,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
                 linear tensor, float32 with shape [batch_size, length, vocab_size].
         Raises:
             ValueError: if mode is not valid.
-        
+
         Shared weights logic adapted from
             https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
         """
@@ -148,7 +154,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
             input_shape = shape_list(input_ids)
         else:
             input_shape = shape_list(inputs_embeds)[:-1]
-        
+
         seq_length = input_shape[1]
         if position_ids is None:
             position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :]
@@ -246,7 +252,7 @@ class TFBertSelfAttention(tf.keras.layers.Layer):
         context_layer = tf.matmul(attention_probs, value_layer)
 
         context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])
-        context_layer = tf.reshape(context_layer, 
+        context_layer = tf.reshape(context_layer,
                                   (batch_size, -1, self.all_head_size))  # (batch_size, seq_len_q, all_head_size)
 
         outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,)
@@ -591,7 +597,7 @@ BERT_START_DOCSTRING = r"""    The BERT model was proposed in
             `model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
 
     Parameters:
-        config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model. 
+        config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model.
             Initializing with a config file does not load the weights associated with the model, only the configuration.
             Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
@@ -605,13 +611,13 @@ BERT_INPUTS_DOCSTRING = r"""
             (a) For sequence pairs:
 
                 ``tokens:         [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]``
-                
+
                 ``token_type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1``
 
             (b) For single sequences:
 
                 ``tokens:         [CLS] the dog is hairy . [SEP]``
-                
+
                 ``token_type_ids:   0   0   0   0  0     0   0``
 
             Bert is a model with absolute position embeddings so it's usually advised to pad the inputs on
@@ -671,7 +677,7 @@ class TFBertModel(TFBertPreTrainedModel):
 
         tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
         model = TFBertModel.from_pretrained('bert-base-uncased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
         outputs = model(input_ids)
         last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
@@ -710,7 +716,7 @@ class TFBertForPreTraining(TFBertPreTrainedModel):
 
         tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
         model = TFBertForPreTraining.from_pretrained('bert-base-uncased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
         outputs = model(input_ids)
         prediction_scores, seq_relationship_scores = outputs[:2]
 
@@ -759,7 +765,7 @@ class TFBertForMaskedLM(TFBertPreTrainedModel):
 
         tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
         model = TFBertForMaskedLM.from_pretrained('bert-base-uncased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
         outputs = model(input_ids)
         prediction_scores = outputs[0]
 
@@ -806,7 +812,7 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel):
 
         tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
         model = TFBertForNextSentencePrediction.from_pretrained('bert-base-uncased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
         outputs = model(input_ids)
         seq_relationship_scores = outputs[0]
 
@@ -851,7 +857,7 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel):
 
         tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
         model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
         outputs = model(input_ids)
         logits = outputs[0]
 
@@ -988,7 +994,7 @@ class TFBertForTokenClassification(TFBertPreTrainedModel):
 
         tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
         model = TFBertForTokenClassification.from_pretrained('bert-base-uncased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
         outputs = model(input_ids)
         scores = outputs[0]
 
@@ -1041,7 +1047,7 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel):
 
         tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
         model = TFBertForQuestionAnswering.from_pretrained('bert-base-uncased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
         outputs = model(input_ids)
         start_scores, end_scores = outputs[:2]
 
diff --git a/transformers/modeling_tf_ctrl.py b/transformers/modeling_tf_ctrl.py
index 29ee5113a4..0f9b34924f 100644
--- a/transformers/modeling_tf_ctrl.py
+++ b/transformers/modeling_tf_ctrl.py
@@ -418,7 +418,7 @@ class TFCTRLModel(TFCTRLPreTrainedModel):
 
         tokenizer = CTRLTokenizer.from_pretrained('ctrl')
         model = TFCTRLModel.from_pretrained('ctrl')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
         outputs = model(input_ids)
         last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
@@ -481,7 +481,7 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel):
         tokenizer = CTRLTokenizer.from_pretrained('ctrl')
         model = TFCTRLLMHeadModel.from_pretrained('ctrl')
 
-        input_ids = torch.tensor(tokenizer.encode("Links Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        input_ids = torch.tensor(tokenizer.encode("Links Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
         outputs = model(input_ids, labels=input_ids)
         loss, logits = outputs[:2]
 
diff --git a/transformers/modeling_tf_gpt2.py b/transformers/modeling_tf_gpt2.py
index c738e5e8e3..718e8f6058 100644
--- a/transformers/modeling_tf_gpt2.py
+++ b/transformers/modeling_tf_gpt2.py
@@ -454,7 +454,7 @@ class TFGPT2Model(TFGPT2PreTrainedModel):
 
         tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
         model = TFGPT2Model.from_pretrained('gpt2')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
         outputs = model(input_ids)
         last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
@@ -495,7 +495,7 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):
         tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
         model = TFGPT2LMHeadModel.from_pretrained('gpt2')
 
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
         outputs = model(input_ids)
         logits = outputs[0]
 
@@ -574,6 +574,7 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
     """
     def __init__(self, config, *inputs, **kwargs):
         super(TFGPT2DoubleHeadsModel, self).__init__(config, *inputs, **kwargs)
+        config.num_labels = 1
         self.transformer = TFGPT2MainLayer(config, name='transformer')
         self.multiple_choice_head = TFSequenceSummary(config, initializer_range=config.initializer_range, name='multiple_choice_head')
 
diff --git a/transformers/modeling_tf_openai.py b/transformers/modeling_tf_openai.py
index dac3b17590..791c6dcc18 100644
--- a/transformers/modeling_tf_openai.py
+++ b/transformers/modeling_tf_openai.py
@@ -431,7 +431,7 @@ class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel):
 
         tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
         model = TFOpenAIGPTModel.from_pretrained('openai-gpt')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
         outputs = model(input_ids)
         last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
@@ -467,7 +467,7 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel):
 
         tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
         model = TFOpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
         outputs = model(input_ids)
         logits = outputs[0]
 
@@ -538,6 +538,7 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
     """
     def __init__(self, config, *inputs, **kwargs):
         super(TFOpenAIGPTDoubleHeadsModel, self).__init__(config, *inputs, **kwargs)
+        config.num_labels = 1
         self.transformer = TFOpenAIGPTMainLayer(config, name='transformer')
         self.multiple_choice_head = TFSequenceSummary(config, initializer_range=config.initializer_range, name='multiple_choice_head')
 
diff --git a/transformers/modeling_tf_pytorch_utils.py b/transformers/modeling_tf_pytorch_utils.py
index aa74fcc10e..190caff18d 100644
--- a/transformers/modeling_tf_pytorch_utils.py
+++ b/transformers/modeling_tf_pytorch_utils.py
@@ -78,6 +78,7 @@ def load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_i
     logger.info("Loading PyTorch weights from {}".format(pt_path))
 
     pt_state_dict = torch.load(pt_path, map_location='cpu')
+    logger.info("PyTorch checkpoint contains {:,} parameters".format(sum(t.numel() for t in pt_state_dict.values())))
 
     return load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=tf_inputs, allow_missing_keys=allow_missing_keys)
 
@@ -118,9 +119,6 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a
             new_key = key.replace('gamma', 'weight')
         if 'beta' in key:
             new_key = key.replace('beta', 'bias')
-        # DialoGPT format
-        if key == 'lm_head.decoder.weight':
-            new_key = 'lm_head.weight'
         if new_key:
             old_keys.append(key)
             new_keys.append(new_key)
@@ -134,7 +132,7 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a
         start_prefix_to_remove = tf_model.base_model_prefix + '.'
 
     symbolic_weights = tf_model.trainable_weights + tf_model.non_trainable_weights
-
+    tf_loaded_numel = 0
     weight_value_tuples = []
     all_pytorch_weights = set(list(pt_state_dict.keys()))
     for symbolic_weight in symbolic_weights:
@@ -142,7 +140,11 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a
         name, transpose = convert_tf_weight_name_to_pt_weight_name(sw_name, start_prefix_to_remove=start_prefix_to_remove)
 
         # Find associated numpy array in pytorch model state dict
-        assert name in pt_state_dict, "{} not found in PyTorch model".format(name)
+        if name not in pt_state_dict:
+            if allow_missing_keys:
+                continue
+            raise AttributeError("{} not found in PyTorch model".format(name))
+
         array = pt_state_dict[name].numpy()
 
         if transpose:
@@ -159,7 +161,8 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a
             e.args += (symbolic_weight.shape, array.shape)
             raise e
 
-        logger.info("Initialize TF weight {}".format(symbolic_weight.name))
+        tf_loaded_numel += array.size
+        # logger.warning("Initialize TF weight {}".format(symbolic_weight.name))
 
         weight_value_tuples.append((symbolic_weight, array))
         all_pytorch_weights.discard(name)
@@ -169,6 +172,8 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a
     if tf_inputs is not None:
         tfo = tf_model(tf_inputs, training=False)  # Make sure restore ops are run
 
+    logger.info("Loaded {:,} parameters in the TF 2.0 model.".format(tf_loaded_numel))
+
     logger.info("Weights or buffers not loaded from PyTorch model: {}".format(all_pytorch_weights))
 
     return tf_model
@@ -246,6 +251,7 @@ def load_tf2_weights_in_pytorch_model(pt_model, tf_weights, allow_missing_keys=F
 
     all_tf_weights = set(list(tf_weights_map.keys()))
     loaded_pt_weights_data_ptr = {}
+    missing_keys_pt = []
     for pt_weight_name, pt_weight in current_pt_params_dict.items():
         # Handle PyTorch shared weight ()not duplicated in TF 2.0
         if pt_weight.data_ptr() in loaded_pt_weights_data_ptr:
@@ -254,7 +260,10 @@ def load_tf2_weights_in_pytorch_model(pt_model, tf_weights, allow_missing_keys=F
 
         # Find associated numpy array in pytorch model state dict
         if pt_weight_name not in tf_weights_map:
-            raise ValueError("{} not found in TF 2.0 model".format(pt_weight_name))
+            if allow_missing_keys:
+                missing_keys_pt.append(pt_weight_name)
+                continue
+            raise AttributeError("{} not found in TF 2.0 model".format(pt_weight_name))
 
         array, transpose = tf_weights_map[pt_weight_name]
 
@@ -272,13 +281,14 @@ def load_tf2_weights_in_pytorch_model(pt_model, tf_weights, allow_missing_keys=F
             e.args += (pt_weight.shape, array.shape)
             raise e
 
-        logger.info("Initialize PyTorch weight {}".format(pt_weight_name))
+        # logger.warning("Initialize PyTorch weight {}".format(pt_weight_name))
 
         new_pt_params_dict[pt_weight_name] = torch.from_numpy(array)
         loaded_pt_weights_data_ptr[pt_weight.data_ptr()] = torch.from_numpy(array)
         all_tf_weights.discard(pt_weight_name)
 
     missing_keys, unexpected_keys = pt_model.load_state_dict(new_pt_params_dict, strict=False)
+    missing_keys += missing_keys_pt
 
     if len(missing_keys) > 0:
         logger.info("Weights of {} not initialized from TF 2.0 model: {}".format(
diff --git a/transformers/modeling_tf_roberta.py b/transformers/modeling_tf_roberta.py
index 954279f873..ad730b5804 100644
--- a/transformers/modeling_tf_roberta.py
+++ b/transformers/modeling_tf_roberta.py
@@ -199,7 +199,7 @@ class TFRobertaModel(TFRobertaPreTrainedModel):
 
         tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
         model = TFRobertaModel.from_pretrained('roberta-base')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
         outputs = model(input_ids)
         last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
@@ -276,7 +276,7 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel):
 
         tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
         model = TFRobertaForMaskedLM.from_pretrained('roberta-base')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
         outputs = model(input_ids, masked_lm_labels=input_ids)
         prediction_scores = outputs[0]
 
@@ -347,7 +347,7 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel):
 
         tokenizer = RoertaTokenizer.from_pretrained('roberta-base')
         model = TFRobertaForSequenceClassification.from_pretrained('roberta-base')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
         labels = tf.constant([1])[None, :]  # Batch size 1
         outputs = model(input_ids)
         logits = outputs[0]
diff --git a/transformers/modeling_tf_t5.py b/transformers/modeling_tf_t5.py
new file mode 100644
index 0000000000..e803e00c8d
--- /dev/null
+++ b/transformers/modeling_tf_t5.py
@@ -0,0 +1,775 @@
+# coding=utf-8
+# Copyright 2018 T5 Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" TF 2.0 T5 model. """
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import logging
+import math
+import copy
+import itertools
+
+import tensorflow as tf
+
+from .configuration_t5 import T5Config
+from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, shape_list
+from .file_utils import add_start_docstrings, DUMMY_INPUTS, DUMMY_MASK
+
+logger = logging.getLogger(__name__)
+
+TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP = {
+    't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-tf_model.h5",
+    't5-base': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-tf_model.h5",
+    't5-large': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-tf_model.h5",
+    't5-3b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-tf_model.h5",
+    't5-11b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-tf_model.h5",
+}
+
+####################################################
+# TF 2.0 Models are constructed using Keras imperative API by sub-classing
+# - tf.keras.layers.Layer for the layers and
+# - TFPreTrainedModel for the models (it-self a sub-class of tf.keras.Model)
+####################################################
+
+class TFT5LayerNorm(tf.keras.layers.Layer):
+    def __init__(self, epsilon=1e-6, **kwargs):
+        """ Construct a layernorm module in the T5 style
+            No bias and no substraction of mean.
+        """
+        super(TFT5LayerNorm, self).__init__(**kwargs)
+        self.variance_epsilon = epsilon
+
+    def build(self, input_shape):
+        """Build shared word embedding layer """
+        self.weight = self.add_weight(
+            "weight",
+            shape=(input_shape[-1],),
+            initializer='ones')
+        super(TFT5LayerNorm, self).build(input_shape)
+
+    def call(self, x):
+        variance = tf.math.reduce_mean(tf.math.square(x), axis=-1, keepdims=True)
+        x = x * tf.math.rsqrt(variance + self.variance_epsilon)
+        return self.weight * x
+
+
+class TFT5DenseReluDense(tf.keras.layers.Layer):
+    def __init__(self, config, **kwargs):
+        super(TFT5DenseReluDense, self).__init__(**kwargs)
+        self.wi = tf.keras.layers.Dense(config.d_ff, use_bias=False, name='wi')
+        self.wo = tf.keras.layers.Dense(config.d_model, use_bias=False, name='wo')
+        self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
+        self.act = tf.keras.activations.relu
+
+    def call(self, hidden_states, training=False):
+        h = self.wi(hidden_states)
+        h = self.act(h)
+        h = self.dropout(h, training=training)
+        h = self.wo(h)
+        return h
+
+
+class TFT5LayerFF(tf.keras.layers.Layer):
+    def __init__(self, config, **kwargs):
+        super(TFT5LayerFF, self).__init__(**kwargs)
+        self.DenseReluDense = TFT5DenseReluDense(config, name='DenseReluDense')
+        self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon,
+                                        name='layer_norm')
+        self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
+
+    def call(self, hidden_states, training=False):
+        norm_x = self.layer_norm(hidden_states)
+        y = self.DenseReluDense(norm_x, training=training)
+        layer_output = hidden_states + self.dropout(y, training=training)
+        return layer_output
+
+
+class TFT5Attention(tf.keras.layers.Layer):
+    NEW_ID = itertools.count()
+
+    def __init__(self, config, has_relative_attention_bias=False, **kwargs):
+        super(TFT5Attention, self).__init__(**kwargs)
+        self.layer_id = next(TFT5Attention.NEW_ID)
+        self.is_decoder = config.is_decoder
+        self.has_relative_attention_bias = has_relative_attention_bias
+
+        self.output_attentions = config.output_attentions
+        self.relative_attention_num_buckets = config.relative_attention_num_buckets
+        self.d_model = config.d_model
+        self.d_kv = config.d_kv
+        self.n_heads = config.num_heads
+        self.inner_dim = self.n_heads * self.d_kv
+
+        # Mesh TensorFlow initialization to avoid scaling before softmax
+        self.q = tf.keras.layers.Dense(self.inner_dim, use_bias=False, name='q')
+        self.k = tf.keras.layers.Dense(self.inner_dim, use_bias=False, name='k')
+        self.v = tf.keras.layers.Dense(self.inner_dim, use_bias=False, name='v')
+        self.o = tf.keras.layers.Dense(self.d_model, use_bias=False, name='o')
+        self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
+
+        if self.has_relative_attention_bias:
+            self.relative_attention_bias = tf.keras.layers.Embedding(self.relative_attention_num_buckets,
+                                                                     self.n_heads,
+                                                                     name='relative_attention_bias')
+        self.pruned_heads = set()
+
+    def prune_heads(self, heads):
+        raise NotImplementedError
+
+    @staticmethod
+    def _relative_position_bucket(relative_position,
+                                  bidirectional=True,
+                                  num_buckets=32,
+                                  max_distance=128):
+        """
+        Adapted from Mesh Tensorflow:
+        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593
+
+        Translate relative position to a bucket number for relative attention.
+        The relative position is defined as memory_position - query_position, i.e.
+        the distance in tokens from the attending position to the attended-to
+        position.  If bidirectional=False, then positive relative positions are
+        invalid.
+        We use smaller buckets for small absolute relative_position and larger buckets
+        for larger absolute relative_positions.  All relative positions >=max_distance
+        map to the same bucket.  All relative positions <=-max_distance map to the
+        same bucket.  This should allow for more graceful generalization to longer
+        sequences than the model has been trained on.
+        Args:
+            relative_position: an int32 Tensor
+            bidirectional: a boolean - whether the attention is bidirectional
+            num_buckets: an integer
+            max_distance: an integer
+        Returns:
+            a Tensor with the same shape as relative_position, containing int32
+            values in the range [0, num_buckets)
+        """
+        ret = 0
+        n = -relative_position
+        if bidirectional:
+            num_buckets //= 2
+            ret += tf.dtypes.cast(tf.math.less(n, 0), tf.int32) * num_buckets
+            n = tf.math.abs(n)
+        else:
+            n = tf.math.maximum(n, 0)
+        # now n is in the range [0, inf)
+        max_exact = num_buckets // 2
+        is_small = tf.math.less(n, max_exact)
+        val_if_large = max_exact + tf.dtypes.cast(
+            tf.math.log(tf.dtypes.cast(n, tf.float32) / max_exact)
+            / math.log(max_distance / max_exact) * (num_buckets - max_exact), tf.int32)
+        val_if_large = tf.math.minimum(val_if_large, num_buckets - 1)
+        ret += tf.where(is_small, n, val_if_large)
+        return ret
+
+    def compute_bias(self, qlen, klen):
+        """ Compute binned relative position bias """
+        context_position = tf.range(qlen)[:, None]
+        memory_position = tf.range(klen)[None, :]
+        relative_position = memory_position - context_position  # shape (qlen, klen)
+        rp_bucket = self._relative_position_bucket(relative_position,
+                                                   bidirectional=not self.is_decoder,
+                                                   num_buckets=self.relative_attention_num_buckets)
+        values = self.relative_attention_bias(rp_bucket)  # shape (qlen, klen, num_heads)
+        values = tf.expand_dims(tf.transpose(values, [2, 0, 1]), axis=0) # shape (1, num_heads, qlen, klen)
+        return values
+
+    def call(self, input, mask=None, kv=None, position_bias=None, cache=None, head_mask=None, training=False):
+        """
+        Self-attention (if kv is None) or attention over source sentence (provided by kv).
+        """
+        # Input is (bs, qlen, dim)
+        # Mask is (bs, klen) (non-causal) or (bs, klen, klen)
+        bs, qlen, dim = shape_list(input)
+        if kv is None:
+            klen = qlen if cache is None else cache['slen'] + qlen
+        else:
+            klen = shape_list(kv)[1]
+
+        def shape(x):
+            """  projection """
+            return tf.transpose(tf.reshape(x, (bs, -1, self.n_heads, self.d_kv)), perm=(0, 2, 1, 3))
+
+        def unshape(x):
+            """  compute context """
+            return tf.reshape(tf.transpose(x, perm=(0, 2, 1, 3)), (bs, -1, self.inner_dim))
+
+        q = shape(self.q(input))                                          # (bs, n_heads, qlen, dim_per_head)
+        if kv is None:
+            k = shape(self.k(input))                                      # (bs, n_heads, qlen, dim_per_head)
+            v = shape(self.v(input))                                      # (bs, n_heads, qlen, dim_per_head)
+        elif cache is None or self.layer_id not in cache:
+            k = v = kv
+            k = shape(self.k(k))                                          # (bs, n_heads, qlen, dim_per_head)
+            v = shape(self.v(v))                                          # (bs, n_heads, qlen, dim_per_head)
+
+        if cache is not None:
+            if self.layer_id in cache:
+                if kv is None:
+                    k_, v_ = cache[self.layer_id]
+                    k = tf.concat([k_, k], axis=2)                             # (bs, n_heads, klen, dim_per_head)
+                    v = tf.concat([v_, v], axis=2)                             # (bs, n_heads, klen, dim_per_head)
+                else:
+                    k, v = cache[self.layer_id]
+            cache[self.layer_id] = (k, v)
+
+        # q = q / math.sqrt(dim_per_head)                                     # No scaling in T5
+        # scores = tf.matmul(q, k, transpose_b=True)                            # (bs, n_heads, qlen, klen)
+        scores = tf.einsum('bnqd,bnkd->bnqk', q, k)                        # (bs, n_heads, qlen, klen)
+
+        if position_bias is None:
+            if not self.has_relative_attention_bias:
+                raise ValueError("No position_bias provided and no weights to compute position_bias")
+            position_bias = self.compute_bias(qlen, klen)
+            if mask is not None:
+                position_bias = position_bias + mask
+                # mask = (mask == 0).expand_as(scores)                              # (bs, n_heads, qlen, klen)
+                # scores.masked_fill_(mask, -float('inf'))                          # (bs, n_heads, qlen, klen)
+
+        scores += position_bias
+        weights = tf.nn.softmax(scores, axis=-1)                              # (bs, n_heads, qlen, klen)
+        weights = self.dropout(weights, training=training)                    # (bs, n_heads, qlen, klen)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            weights = weights * head_mask
+
+        context = tf.matmul(weights, v)                                    # (bs, n_heads, qlen, dim_per_head)
+        context = unshape(context)                                            # (bs, qlen, dim)
+
+        context = self.o(context)
+
+        outputs = (context,)
+        if self.output_attentions:
+            outputs = outputs + (weights,)
+        if self.has_relative_attention_bias:
+            outputs = outputs + (position_bias,)
+        return outputs
+
+
+class TFT5LayerSelfAttention(tf.keras.layers.Layer):
+    def __init__(self, config, has_relative_attention_bias=False, **kwargs):
+        super(TFT5LayerSelfAttention, self).__init__(**kwargs)
+        self.SelfAttention = TFT5Attention(config,
+                                           has_relative_attention_bias=has_relative_attention_bias,
+                                           name='SelfAttention')
+        self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon,
+                                        name='layer_norm')
+        self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
+
+    def call(self, hidden_states, attention_mask=None, position_bias=None,
+             head_mask=None, training=False):
+        norm_x = self.layer_norm(hidden_states)
+        attention_output = self.SelfAttention(norm_x,
+                                              mask=attention_mask,
+                                              position_bias=position_bias,
+                                              head_mask=head_mask,
+                                              training=training)
+        y = attention_output[0]
+        layer_output = hidden_states + self.dropout(y, training=training)
+        outputs = (layer_output,) + attention_output[1:]  # add attentions if we output them
+        return outputs
+
+
+class TFT5LayerCrossAttention(tf.keras.layers.Layer):
+    def __init__(self, config, has_relative_attention_bias=False, **kwargs):
+        super(TFT5LayerCrossAttention, self).__init__(**kwargs)
+        self.EncDecAttention = TFT5Attention(config,
+                                           has_relative_attention_bias=has_relative_attention_bias,
+                                           name='EncDecAttention')
+        self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon,
+                                        name='layer_norm')
+        self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
+
+    def call(self, hidden_states, kv, attention_mask=None, position_bias=None,
+             head_mask=None, training=False):
+        norm_x = self.layer_norm(hidden_states)
+        attention_output = self.EncDecAttention(norm_x,
+                                                mask=attention_mask,
+                                                kv=kv,
+                                                position_bias=position_bias,
+                                                head_mask=head_mask,
+                                                training=training)
+        y = attention_output[0]
+        layer_output = hidden_states + self.dropout(y, training=training)
+        outputs = (layer_output,) + attention_output[1:]  # add attentions if we output them
+        return outputs
+
+
+class TFT5Block(tf.keras.layers.Layer):
+    def __init__(self, config, has_relative_attention_bias=False, **kwargs):
+        super(TFT5Block, self).__init__(**kwargs)
+        self.is_decoder = config.is_decoder
+        self.layer = []
+        self.layer.append(TFT5LayerSelfAttention(config,
+                                                 has_relative_attention_bias=has_relative_attention_bias,
+                                                 name='layer_._0'))
+        if self.is_decoder:
+            self.layer.append(TFT5LayerCrossAttention(config,
+                                                      has_relative_attention_bias=has_relative_attention_bias,
+                                                      name='layer_._1'))
+            self.layer.append(TFT5LayerFF(config, name='layer_._2'))
+        else:
+            self.layer.append(TFT5LayerFF(config, name='layer_._1'))
+
+    def call(self, hidden_states, attention_mask=None, position_bias=None,
+             encoder_hidden_states=None, encoder_attention_mask=None, encoder_decoder_position_bias=None,
+             head_mask=None, training=False):
+        self_attention_outputs = self.layer[0](hidden_states,
+                                                attention_mask=attention_mask,
+                                                position_bias=position_bias,
+                                                head_mask=head_mask,
+                                                training=training)
+        hidden_states = self_attention_outputs[0]
+        outputs = self_attention_outputs[1:]
+
+        if not self.is_decoder:
+            hidden_states = self.layer[1](hidden_states, training=training)
+        else:
+            cross_attention_outputs = self.layer[1](hidden_states,
+                                                    kv=encoder_hidden_states,
+                                                    attention_mask=encoder_attention_mask,
+                                                    position_bias=encoder_decoder_position_bias,
+                                                    head_mask=head_mask,
+                                                    training=training)
+            hidden_states = cross_attention_outputs[0]
+            outputs = outputs + cross_attention_outputs[1:]
+            hidden_states = self.layer[2](hidden_states, training=training)
+
+        outputs = (hidden_states,) + outputs  # add attentions if we output them
+        return outputs # hidden-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias)
+
+
+####################################################
+# The full model without a specific pretrained or finetuning head is
+# provided as a tf.keras.layers.Layer usually called "TFT5MainLayer"
+####################################################
+class TFT5MainLayer(tf.keras.layers.Layer):
+    def __init__(self, config, **kwargs):
+        super(TFT5MainLayer, self).__init__(**kwargs)
+        self.output_attentions = config.output_attentions
+        self.output_hidden_states = config.output_hidden_states
+        self.is_decoder = config.is_decoder
+        self.config = config
+        self.num_hidden_layers = config.num_layers
+
+        self.block = [TFT5Block(config,
+                                has_relative_attention_bias=bool(i == 0),
+                                name='block_._{}'.format(i))
+                        for i in range(config.num_layers)]
+        self.final_layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon,
+                                              name='final_layer_norm')
+        self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
+
+    def _resize_token_embeddings(self, new_num_tokens):
+        raise NotImplementedError  # Not implemented yet in the library fr TF 2.0 models
+
+    def _prune_heads(self, heads_to_prune):
+        raise NotImplementedError  # Not implemented yet in the library fr TF 2.0 models
+
+    def call(self, hidden_states, attention_mask=None, encoder_hidden_states=None,
+             encoder_attention_mask=None, head_mask=None, training=False):
+
+        batch_size, seq_length = shape_list(hidden_states)[:2]
+        if attention_mask is None:
+            attention_mask = tf.fill((batch_size, seq_length), 1)
+        if self.is_decoder and encoder_attention_mask is None:
+            encoder_seq_length = encoder_hidden_states.shape[1]
+            encoder_attention_mask = tf.fill((batch_size, encoder_seq_length), 1)
+
+        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+        # ourselves in which case we just need to make it broadcastable to all heads.
+        attention_mask = tf.cast(attention_mask, dtype=tf.float32)
+        num_dims_attention_mask = len(shape_list(attention_mask))
+        if num_dims_attention_mask == 3:
+            extended_attention_mask = attention_mask[:, None, :, :]
+        elif num_dims_attention_mask == 2:
+        # Provided a padding mask of dimensions [batch_size, seq_length]
+        # - if the model is a decoder, apply a causal mask in addition to the padding mask
+        # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
+            if self.config.is_decoder:
+                seq_ids = tf.range(seq_length)
+                causal_mask = tf.less_equal(tf.tile(seq_ids[None, None, :], (batch_size, seq_length, 1)),
+                                            seq_ids[None, :, None])
+                causal_mask = tf.cast(causal_mask, dtype=tf.float32)
+                extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
+            else:
+                extended_attention_mask = attention_mask[:, None, None, :]
+
+        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+        # masked positions, this operation will create a tensor which is 0.0 for
+        # positions we want to attend and -10000.0 for masked positions.
+        # Since we are adding it to the raw scores before the softmax, this is
+        # effectively the same as removing these entirely.
+
+        # T5 has a mask that can compare sequence ids, we can simulate this here with this transposistion
+        # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270
+        # extended_attention_mask = tf.math.equal(extended_attention_mask,
+        #                                         tf.transpose(extended_attention_mask, perm=(-1, -2)))
+
+        extended_attention_mask = (1.0 - extended_attention_mask) * -1e9
+
+        if self.is_decoder:
+            # If a 2D ou 3D attention mask is provided for the cross-attention
+            # we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length]
+            encoder_attention_mask = tf.cast(encoder_attention_mask, dtype=tf.float32)
+            num_dims_encoder_attention_mask = len(shape_list(encoder_attention_mask))
+            if num_dims_encoder_attention_mask == 3:
+                encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
+            if num_dims_encoder_attention_mask == 2:
+                encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
+
+            # T5 has a mask that can compare sequence ids, we can simulate this here with this transposistion
+            # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270
+            # encoder_extended_attention_mask = tf.math.equal(encoder_extended_attention_mask,
+            #                                         tf.transpose(encoder_extended_attention_mask, perm=(-1, -2)))
+
+            encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e9
+        else:
+            encoder_extended_attention_mask = None
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        if not head_mask is None:
+            raise NotImplementedError
+        else:
+            head_mask = [None] * self.num_hidden_layers
+            # head_mask = tf.constant([0] * self.num_hidden_layers)
+
+        all_hidden_states = ()
+        all_attentions = ()
+        position_bias = None
+        encoder_decoder_position_bias = None
+        for i, layer_module in enumerate(self.block):
+            if self.output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            layer_outputs = layer_module(hidden_states,
+                                         attention_mask=extended_attention_mask,
+                                         position_bias=position_bias,
+                                         encoder_hidden_states=encoder_hidden_states,
+                                         encoder_attention_mask=encoder_extended_attention_mask,
+                                         encoder_decoder_position_bias=encoder_decoder_position_bias,
+                                         head_mask=head_mask[i],
+                                         training=training)
+            hidden_states = layer_outputs[0]
+            if i == 0:
+                # We share the position biases between the layers - the first layer store them
+                # layer_outputs = hidden-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias)
+                position_bias = layer_outputs[2 if self.output_attentions else 1]
+                if self.is_decoder:
+                    encoder_decoder_position_bias = layer_outputs[4 if self.output_attentions else 2]
+
+            if self.output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+
+        hidden_states = self.final_layer_norm(hidden_states)
+        layer_output = self.dropout(hidden_states, training=training)
+
+        # Add last layer
+        if self.output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        outputs = (hidden_states,)
+        if self.output_hidden_states:
+            outputs = outputs + (all_hidden_states,)
+        if self.output_attentions:
+            outputs = outputs + (all_attentions,)
+        return outputs  # last-layer hidden state, (all hidden states), (all attentions)
+
+
+####################################################
+# TFT5PreTrainedModel is a sub-class of tf.keras.Model
+# which take care of loading and saving pretrained weights
+# and various common utilities.
+# Here you just need to specify a few (self-explanatory)
+# pointers for your model.
+####################################################
+class TFT5PreTrainedModel(TFPreTrainedModel):
+    """ An abstract class to handle weights initialization and
+        a simple interface for dowloading and loading pretrained models.
+    """
+    config_class = T5Config
+    pretrained_model_archive_map = TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP
+    base_model_prefix = "transformer"
+
+    @property
+    def dummy_inputs(self):
+        input_ids = tf.constant(DUMMY_INPUTS)
+        input_mask = tf.constant(DUMMY_MASK)
+        dummy_inputs = {'decoder_input_ids': input_ids,
+                        'encoder_input_ids': input_ids,
+                        'decoder_attention_mask': input_mask}
+        return dummy_inputs
+
+
+T5_START_DOCSTRING = r"""    The T5 model was proposed in
+    `Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`_
+    by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu.
+    It's an encoder decoder transformer pre-trained in a text-to-text denoising generative setting.
+
+    This model is a tf.keras.Model `tf.keras.Model`_ sub-class. Use it as a regular TF 2.0 Keras Model and
+    refer to the TF 2.0 documentation for all matter related to general usage and behavior.
+
+    .. _`Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`:
+        https://arxiv.org/abs/1910.10683
+
+    .. _`tf.keras.Model`:
+        https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model
+
+    Note on the model inputs:
+        TF 2.0 models accepts two formats as inputs:
+
+            - having all inputs as keyword arguments (like PyTorch models), or
+            - having all inputs as a list, tuple or dict in the first positional arguments.
+
+        This second option is usefull when using `tf.keras.Model.fit()` method which currently requires having all the tensors in the first argument of the model call function: `model(inputs)`.
+
+        If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument :
+
+        - a single Tensor with input_ids only and nothing else: `model(inputs_ids)
+        - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+            `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+        - a dictionary with one or several input Tensors associaed to the input names given in the docstring:
+            `model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
+
+    Parameters:
+        config (:class:`~transformers.T5Config`): Model configuration class with all the parameters of the model. 
+            Initializing with a config file does not load the weights associated with the model, only the configuration.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+"""
+
+T5_INPUTS_DOCSTRING = r"""
+    Inputs:
+        **input_ids**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of input sequence tokens in the vocabulary.
+            To match pre-training, T5 input sequence should be formatted with [CLS] and [SEP] tokens as follows:
+
+            (a) For sequence pairs:
+
+                ``tokens:         [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]``
+
+            (b) For single sequences:
+
+                ``tokens:         [CLS] the dog is hairy . [SEP]``
+
+
+            T5 is a model with relative position embeddings so you should be able to pad the inputs on
+            the right or the left.
+
+            Indices can be obtained using :class:`transformers.T5Tokenizer`.
+            See :func:`transformers.PreTrainedTokenizer.encode` and
+            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+        **attention_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
+            Mask to avoid performing attention on padding token indices.
+            Mask values selected in ``[0, 1]``:
+            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
+        **head_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+            Mask to nullify selected heads of the self-attention modules.
+            Mask values selected in ``[0, 1]``:
+            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+"""
+
+@add_start_docstrings("The bare T5 Model transformer outputting raw hidden-states"
+                      "without any specific head on top.",
+                      T5_START_DOCSTRING, T5_INPUTS_DOCSTRING)
+class TFT5Model(TFT5PreTrainedModel):
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)``
+            Sequence of hidden-states at the output of the last layer of the model.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        import tensorflow as tf
+        from transformers import T5Tokenizer, TFT5Model
+
+        tokenizer = T5Tokenizer.from_pretrained('t5-small')
+        model = TFT5Model.from_pretrained('t5-small')
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        outputs = model(input_ids=input_ids)
+        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+
+    """
+    def __init__(self, config, *inputs, **kwargs):
+        super(TFT5Model, self).__init__(config, *inputs, **kwargs)
+        self.shared = TFSharedEmbeddings(config.vocab_size, config.d_model,
+                                         name='shared')
+
+        encoder_config = copy.deepcopy(config)
+        self.encoder = TFT5MainLayer(encoder_config, name='encoder')
+
+        decoder_config = copy.deepcopy(config)
+        decoder_config.is_decoder = True
+        self.decoder = TFT5MainLayer(decoder_config, name='decoder')
+
+    def get_input_embeddings(self):
+        return self.shared
+
+    def get_output_embeddings(self):
+        return self.shared
+
+    def call(self, decoder_input_ids, **kwargs):
+        # We allow two types of multi-inputs:
+        # - traditional keyword arguments in the call method
+        # - all the arguments provided as a dict in the first positional argument of call
+        # The last option is useful to use the tf.keras fit() method.
+
+        if isinstance(decoder_input_ids, dict):
+            kwargs.update(decoder_input_ids)
+        else:
+            kwargs['decoder_input_ids'] = decoder_input_ids
+
+        kwargs_common = dict((k, v) for k, v in kwargs.items()
+                             if not k.startswith("encoder_") and not k.startswith("decoder_"))
+        kwargs_encoder = kwargs_common.copy()
+        kwargs_decoder = kwargs_common.copy()
+        kwargs_encoder.update(dict((k[len("encoder_"):], v) for k, v in kwargs.items() if k.startswith("encoder_")))
+        kwargs_decoder.update(dict((k[len("decoder_"):], v) for k, v in kwargs.items() if k.startswith("decoder_")))
+
+        # Encode if needed (training, first prediction pass)
+        encoder_hidden_states = kwargs_encoder.pop("hidden_states", None)
+        if encoder_hidden_states is None:
+            # Convert encoder inputs in embeddings if needed
+            hidden_states = kwargs_encoder.pop("inputs_embeds", None)
+            if hidden_states is None:
+                encoder_inputs_ids = kwargs_encoder.pop("input_ids")
+                hidden_states = self.shared(encoder_inputs_ids)  # Convert inputs in embeddings
+
+            encoder_outputs = self.encoder(hidden_states, **kwargs_encoder)
+            encoder_hidden_states = encoder_outputs[0]
+        else:
+            encoder_outputs = ()
+
+        # Decode
+        # Convert decoder inputs in embeddings if needed
+        hidden_states = kwargs_decoder.pop("inputs_embeds", None)
+        if hidden_states is None:
+            decoder_inputs_ids = kwargs_decoder.pop("input_ids")
+            hidden_states = self.shared(decoder_inputs_ids)
+
+        kwargs_decoder["encoder_hidden_states"] = encoder_hidden_states
+        kwargs_decoder["encoder_attention_mask"] = kwargs_encoder.get("attention_mask", None)
+        decoder_outputs = self.decoder(hidden_states, **kwargs_decoder)
+
+        return decoder_outputs + encoder_outputs
+
+
+@add_start_docstrings("""T5 Model with a `language modeling` head on top. """,
+    T5_START_DOCSTRING, T5_INPUTS_DOCSTRING)
+class TFT5WithLMHeadModel(TFT5PreTrainedModel):
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **prediction_scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        import tensorflow as tf
+        from transformers import T5Tokenizer, TFT5WithLMHeadModel
+
+        tokenizer = T5Tokenizer.from_pretrained('t5-small')
+        model = TFT5WithLMHeadModel.from_pretrained('t5-small')
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        outputs = model(input_ids=input_ids)
+        prediction_scores = outputs[0]
+
+    """
+    def __init__(self, config, *inputs, **kwargs):
+        super(TFT5WithLMHeadModel, self).__init__(config, *inputs, **kwargs)
+        self.model_dim = config.d_model
+
+        self.shared = TFSharedEmbeddings(config.vocab_size, config.d_model,
+                                         name='shared')
+
+        encoder_config = copy.deepcopy(config)
+        self.encoder = TFT5MainLayer(encoder_config, name='encoder')
+
+        decoder_config = copy.deepcopy(config)
+        decoder_config.is_decoder = True
+        self.decoder = TFT5MainLayer(decoder_config, name='decoder')
+
+    def get_input_embeddings(self):
+        return self.shared
+
+    def get_output_embeddings(self):
+        return self.shared
+
+    def call(self, decoder_input_ids, **kwargs):
+        # We allow two types of multi-inputs:
+        # - traditional keyword arguments in the call method
+        # - all the arguments provided as a dict in the first positional argument of call
+        # The last option is useful to use the tf.keras fit() method.
+
+        if isinstance(decoder_input_ids, dict):
+            kwargs.update(decoder_input_ids)
+        else:
+            kwargs['decoder_input_ids'] = decoder_input_ids
+
+        kwargs_common = dict((k, v) for k, v in kwargs.items()
+                             if not k.startswith("encoder_") and not k.startswith("decoder_"))
+        kwargs_encoder = kwargs_common.copy()
+        kwargs_decoder = kwargs_common.copy()
+        kwargs_encoder.update(dict((k[len("encoder_"):], v) for k, v in kwargs.items() if k.startswith("encoder_")))
+        kwargs_decoder.update(dict((k[len("decoder_"):], v) for k, v in kwargs.items() if k.startswith("decoder_")))
+
+        # Encode if needed (training, first prediction pass)
+        encoder_hidden_states = kwargs_encoder.pop("hidden_states", None)
+        if encoder_hidden_states is None:
+            # Convert encoder inputs in embeddings if needed
+            hidden_states = kwargs_encoder.pop("inputs_embeds", None)
+            if hidden_states is None:
+                encoder_inputs_ids = kwargs_encoder.pop("input_ids")
+                hidden_states = self.shared(encoder_inputs_ids)  # Convert inputs in embeddings
+
+            encoder_outputs = self.encoder(hidden_states, **kwargs_encoder)
+            encoder_hidden_states = encoder_outputs[0]
+        else:
+            encoder_outputs = ()
+
+        # Decode
+        # Convert decoder inputs in embeddings if needed
+        hidden_states = kwargs_decoder.pop("inputs_embeds", None)
+        if hidden_states is None:
+            decoder_inputs_ids = kwargs_decoder.pop("input_ids")
+            hidden_states = self.shared(decoder_inputs_ids)
+
+        kwargs_decoder["encoder_hidden_states"] = encoder_hidden_states
+        kwargs_decoder["encoder_attention_mask"] = kwargs_encoder.get("attention_mask", None)
+        decoder_outputs = self.decoder(hidden_states, **kwargs_decoder)
+
+        sequence_output = decoder_outputs[0] * (self.model_dim ** -0.5)
+        lm_logits = self.shared(sequence_output, mode="linear")
+        decoder_outputs = (lm_logits,) + decoder_outputs[1:]
+
+        return decoder_outputs + encoder_outputs
diff --git a/transformers/modeling_tf_transfo_xl.py b/transformers/modeling_tf_transfo_xl.py
index fd325e218e..08bbe74032 100644
--- a/transformers/modeling_tf_transfo_xl.py
+++ b/transformers/modeling_tf_transfo_xl.py
@@ -353,7 +353,7 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
         self.output_attentions = config.output_attentions
         self.output_hidden_states = config.output_hidden_states
 
-        self.n_token = config.n_token
+        self.n_token = config.vocab_size
 
         self.d_embed = config.d_embed
         self.d_model = config.d_model
@@ -361,7 +361,7 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
         self.d_head = config.d_head
         self.untie_r = config.untie_r
 
-        self.word_emb = TFAdaptiveEmbedding(config.n_token, config.d_embed, config.d_model, config.cutoffs, 
+        self.word_emb = TFAdaptiveEmbedding(config.vocab_size, config.d_embed, config.d_model, config.cutoffs, 
                                             div_val=config.div_val, init_std=config.init_std, name='word_emb')
 
         self.drop = tf.keras.layers.Dropout(config.dropout)
@@ -673,7 +673,7 @@ class TFTransfoXLModel(TFTransfoXLPreTrainedModel):
 
         tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
         model = TFTransfoXLModel.from_pretrained('transfo-xl-wt103')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
         outputs = model(input_ids)
         last_hidden_states, mems = outputs[:2]
 
@@ -715,7 +715,7 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
 
         tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
         model = TFTransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
         outputs = model(input_ids)
         prediction_scores, mems = outputs[:2]
 
@@ -729,7 +729,7 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
             raise NotImplementedError
         # use adaptive softmax (including standard softmax)
         else:
-            self.crit = TFAdaptiveSoftmaxMask(config.n_token, config.d_embed, config.d_model, 
+            self.crit = TFAdaptiveSoftmaxMask(config.vocab_size, config.d_embed, config.d_model, 
                                               config.cutoffs, div_val=config.div_val, name='crit')
 
     def reset_length(self, tgt_len, ext_len, mem_len):
diff --git a/transformers/modeling_tf_transfo_xl_utilities.py b/transformers/modeling_tf_transfo_xl_utilities.py
index e6a6dfe686..f730af851f 100644
--- a/transformers/modeling_tf_transfo_xl_utilities.py
+++ b/transformers/modeling_tf_transfo_xl_utilities.py
@@ -25,15 +25,15 @@ import tensorflow as tf
 from .modeling_tf_utils import shape_list
 
 class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
-    def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1,
+    def __init__(self, vocab_size, d_embed, d_proj, cutoffs, div_val=1,
                  keep_order=False, **kwargs):
         super(TFAdaptiveSoftmaxMask, self).__init__(**kwargs)
 
-        self.n_token = n_token
+        self.vocab_size = vocab_size
         self.d_embed = d_embed
         self.d_proj = d_proj
 
-        self.cutoffs = cutoffs + [n_token]
+        self.cutoffs = cutoffs + [vocab_size]
         self.cutoff_ends = [0] + self.cutoffs
         self.div_val = div_val
 
@@ -66,11 +66,11 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
                     self.out_projs.append(weight)
                 else:
                     self.out_projs.append(None)
-                weight = self.add_weight(shape=(self.n_token, self.d_embed,),
+                weight = self.add_weight(shape=(self.vocab_size, self.d_embed,),
                                          initializer='zeros',
                                          trainable=True,
                                          name='out_layers_._{}_._weight'.format(i))
-                bias = self.add_weight(shape=(self.n_token,),
+                bias = self.add_weight(shape=(self.vocab_size,),
                                          initializer='zeros',
                                          trainable=True,
                                          name='out_layers_._{}_._bias'.format(i))
@@ -114,7 +114,7 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
         hidden, target = inputs
         head_logprob = 0
         if self.n_clusters == 0:
-            softmax_b = tf.get_variable('bias', [n_token], initializer=tf.zeros_initializer())
+            softmax_b = tf.get_variable('bias', [self.config.vocab_size], initializer=tf.zeros_initializer())
             output = self._logit(hidden, self.out_layers[0][0], self.out_layers[0][1], self.out_projs[0])
             if target is not None:
                 loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target, logits=output)
diff --git a/transformers/modeling_tf_utils.py b/transformers/modeling_tf_utils.py
index ed8fdb74c9..0aa65a9f17 100644
--- a/transformers/modeling_tf_utils.py
+++ b/transformers/modeling_tf_utils.py
@@ -22,15 +22,16 @@ import logging
 import os
 
 import tensorflow as tf
+from tensorflow.python.keras.saving import hdf5_format
+import h5py
 
 from .configuration_utils import PretrainedConfig
-from .file_utils import cached_path, WEIGHTS_NAME, TF_WEIGHTS_NAME, TF2_WEIGHTS_NAME
+from .file_utils import (TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, WEIGHTS_NAME, DUMMY_INPUTS,
+                         cached_path, hf_bucket_url, is_remote_url)
 from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model
 
 logger = logging.getLogger(__name__)
 
-DUMMY_INPUTS = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
-
 class TFPreTrainedModel(tf.keras.Model):
     r""" Base class for all TF models.
 
@@ -59,7 +60,7 @@ class TFPreTrainedModel(tf.keras.Model):
         Returns:
             tf.Tensor with dummy inputs
         """
-        return tf.constant(DUMMY_INPUTS)
+        return {'input_ids': tf.constant(DUMMY_INPUTS)}
 
     def __init__(self, config, *inputs, **kwargs):
         super(TFPreTrainedModel, self).__init__(*inputs, **kwargs)
@@ -176,13 +177,16 @@ class TFPreTrainedModel(tf.keras.Model):
             pretrained_model_name_or_path: either:
 
                 - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                 - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                 - a path or url to a `PyTorch state_dict save file` (e.g. `./pt_model/pytorch_model.bin`). In this case, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the PyTorch checkpoint in a TensorFlow model using the provided conversion scripts and loading the TensorFlow model afterwards.
 
             model_args: (`optional`) Sequence of positional arguments:
                 All remaning positional arguments will be passed to the underlying model's ``__init__`` method
 
-            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
+            config: (`optional`) one of:
+                    - an instance of a class derived from :class:`~transformers.PretrainedConfig`, or
+                    - a string valid as input to :func:`~transformers.PretrainedConfig.from_pretrained()`
                 Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
 
                 - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
@@ -206,6 +210,9 @@ class TFPreTrainedModel(tf.keras.Model):
                 A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
                 The proxies are used on each request.
 
+            output_loading_info: (`optional`) boolean:
+                Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages.
+
             kwargs: (`optional`) Remaining dictionary of keyword arguments:
                 Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
 
@@ -229,11 +236,13 @@ class TFPreTrainedModel(tf.keras.Model):
         force_download = kwargs.pop('force_download', False)
         resume_download = kwargs.pop('resume_download', False)
         proxies = kwargs.pop('proxies', None)
+        output_loading_info = kwargs.pop('output_loading_info', False)
 
-        # Load config
-        if config is None:
+        # Load config if we don't provide a configuration
+        if not isinstance(config, PretrainedConfig):
+            config_path = config if config is not None else pretrained_model_name_or_path
             config, model_kwargs = cls.config_class.from_pretrained(
-                pretrained_model_name_or_path, *model_args,
+                config_path, *model_args,
                 cache_dir=cache_dir, return_unused_kwargs=True,
                 force_download=force_download,
                 resume_download=resume_download,
@@ -257,10 +266,14 @@ class TFPreTrainedModel(tf.keras.Model):
                     raise EnvironmentError("Error no file named {} found in directory {} or `from_pt` set to False".format(
                         [WEIGHTS_NAME, TF2_WEIGHTS_NAME],
                         pretrained_model_name_or_path))
-            elif os.path.isfile(pretrained_model_name_or_path):
+            elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
                 archive_file = pretrained_model_name_or_path
+            elif os.path.isfile(pretrained_model_name_or_path + ".index"):
+                archive_file = pretrained_model_name_or_path + ".index"
             else:
-                raise EnvironmentError("Error file {} not found".format(pretrained_model_name_or_path))
+                archive_file = hf_bucket_url(pretrained_model_name_or_path, postfix=TF2_WEIGHTS_NAME)
+                if from_pt:
+                    raise EnvironmentError("Loading a TF model from a PyTorch checkpoint is not supported when using a model identifier name.")
 
             # redirect to the cache, if necessary
             try:
@@ -293,17 +306,46 @@ class TFPreTrainedModel(tf.keras.Model):
 
         if from_pt:
             # Load from a PyTorch checkpoint
-            return load_pytorch_checkpoint_in_tf2_model(model, resolved_archive_file)
+            return load_pytorch_checkpoint_in_tf2_model(model, resolved_archive_file, allow_missing_keys=True)
 
         ret = model(model.dummy_inputs, training=False)  # build the network with dummy inputs
 
         assert os.path.isfile(resolved_archive_file), "Error retrieving file {}".format(resolved_archive_file)
         # 'by_name' allow us to do transfer learning by skipping/adding layers
         # see https://github.com/tensorflow/tensorflow/blob/00fad90125b18b80fe054de1055770cfb8fe4ba3/tensorflow/python/keras/engine/network.py#L1339-L1357
-        model.load_weights(resolved_archive_file, by_name=True)
+        try:
+            model.load_weights(resolved_archive_file, by_name=True)
+        except OSError:
+            raise OSError("Unable to load weights from h5 file. "
+                          "If you tried to load a TF 2.0 model from a PyTorch checkpoint, please set from_pt=True. ")
 
         ret = model(model.dummy_inputs, training=False)  # Make sure restore ops are run
 
+        # Check if the models are the same to output loading informations
+        with h5py.File(resolved_archive_file, 'r') as f:
+            if 'layer_names' not in f.attrs and 'model_weights' in f:
+                f = f['model_weights']
+            hdf5_layer_names = set(hdf5_format.load_attributes_from_hdf5_group(f, 'layer_names'))
+        model_layer_names = set(layer.name for layer in model.layers)
+        missing_keys = list(model_layer_names - hdf5_layer_names)
+        unexpected_keys = list(hdf5_layer_names - model_layer_names)
+        error_msgs = []
+
+        if len(missing_keys) > 0:
+            logger.info("Layers of {} not initialized from pretrained model: {}".format(
+                model.__class__.__name__, missing_keys))
+        if len(unexpected_keys) > 0:
+            logger.info("Layers from pretrained model not used in {}: {}".format(
+                model.__class__.__name__, unexpected_keys))
+        if len(error_msgs) > 0:
+            raise RuntimeError('Error(s) in loading weights for {}:\n\t{}'.format(
+                            model.__class__.__name__, "\n\t".join(error_msgs)))
+        if output_loading_info:
+            loading_info = {"missing_keys": missing_keys,
+                            "unexpected_keys": unexpected_keys,
+                            "error_msgs": error_msgs}
+            return model, loading_info
+
         return model
 
 class TFConv1D(tf.keras.layers.Layer):
diff --git a/transformers/modeling_tf_xlm.py b/transformers/modeling_tf_xlm.py
index 6f11b0537d..a7cc8ea481 100644
--- a/transformers/modeling_tf_xlm.py
+++ b/transformers/modeling_tf_xlm.py
@@ -460,7 +460,7 @@ class TFXLMPreTrainedModel(TFPreTrainedModel):
             langs_list = tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
         else:
             langs_list = None
-        return [inputs_list, attns_list, langs_list]
+        return {'input_ids': inputs_list, 'attention_mask': attns_list, 'langs': langs_list}
 
 
 XLM_START_DOCSTRING = r"""    The XLM model was proposed in
@@ -576,7 +576,7 @@ class TFXLMModel(TFXLMPreTrainedModel):
 
         tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
         model = TFXLMModel.from_pretrained('xlm-mlm-en-2048')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
         outputs = model(input_ids)
         last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
@@ -649,7 +649,7 @@ class TFXLMWithLMHeadModel(TFXLMPreTrainedModel):
 
         tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
         model = TFXLMWithLMHeadModel.from_pretrained('xlm-mlm-en-2048')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
         outputs = model(input_ids)
         last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
@@ -695,7 +695,7 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel):
 
         tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
         model = TFXLMForSequenceClassification.from_pretrained('xlm-mlm-en-2048')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
         labels = tf.constant([1])[None, :]  # Batch size 1
         outputs = model(input_ids)
         logits = outputs[0]
@@ -743,7 +743,7 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel):
 
         tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
         model = TFXLMForQuestionAnsweringSimple.from_pretrained('xlm-mlm-en-2048')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
         outputs = model(input_ids)
         start_scores, end_scores = outputs[:2]
 
diff --git a/transformers/modeling_tf_xlnet.py b/transformers/modeling_tf_xlnet.py
index 759b57d835..2f1fe150c6 100644
--- a/transformers/modeling_tf_xlnet.py
+++ b/transformers/modeling_tf_xlnet.py
@@ -366,7 +366,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
         self.use_bfloat16 = config.use_bfloat16
         self.initializer_range = config.initializer_range
 
-        self.word_embedding = TFSharedEmbeddings(config.n_token, config.d_model, initializer_range=config.initializer_range, name='word_embedding')
+        self.word_embedding = TFSharedEmbeddings(config.vocab_size, config.d_model, initializer_range=config.initializer_range, name='word_embedding')
         self.layer = [TFXLNetLayer(config, name='layer_._{}'.format(i)) for i in range(config.n_layer)]
         self.dropout = tf.keras.layers.Dropout(config.dropout)
 
@@ -552,7 +552,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
         assert input_mask is None or attention_mask is None, "You can only use one of input_mask (uses 1 for padding) " \
             "or attention_mask (uses 0 for padding, added for compatbility with BERT). Please choose one."
         if input_mask is None and attention_mask is not None:
-            input_mask = 1.0 - attention_mask
+            input_mask = 1.0 - tf.cast(attention_mask, dtype=dtype_float)
         if input_mask is not None and perm_mask is not None:
             data_mask = input_mask[None] + perm_mask
         elif input_mask is not None and perm_mask is None:
@@ -811,7 +811,7 @@ class TFXLNetModel(TFXLNetPreTrainedModel):
 
         tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
         model = TFXLNetModel.from_pretrained('xlnet-large-cased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
         outputs = model(input_ids)
         last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
@@ -855,7 +855,7 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel):
         model = TFXLNetLMHeadModel.from_pretrained('xlnet-large-cased')
 
         # We show how to setup inputs to predict a next token using a bi-directional context.
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is very <mask>"))[None, :]  # We will predict the masked token
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=True))[None, :]  # We will predict the masked token
         perm_mask = tf.zeros((1, input_ids.shape[1], input_ids.shape[1]))
         perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
         target_mapping = tf.zeros((1, 1, input_ids.shape[1]))  # Shape [1, 1, seq_length] => let's predict one token
@@ -911,7 +911,7 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel):
 
         tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
         model = TFXLNetForSequenceClassification.from_pretrained('xlnet-large-cased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
         outputs = model(input_ids)
         logits = outputs[0]
 
@@ -1022,7 +1022,7 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel):
 
         tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
         model = TFXLNetForQuestionAnsweringSimple.from_pretrained('xlnet-base-cased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
         outputs = model(input_ids)
         start_scores, end_scores = outputs[:2]
 
@@ -1086,7 +1086,7 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel):
 
 #         tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
 #         model = XLMForQuestionAnswering.from_pretrained('xlnet-large-cased')
-#         input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+#         input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
 #         start_positions = tf.constant([1])
 #         end_positions = tf.constant([3])
 #         outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
diff --git a/transformers/modeling_transfo_xl.py b/transformers/modeling_transfo_xl.py
index 473d07f733..488a02655e 100644
--- a/transformers/modeling_transfo_xl.py
+++ b/transformers/modeling_transfo_xl.py
@@ -582,7 +582,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
 
         tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
         model = TransfoXLModel.from_pretrained('transfo-xl-wt103')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
         outputs = model(input_ids)
         last_hidden_states, mems = outputs[:2]
 
@@ -592,14 +592,14 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
         self.output_attentions = config.output_attentions
         self.output_hidden_states = config.output_hidden_states
 
-        self.n_token = config.n_token
+        self.n_token = config.vocab_size
 
         self.d_embed = config.d_embed
         self.d_model = config.d_model
         self.n_head = config.n_head
         self.d_head = config.d_head
 
-        self.word_emb = AdaptiveEmbedding(config.n_token, config.d_embed, config.d_model, config.cutoffs,
+        self.word_emb = AdaptiveEmbedding(config.vocab_size, config.d_embed, config.d_model, config.cutoffs,
                                           div_val=config.div_val)
 
         self.drop = nn.Dropout(config.dropout)
@@ -825,7 +825,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
 
         tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
         model = TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
         outputs = model(input_ids)
         prediction_scores, mems = outputs[:2]
 
@@ -836,11 +836,11 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
         self.sample_softmax = config.sample_softmax
         # use sampled softmax
         if config.sample_softmax > 0:
-            self.out_layer = nn.Linear(config.d_model, config.n_token)
-            self.sampler = LogUniformSampler(config.n_token, config.sample_softmax)
+            self.out_layer = nn.Linear(config.d_model, config.vocab_size)
+            self.sampler = LogUniformSampler(config.vocab_size, config.sample_softmax)
         # use adaptive softmax (including standard softmax)
         else:
-            self.crit = ProjectedAdaptiveLogSoftmax(config.n_token, config.d_embed, config.d_model,
+            self.crit = ProjectedAdaptiveLogSoftmax(config.vocab_size, config.d_embed, config.d_model,
                                                     config.cutoffs, div_val=config.div_val)
         self.init_weights()
 
diff --git a/transformers/modeling_utils.py b/transformers/modeling_utils.py
index 005252c141..25124f1fda 100644
--- a/transformers/modeling_utils.py
+++ b/transformers/modeling_utils.py
@@ -31,11 +31,11 @@ from torch.nn import CrossEntropyLoss
 from torch.nn import functional as F
 
 from .configuration_utils import PretrainedConfig
-from .file_utils import cached_path, WEIGHTS_NAME, TF_WEIGHTS_NAME, TF2_WEIGHTS_NAME
+from .file_utils import (TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, WEIGHTS_NAME, DUMMY_INPUTS,
+                         cached_path, hf_bucket_url, is_remote_url)
 
 logger = logging.getLogger(__name__)
 
-
 try:
     from torch.nn import Identity
 except ImportError:
@@ -71,6 +71,15 @@ class PreTrainedModel(nn.Module):
     load_tf_weights = lambda model, config, path: None
     base_model_prefix = ""
 
+    @property
+    def dummy_inputs(self):
+        """ Dummy inputs to do a forward pass in the network.
+
+        Returns:
+            torch.Tensor with dummy inputs
+        """
+        return {'input_ids': torch.tensor(DUMMY_INPUTS)}
+
     def __init__(self, config, *inputs, **kwargs):
         super(PreTrainedModel, self).__init__()
         if not isinstance(config, PretrainedConfig):
@@ -160,8 +169,7 @@ class PreTrainedModel(nn.Module):
         base_model.vocab_size = new_num_tokens
 
         # Tie weights again if needed
-        if hasattr(self, 'tie_weights'):
-            self.tie_weights()
+        self.tie_weights()
 
         return model_embeds
 
@@ -265,6 +273,7 @@ class PreTrainedModel(nn.Module):
             pretrained_model_name_or_path: either:
 
                 - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                 - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                 - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
                 - None if you are both providing the configuration and state dictionary (resp. with keyword arguments ``config`` and ``state_dict``)
@@ -272,7 +281,9 @@ class PreTrainedModel(nn.Module):
             model_args: (`optional`) Sequence of positional arguments:
                 All remaning positional arguments will be passed to the underlying model's ``__init__`` method
 
-            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
+            config: (`optional`) one of:
+                    - an instance of a class derived from :class:`~transformers.PretrainedConfig`, or
+                    - a string valid as input to :func:`~transformers.PretrainedConfig.from_pretrained()`
                 Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
 
                 - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
@@ -318,10 +329,6 @@ class PreTrainedModel(nn.Module):
             model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config)
 
         """
-        if "albert" in pretrained_model_name_or_path and "v2" in pretrained_model_name_or_path:
-            logger.warning("There is currently an upstream reproducibility issue with ALBERT v2 models. Please see " +
-                           "https://github.com/google-research/google-research/issues/119 for more information.")
-
         config = kwargs.pop('config', None)
         state_dict = kwargs.pop('state_dict', None)
         cache_dir = kwargs.pop('cache_dir', None)
@@ -331,10 +338,11 @@ class PreTrainedModel(nn.Module):
         proxies = kwargs.pop('proxies', None)
         output_loading_info = kwargs.pop('output_loading_info', False)
 
-        # Load config
-        if config is None:
+        # Load config if we don't provide a configuration
+        if not isinstance(config, PretrainedConfig):
+            config_path = config if config is not None else pretrained_model_name_or_path
             config, model_kwargs = cls.config_class.from_pretrained(
-                pretrained_model_name_or_path, *model_args,
+                config_path, *model_args,
                 cache_dir=cache_dir, return_unused_kwargs=True,
                 force_download=force_download,
                 resume_download=resume_download,
@@ -362,11 +370,16 @@ class PreTrainedModel(nn.Module):
                     raise EnvironmentError("Error no file named {} found in directory {} or `from_tf` set to False".format(
                         [WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME + ".index"],
                         pretrained_model_name_or_path))
-            elif os.path.isfile(pretrained_model_name_or_path):
+            elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
                 archive_file = pretrained_model_name_or_path
-            else:
-                assert from_tf, "Error finding file {}, no file or TF 1.X checkpoint found".format(pretrained_model_name_or_path)
+            elif os.path.isfile(pretrained_model_name_or_path + ".index"):
+                assert from_tf, "We found a TensorFlow checkpoint at {}, please set from_tf to True to load from this checkpoint".format(
+                    pretrained_model_name_or_path + ".index")
                 archive_file = pretrained_model_name_or_path + ".index"
+            else:
+                archive_file = hf_bucket_url(pretrained_model_name_or_path, postfix=WEIGHTS_NAME)
+                if from_tf:
+                    raise EnvironmentError("Loading a PyTorch model from a TF checkpoint is not supported when using a model identifier name.")
 
             # redirect to the cache, if necessary
             try:
@@ -398,7 +411,11 @@ class PreTrainedModel(nn.Module):
         model = cls(config, *model_args, **model_kwargs)
 
         if state_dict is None and not from_tf:
-            state_dict = torch.load(resolved_archive_file, map_location='cpu')
+            try:
+                state_dict = torch.load(resolved_archive_file, map_location='cpu')
+            except:
+                raise OSError("Unable to load weights from pytorch checkpoint file. "
+                              "If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True. ")
 
         missing_keys = []
         unexpected_keys = []
@@ -427,8 +444,6 @@ class PreTrainedModel(nn.Module):
                     new_key = key.replace('gamma', 'weight')
                 if 'beta' in key:
                     new_key = key.replace('beta', 'bias')
-                if key == 'lm_head.decoder.weight':
-                    new_key = 'lm_head.weight'
                 if new_key:
                     old_keys.append(key)
                     new_keys.append(new_key)
@@ -470,8 +485,7 @@ class PreTrainedModel(nn.Module):
                 raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
                                 model.__class__.__name__, "\n\t".join(error_msgs)))
 
-        if hasattr(model, 'tie_weights'):
-            model.tie_weights()  # make sure word embedding weights are still tied
+        model.tie_weights()  # make sure word embedding weights are still tied if needed
 
         # Set model in evaluation mode to desactivate DropOut modules by default
         model.eval()
diff --git a/transformers/modeling_xlm.py b/transformers/modeling_xlm.py
index 35bada92af..40968ebe78 100644
--- a/transformers/modeling_xlm.py
+++ b/transformers/modeling_xlm.py
@@ -227,6 +227,16 @@ class XLMPreTrainedModel(PreTrainedModel):
     def __init__(self, *inputs, **kwargs):
         super(XLMPreTrainedModel, self).__init__(*inputs, **kwargs)
 
+    @property
+    def dummy_inputs(self):
+        inputs_list = torch.tensor([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]])
+        attns_list = torch.tensor([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
+        if self.config.use_lang_emb and self.config.n_langs > 1:
+            langs_list = torch.tensor([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
+        else:
+            langs_list = None
+        return {'input_ids': inputs_list, 'attention_mask': attns_list, 'langs': langs_list}
+
     def _init_weights(self, module):
         """ Initialize the weights. """
         if isinstance(module, nn.Embedding):
@@ -336,7 +346,7 @@ class XLMModel(XLMPreTrainedModel):
 
         tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
         model = XLMModel.from_pretrained('xlm-mlm-en-2048')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
         outputs = model(input_ids)
         last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
@@ -624,7 +634,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
 
         tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
         model = XLMWithLMHeadModel.from_pretrained('xlm-mlm-en-2048')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
         outputs = model(input_ids)
         last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
@@ -698,7 +708,7 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
 
         tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
         model = XLMForSequenceClassification.from_pretrained('xlm-mlm-en-2048')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
         labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
         outputs = model(input_ids, labels=labels)
         loss, logits = outputs[:2]
@@ -782,7 +792,7 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
 
         tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
         model = XLMForQuestionAnsweringSimple.from_pretrained('xlm-mlm-en-2048')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
         start_positions = torch.tensor([1])
         end_positions = torch.tensor([3])
         outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
@@ -878,7 +888,7 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
 
         tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
         model = XLMForQuestionAnswering.from_pretrained('xlm-mlm-en-2048')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
         start_positions = torch.tensor([1])
         end_positions = torch.tensor([3])
         outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
diff --git a/transformers/modeling_xlm_roberta.py b/transformers/modeling_xlm_roberta.py
new file mode 100644
index 0000000000..0bdce941a5
--- /dev/null
+++ b/transformers/modeling_xlm_roberta.py
@@ -0,0 +1,298 @@
+# coding=utf-8
+# Copyright 2019 Facebook AI Research and the HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch XLM-RoBERTa model. """
+
+from __future__ import (absolute_import, division, print_function,
+                        unicode_literals)
+
+import logging
+
+from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification, RobertaForMultipleChoice, RobertaForTokenClassification
+from .configuration_xlm_roberta import XLMRobertaConfig
+from .file_utils import add_start_docstrings
+
+logger = logging.getLogger(__name__)
+
+XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = {
+    'xlm-roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-pytorch_model.bin",
+    'xlm-roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-pytorch_model.bin",
+    'xlm-roberta-large-finetuned-conll02-dutch': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-dutch-pytorch_model.bin",
+    'xlm-roberta-large-finetuned-conll02-spanish': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-spanish-pytorch_model.bin",
+    'xlm-roberta-large-finetuned-conll03-english': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-english-pytorch_model.bin",
+    'xlm-roberta-large-finetuned-conll03-german': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-german-pytorch_model.bin",
+}
+
+
+XLM_ROBERTA_START_DOCSTRING = r"""    The XLM-RoBERTa model was proposed in
+    `Unsupervised Cross-lingual Representation Learning at Scale`_
+    by Alexis Conneau, Kartikay Khandelwal, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. It is based on Facebook's RoBERTa model released in 2019.
+    
+    It is a large multi-lingual language model, trained on 2.5TB of filtered CommonCrawl data.
+
+    This implementation is the same as RoBERTa.
+
+    This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and
+    refer to the PyTorch documentation for all matter related to general usage and behavior.
+
+    .. _`Unsupervised Cross-lingual Representation Learning at Scale`:
+        https://arxiv.org/abs/1911.02116
+
+    .. _`torch.nn.Module`:
+        https://pytorch.org/docs/stable/nn.html#module
+
+    Parameters:
+        config (:class:`~transformers.XLMRobertaConfig`): Model configuration class with all the parameters of the 
+            model. Initializing with a config file does not load the weights associated with the model, only the configuration.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+"""
+
+XLM_ROBERTA_INPUTS_DOCSTRING = r"""
+    Inputs:
+        **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of input sequence tokens in the vocabulary.
+            To match pre-training, XLM-RoBERTa input sequence should be formatted with <s> and </s> tokens as follows:
+
+            (a) For sequence pairs:
+
+                ``tokens:         <s> Is this Jacksonville ? </s> </s> No it is not . </s>``
+
+            (b) For single sequences:
+
+                ``tokens:         <s> the dog is hairy . </s>``
+
+            Fully encoded sequences or sequence pairs can be obtained using the XLMRobertaTokenizer.encode function with 
+            the ``add_special_tokens`` parameter set to ``True``.
+
+            XLM-RoBERTa is a model with absolute position embeddings so it's usually advised to pad the inputs on
+            the right rather than the left.
+
+            See :func:`transformers.PreTrainedTokenizer.encode` and
+            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
+            Mask to avoid performing attention on padding token indices.
+            Mask values selected in ``[0, 1]``:
+            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
+        **token_type_ids**: (`optional` need to be trained) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Optional segment token indices to indicate first and second portions of the inputs.
+            This embedding matrice is not trained (not pretrained during XLM-RoBERTa pretraining), you will have to train it
+            during finetuning.
+            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
+            corresponds to a `sentence B` token
+            (see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details).
+        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of positions of each input sequence tokens in the position embeddings.
+            Selected in the range ``[0, config.max_position_embeddings - 1[``.
+        **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+            Mask to nullify selected heads of the self-attention modules.
+            Mask values selected in ``[0, 1]``:
+            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+        **inputs_embeds**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
+            Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
+            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+            than the model's internal embedding lookup matrix.
+"""
+
+@add_start_docstrings("The bare XLM-RoBERTa Model transformer outputting raw hidden-states without any specific head on top.",
+                      XLM_ROBERTA_START_DOCSTRING, XLM_ROBERTA_INPUTS_DOCSTRING)
+class XLMRobertaModel(RobertaModel):
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
+            Sequence of hidden-states at the output of the last layer of the model.
+        **pooler_output**: ``torch.FloatTensor`` of shape ``(batch_size, hidden_size)``
+            Last layer hidden-state of the first token of the sequence (classification token)
+            further processed by a Linear layer and a Tanh activation function. The Linear
+            layer weights are trained from the next sentence prediction (classification)
+            eo match pre-training, XLM-RoBERTa input sequence should be formatted with <s> and </s> tokens as follows:
+
+            (a) For sequence pairs:
+
+                ``tokens:         <s> is this jack ##son ##ville ? </s> </s> no it is not . </s>``
+
+                ``token_type_ids:   0   0  0    0    0     0       0   0   0     1  1  1  1   1   1``
+
+            (b) For single sequences:
+
+                ``tokens:         <s> the dog is hairy . </s>``
+
+                ``token_type_ids:   0   0   0   0  0     0   0``
+
+            objective during Bert pretraining. This output is usually *not* a good summary
+            of the semantic content of the input, you're often better with averaging or pooling
+            the sequence of hidden-states for the whole input sequence.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large')
+        model = XLMRobertaModel.from_pretrained('xlm-roberta-large')
+        input_ids = torch.tensor(tokenizer.encode("Schloß Nymphenburg ist sehr schön .")).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids)
+        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+
+    """
+    config_class = XLMRobertaConfig
+    pretrained_model_archive_map = XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
+
+
+@add_start_docstrings("""XLM-RoBERTa Model with a `language modeling` head on top. """,
+    XLM_ROBERTA_START_DOCSTRING, XLM_ROBERTA_INPUTS_DOCSTRING)
+class XLMRobertaForMaskedLM(RobertaForMaskedLM):
+    r"""
+        **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Labels for computing the masked language modeling loss.
+            Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
+            Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
+            in ``[0, ..., config.vocab_size]``
+
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Masked language modeling loss.
+        **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large')
+        model = XLMRobertaForMaskedLM.from_pretrained('xlm-roberta-large')
+        input_ids = torch.tensor(tokenizer.encode("Schloß Nymphenburg ist sehr schön .")).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids, masked_lm_labels=input_ids)
+        loss, prediction_scores = outputs[:2]
+
+    """
+    config_class = XLMRobertaConfig
+    pretrained_model_archive_map = XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
+
+
+@add_start_docstrings("""XLM-RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer 
+    on top of the pooled output) e.g. for GLUE tasks. """,
+    XLM_ROBERTA_START_DOCSTRING, XLM_ROBERTA_INPUTS_DOCSTRING)
+class XLMRobertaForSequenceClassification(RobertaForSequenceClassification):
+    r"""
+        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+            Labels for computing the sequence classification/regression loss.
+            Indices should be in ``[0, ..., config.num_labels]``.
+            If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
+            If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
+
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Classification (or regression if config.num_labels==1) loss.
+        **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
+            Classification (or regression if config.num_labels==1) scores (before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large')
+        model = XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-large')
+        input_ids = torch.tensor(tokenizer.encode("Schloß Nymphenburg ist sehr schön .")).unsqueeze(0)  # Batch size 1
+        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids, labels=labels)
+        loss, logits = outputs[:2]
+
+    """
+    config_class = XLMRobertaConfig
+    pretrained_model_archive_map = XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
+
+
+@add_start_docstrings("""XLM-RoBERTa Model with a multiple choice classification head on top (a linear layer on top of
+    the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
+    XLM_ROBERTA_START_DOCSTRING, XLM_ROBERTA_INPUTS_DOCSTRING)
+class XLMRobertaForMultipleChoice(RobertaForMultipleChoice):
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Classification loss.
+        **classification_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices)`` where `num_choices` is the size of the second dimension
+            of the input tensors. (see `input_ids` above).
+            Classification scores (before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large')
+        model = XLMRobertaForMultipleChoice.from_pretrained('xlm-roberta-large')
+        choices = ["Schloß Nymphenburg ist sehr schön .", "Der Schloßkanal auch !"]
+        input_ids = torch.tensor([tokenizer.encode(s, add_special_tokens=True) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
+        labels = torch.tensor(1).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids, labels=labels)
+        loss, classification_scores = outputs[:2]
+
+    """
+    config_class = XLMRobertaConfig
+    pretrained_model_archive_map = XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
+
+
+@add_start_docstrings("""XLM-RoBERTa Model with a token classification head on top (a linear layer on top of
+    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
+    XLM_ROBERTA_START_DOCSTRING, XLM_ROBERTA_INPUTS_DOCSTRING)
+class XLMRobertaForTokenClassification(RobertaForTokenClassification):
+    r"""
+        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Labels for computing the token classification loss.
+            Indices should be in ``[0, ..., config.num_labels - 1]``.
+
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Classification loss.
+        **scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.num_labels)``
+            Classification scores (before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large')
+        model = XLMRobertaForTokenClassification.from_pretrained('xlm-roberta-large')
+        input_ids = torch.tensor(tokenizer.encode("Schloß Nymphenburg ist sehr schön .", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
+        labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids, labels=labels)
+        loss, scores = outputs[:2]
+
+    """
+    config_class = XLMRobertaConfig
+    pretrained_model_archive_map = XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
diff --git a/transformers/modeling_xlnet.py b/transformers/modeling_xlnet.py
index 26b95076cd..2a51bf932b 100644
--- a/transformers/modeling_xlnet.py
+++ b/transformers/modeling_xlnet.py
@@ -589,7 +589,7 @@ class XLNetModel(XLNetPreTrainedModel):
 
         tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
         model = XLNetModel.from_pretrained('xlnet-large-cased')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
         outputs = model(input_ids)
         last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
@@ -609,7 +609,7 @@ class XLNetModel(XLNetPreTrainedModel):
         self.clamp_len = config.clamp_len
         self.n_layer = config.n_layer
 
-        self.word_embedding = nn.Embedding(config.n_token, config.d_model)
+        self.word_embedding = nn.Embedding(config.vocab_size, config.d_model)
         self.mask_emb = nn.Parameter(torch.FloatTensor(1, 1, config.d_model))
         self.layer = nn.ModuleList([XLNetLayer(config) for _ in range(config.n_layer)])
         self.dropout = nn.Dropout(config.dropout)
@@ -925,7 +925,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
         tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
         model = XLNetLMHeadModel.from_pretrained('xlnet-large-cased')
         # We show how to setup inputs to predict a next token using a bi-directional context.
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very <mask>")).unsqueeze(0)  # We will predict the masked token
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=True)).unsqueeze(0)  # We will predict the masked token
         perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float)
         perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
         target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float)  # Shape [1, 1, seq_length] => let's predict one token
@@ -940,7 +940,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
         self.same_length = config.same_length
 
         self.transformer = XLNetModel(config)
-        self.lm_loss = nn.Linear(config.d_model, config.n_token, bias=True)
+        self.lm_loss = nn.Linear(config.d_model, config.vocab_size, bias=True)
 
         self.init_weights()
 
@@ -1031,7 +1031,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
 
         tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
         model = XLNetForSequenceClassification.from_pretrained('xlnet-large-cased')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
         labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
         outputs = model(input_ids, labels=labels)
         loss, logits = outputs[:2]
@@ -1318,7 +1318,7 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
 
         tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
         model = XLMForQuestionAnswering.from_pretrained('xlnet-large-cased')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
         start_positions = torch.tensor([1])
         end_positions = torch.tensor([3])
         outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
@@ -1433,7 +1433,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
 
         tokenizer =  XLNetTokenizer.from_pretrained('xlnet-large-cased')
         model = XLMForQuestionAnswering.from_pretrained('xlnet-large-cased')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
         start_positions = torch.tensor([1])
         end_positions = torch.tensor([3])
         outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
diff --git a/transformers/pipelines.py b/transformers/pipelines.py
new file mode 100755
index 0000000000..f4bf3da685
--- /dev/null
+++ b/transformers/pipelines.py
@@ -0,0 +1,907 @@
+# coding=utf-8
+# Copyright 2018 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import sys
+import csv
+import json
+import os
+import pickle
+import logging
+import six
+
+from abc import ABC, abstractmethod
+from contextlib import contextmanager
+from itertools import groupby
+from os.path import abspath, exists
+from typing import Union, Optional, Tuple, List, Dict
+
+import numpy as np
+
+from transformers import (AutoConfig, AutoTokenizer, PreTrainedTokenizer,
+                          PretrainedConfig, ModelCard, SquadExample,
+                          squad_convert_examples_to_features, is_tf_available,
+                          is_torch_available, BasicTokenizer,
+                          ALL_PRETRAINED_CONFIG_ARCHIVE_MAP)
+
+if is_tf_available():
+    import tensorflow as tf
+    from transformers import TFAutoModel, TFAutoModelForSequenceClassification, \
+        TFAutoModelForQuestionAnswering, TFAutoModelForTokenClassification
+
+if is_torch_available():
+    import torch
+    from transformers import AutoModel, AutoModelForSequenceClassification, \
+        AutoModelForQuestionAnswering, AutoModelForTokenClassification
+
+
+logger = logging.getLogger(__name__)
+
+def get_framework(model=None):
+    """ Select framework (TensorFlow/PyTorch) to use.
+        If both frameworks are installed and no specific model is provided, defaults to using PyTorch.
+    """
+    if is_tf_available() and is_torch_available() and model is not None and not isinstance(model, str):
+        # Both framework are available but the use supplied a model class instance.
+        # Try to guess which framework to use from the model classname
+        framework = 'tf' if model.__class__.__name__.startswith('TF') else 'pt'
+    elif not is_tf_available() and not is_torch_available():
+        raise ImportError("At least one of TensorFlow 2.0 or PyTorch should be installed. "
+                          "To install TensorFlow 2.0, read the instructions at https://www.tensorflow.org/install/ "
+                          "To install PyTorch, read the instructions at https://pytorch.org/.")
+    else:
+        # framework = 'tf' if is_tf_available() else 'pt'
+        framework = 'pt' if is_torch_available() else 'tf'
+    return framework
+
+class ArgumentHandler(ABC):
+    """
+    Base interface for handling varargs for each Pipeline
+    """
+    @abstractmethod
+    def __call__(self, *args, **kwargs):
+        raise NotImplementedError()
+
+
+class DefaultArgumentHandler(ArgumentHandler):
+    """
+    Default varargs argument parser handling parameters for each Pipeline
+    """
+    def __call__(self, *args, **kwargs):
+        if 'X' in kwargs:
+            return kwargs['X']
+        elif 'data' in kwargs:
+            return kwargs['data']
+        elif len(args) == 1:
+            if isinstance(args[0], list):
+                return args[0]
+            else:
+                return [args[0]]
+        elif len(args) > 1:
+            return list(args)
+        raise ValueError('Unable to infer the format of the provided data (X=, data=, ...)')
+
+
+class PipelineDataFormat:
+    """
+    Base class for all the pipeline supported data format both for reading and writing.
+    Supported data formats currently includes:
+     - JSON
+     - CSV
+     - stdin/stdout (pipe)
+
+    PipelineDataFormat also includes some utilities to work with multi-columns like mapping from datasets columns
+    to pipelines keyword arguments through the `dataset_kwarg_1=dataset_column_1` format.
+    """
+    SUPPORTED_FORMATS = ['json', 'csv', 'pipe']
+
+    def __init__(self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False):
+        self.output_path = output_path
+        self.input_path = input_path
+        self.column = column.split(',') if column is not None else ['']
+        self.is_multi_columns = len(self.column) > 1
+
+        if self.is_multi_columns:
+            self.column = [tuple(c.split('=')) if '=' in c else (c, c) for c in self.column]
+
+        if output_path is not None and not overwrite:
+            if exists(abspath(self.output_path)):
+                raise OSError('{} already exists on disk'.format(self.output_path))
+
+        if input_path is not None:
+            if not exists(abspath(self.input_path)):
+                raise OSError('{} doesnt exist on disk'.format(self.input_path))
+
+    @abstractmethod
+    def __iter__(self):
+        raise NotImplementedError()
+
+    @abstractmethod
+    def save(self, data: dict):
+        """
+        Save the provided data object with the representation for the current `DataFormat`.
+        :param data: data to store
+        :return:
+        """
+        raise NotImplementedError()
+
+    def save_binary(self, data: Union[dict, List[dict]]) -> str:
+        """
+        Save the provided data object as a pickle-formatted binary data on the disk.
+        :param data: data to store
+        :return: (str) Path where the data has been saved
+        """
+        path, _ = os.path.splitext(self.output_path)
+        binary_path = os.path.extsep.join((path, 'pickle'))
+
+        with open(binary_path, 'wb+') as f_output:
+            pickle.dump(data, f_output)
+
+        return binary_path
+
+    @staticmethod
+    def from_str(format: str, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False):
+        if format == 'json':
+            return JsonPipelineDataFormat(output_path, input_path, column, overwrite=overwrite)
+        elif format == 'csv':
+            return CsvPipelineDataFormat(output_path, input_path, column, overwrite=overwrite)
+        elif format == 'pipe':
+            return PipedPipelineDataFormat(output_path, input_path, column, overwrite=overwrite)
+        else:
+            raise KeyError('Unknown reader {} (Available reader are json/csv/pipe)'.format(format))
+
+
+class CsvPipelineDataFormat(PipelineDataFormat):
+    def __init__(self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False):
+        super().__init__(output_path, input_path, column, overwrite=overwrite)
+
+    def __iter__(self):
+        with open(self.input_path, 'r') as f:
+            reader = csv.DictReader(f)
+            for row in reader:
+                if self.is_multi_columns:
+                    yield {k: row[c] for k, c in self.column}
+                else:
+                    yield row[self.column[0]]
+
+    def save(self, data: List[dict]):
+        with open(self.output_path, 'w') as f:
+            if len(data) > 0:
+                writer = csv.DictWriter(f, list(data[0].keys()))
+                writer.writeheader()
+                writer.writerows(data)
+
+
+class JsonPipelineDataFormat(PipelineDataFormat):
+    def __init__(self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False):
+        super().__init__(output_path, input_path, column, overwrite=overwrite)
+
+        with open(input_path, 'r') as f:
+            self._entries = json.load(f)
+
+    def __iter__(self):
+        for entry in self._entries:
+            if self.is_multi_columns:
+                yield {k: entry[c] for k, c in self.column}
+            else:
+                yield entry[self.column[0]]
+
+    def save(self, data: dict):
+        with open(self.output_path, 'w') as f:
+            json.dump(data, f)
+
+
+class PipedPipelineDataFormat(PipelineDataFormat):
+    """
+    Read data from piped input to the python process.
+    For multi columns data, columns should separated by \t
+
+    If columns are provided, then the output will be a dictionary with {column_x: value_x}
+    """
+    def __iter__(self):
+        for line in sys.stdin:
+            # Split for multi-columns
+            if '\t' in line:
+
+                line = line.split('\t')
+                if self.column:
+                    # Dictionary to map arguments
+                    yield {kwargs: l for (kwargs, _), l in zip(self.column, line)}
+                else:
+                    yield tuple(line)
+
+            # No dictionary to map arguments
+            else:
+                yield line
+
+    def save(self, data: dict):
+        print(data)
+
+    def save_binary(self, data: Union[dict, List[dict]]) -> str:
+        if self.output_path is None:
+            raise KeyError(
+                'When using piped input on pipeline outputting large object requires an output file path. '
+                'Please provide such output path through --output argument.'
+            )
+
+        return super().save_binary(data)
+
+
+class _ScikitCompat(ABC):
+    """
+    Interface layer for the Scikit and Keras compatibility.
+    """
+
+    @abstractmethod
+    def transform(self, X):
+        raise NotImplementedError()
+
+    @abstractmethod
+    def predict(self, X):
+        raise NotImplementedError()
+
+
+class Pipeline(_ScikitCompat):
+    """
+    Base class implementing pipelined operations.
+    Pipeline workflow is defined as a sequence of the following operations:
+        Input -> Tokenization -> Model Inference -> Post-Processing (Task dependent) -> Output
+
+    Pipeline supports running on CPU or GPU through the device argument. Users can specify
+    device argument as an integer, -1 meaning "CPU", >= 0 referring the CUDA device ordinal.
+
+    Some pipeline, like for instance FeatureExtractionPipeline ('feature-extraction') outputs large
+    tensor object as nested-lists. In order to avoid dumping such large structure as textual data we
+    provide the binary_output constructor argument. If set to True, the output will be stored in the
+    pickle format.
+
+    Arguments:
+        **model**: ``(str, PretrainedModel, TFPretrainedModel)``:
+            Reference to the model to use through this pipeline.
+
+        **tokenizer**: ``(str, PreTrainedTokenizer)``:
+            Reference to the tokenizer to use through this pipeline.
+
+        **args_parser**: ``ArgumentHandler``:
+            Reference to the object in charge of parsing supplied pipeline parameters.
+
+        **device**: ``int``:
+            Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model
+            on the associated CUDA device id.
+
+        **binary_output** ``bool`` (default: False):
+            Flag indicating if the output the pipeline should happen in a binary format (i.e. pickle) or as raw text.
+
+    Return:
+        Pipeline returns list or dictionary depending on:
+         - Does the user provided multiple sample
+         - The pipeline expose multiple fields in the output object
+
+    Examples:
+        nlp = pipeline('ner')
+        nlp = pipeline('ner', model='...', config='...', tokenizer='...')
+        nlp = NerPipeline(model='...', config='...', tokenizer='...')
+        nlp = QuestionAnsweringPipeline(model=AutoModel.from_pretrained('...'), tokenizer='...')
+    """
+
+    default_input_names = None
+
+    def __init__(self, model, tokenizer: PreTrainedTokenizer = None,
+                 modelcard: ModelCard = None, framework: Optional[str] = None,
+                 args_parser: ArgumentHandler = None, device: int = -1,
+                 binary_output: bool = False):
+
+        if framework is None:
+            framework = get_framework()
+
+        self.model = model
+        self.tokenizer = tokenizer
+        self.modelcard = modelcard
+        self.framework = framework
+        self.device = device
+        self.binary_output = binary_output
+        self._args_parser = args_parser or DefaultArgumentHandler()
+
+        # Special handling
+        if self.device >= 0 and self.framework == 'pt':
+            self.model = self.model.to('cuda:{}'.format(self.device))
+
+    def save_pretrained(self, save_directory):
+        """
+        Save the pipeline's model and tokenizer to the specified save_directory
+        """
+        if not os.path.isdir(save_directory):
+            logger.error("Provided path ({}) should be a directory".format(save_directory))
+            return
+
+        self.model.save_pretrained(save_directory)
+        self.tokenizer.save_pretrained(save_directory)
+        self.modelcard.save_pretrained(save_directory)
+
+    def transform(self, X):
+        """
+        Scikit / Keras interface to transformers' pipelines. This method will forward to __call__().
+        """
+        return self(X=X)
+
+    def predict(self, X):
+        """
+        Scikit / Keras interface to transformers' pipelines. This method will forward to __call__().
+        Se
+        """
+        return self(X=X)
+
+    @contextmanager
+    def device_placement(self):
+        """
+        Context Manager allowing tensor allocation on the user-specified device in framework agnostic way.
+        example:
+            # Explicitly ask for tensor allocation on CUDA device :0
+            nlp = pipeline(..., device=0)
+            with nlp.device_placement():
+                # Every framework specific tensor allocation will be done on the request device
+                output = nlp(...)
+        Returns:
+            Context manager
+        """
+        if self.framework == 'tf':
+            with tf.device('/CPU:0' if self.device == -1 else '/device:GPU:{}'.format(self.device)):
+                yield
+        else:
+            if self.device >= 0:
+                torch.cuda.set_device(self.device)
+
+            yield
+
+    def inputs_for_model(self, features: Union[dict, List[dict]]) -> Dict:
+        """
+        Generates the input dictionary with model-specific parameters.
+
+        Returns:
+            dict holding all the required parameters for model's forward
+        """
+        args = ['input_ids', 'attention_mask']
+        model_type = type(self.model).__name__.lower()
+
+        if 'distilbert' not in model_type and 'xlm' not in model_type:
+            args += ['token_type_ids']
+
+        # PR #1548 (CLI) There is an issue with attention_mask
+        # if 'xlnet' in model_type or 'xlm' in model_type:
+        #     args += ['cls_index', 'p_mask']
+
+        if isinstance(features, dict):
+            return {k: features[k] for k in args}
+        else:
+            return {k: [feature[k] for feature in features] for k in args}
+
+    def __call__(self, *texts, **kwargs):
+        # Parse arguments
+        inputs = self._args_parser(*texts, **kwargs)
+
+        # Encode for forward
+        with self.device_placement():
+            inputs = self.tokenizer.batch_encode_plus(
+                inputs, add_special_tokens=True,
+                return_tensors=self.framework,
+                max_length=self.tokenizer.max_len
+            )
+
+            # Filter out features not available on specific models
+            inputs = self.inputs_for_model(inputs)
+            return self._forward(inputs)
+
+    def _forward(self, inputs):
+        """
+        Internal framework specific forward dispatching.
+        Args:
+            inputs: dict holding all the keyworded arguments for required by the model forward method.
+        Returns:
+            Numpy array
+        """
+        if self.framework == 'tf':
+            # TODO trace model
+            predictions = self.model(inputs, training=False)[0]
+        else:
+            with torch.no_grad():
+                predictions = self.model(**inputs)[0].cpu()
+
+        return predictions.numpy()
+
+
+class FeatureExtractionPipeline(Pipeline):
+    """
+    Feature extraction pipeline using Model head.
+    """
+
+    def __init__(self, model,
+                 tokenizer: PreTrainedTokenizer = None,
+                 modelcard: ModelCard = None,
+                 framework: Optional[str] = None,
+                 args_parser: ArgumentHandler = None,
+                 device: int = -1):
+        super().__init__(model=model,
+                         tokenizer=tokenizer,
+                         modelcard=modelcard,
+                         framework=framework,
+                         args_parser=args_parser,
+                         device=device,
+                         binary_output=True)
+
+    def __call__(self, *args, **kwargs):
+        return super().__call__(*args, **kwargs).tolist()
+
+
+class TextClassificationPipeline(Pipeline):
+    """
+    Text classification pipeline using ModelForTextClassification head.
+    """
+
+    def __call__(self, *args, **kwargs):
+        outputs = super().__call__(*args, **kwargs)
+        scores = np.exp(outputs) / np.exp(outputs).sum(-1)
+        return [{'label': self.model.config.id2label[item.argmax()], 'score': item.max()} for item in scores]
+
+
+class NerPipeline(Pipeline):
+    """
+    Named Entity Recognition pipeline using ModelForTokenClassification head.
+    """
+
+    default_input_names = 'sequences'
+
+    def __init__(self, model, tokenizer: PreTrainedTokenizer = None,
+                 modelcard: ModelCard = None, framework: Optional[str] = None,
+                 args_parser: ArgumentHandler = None, device: int = -1,
+                 binary_output: bool = False, ignore_labels=['O']):
+        super().__init__(model=model,
+                         tokenizer=tokenizer,
+                         modelcard=modelcard,
+                         framework=framework,
+                         args_parser=args_parser,
+                         device=device,
+                         binary_output=binary_output)
+
+        self._basic_tokenizer = BasicTokenizer(do_lower_case=False)
+        self.ignore_labels = ignore_labels
+
+    def __call__(self, *texts, **kwargs):
+        inputs, answers = self._args_parser(*texts, **kwargs), []
+        for sentence in inputs:
+
+            # Manage correct placement of the tensors
+            with self.device_placement():
+
+                tokens = self.tokenizer.encode_plus(
+                    sentence, return_attention_mask=False,
+                    return_tensors=self.framework,
+                    max_length=self.tokenizer.max_len
+                )
+
+                # Forward
+                if self.framework == 'tf':
+                    entities = self.model(tokens)[0][0].numpy()
+                    input_ids = tokens['input_ids'].numpy()[0]
+                else:
+                    with torch.no_grad():
+                        entities = self.model(**tokens)[0][0].cpu().numpy()
+                        input_ids = tokens['input_ids'].cpu().numpy()[0]
+
+            score = np.exp(entities) / np.exp(entities).sum(-1, keepdims=True)
+            labels_idx = score.argmax(axis=-1)
+
+            answer = []
+            for idx, label_idx in enumerate(labels_idx):
+                if self.model.config.id2label[label_idx] not in self.ignore_labels:
+                    answer += [{
+                        'word': self.tokenizer.decode([int(input_ids[idx])]),
+                        'score': score[idx][label_idx].item(),
+                        'entity': self.model.config.id2label[label_idx]
+                    }]
+
+            # Append
+            answers += [answer]
+        if len(answers) == 1:
+            return answers[0]
+        return answers
+
+
+class QuestionAnsweringArgumentHandler(ArgumentHandler):
+    """
+    QuestionAnsweringPipeline requires the user to provide multiple arguments (i.e. question & context) to be mapped
+    to internal SquadExample / SquadFeature structures.
+
+    QuestionAnsweringArgumentHandler manages all the possible to create SquadExample from the command-line supplied
+    arguments.
+    """
+    def __call__(self, *args, **kwargs):
+        # Position args, handling is sensibly the same as X and data, so forwarding to avoid duplicating
+        if args is not None and len(args) > 0:
+            if len(args) == 1:
+                kwargs['X'] = args[0]
+            else:
+                kwargs['X'] = list(args)
+
+        # Generic compatibility with sklearn and Keras
+        # Batched data
+        if 'X' in kwargs or 'data' in kwargs:
+            inputs = kwargs['X'] if 'X' in kwargs else kwargs['data']
+
+            if isinstance(inputs, dict):
+                inputs = [inputs]
+            else:
+                # Copy to avoid overriding arguments
+                inputs = [i for i in inputs]
+
+            for i, item in enumerate(inputs):
+                if isinstance(item, dict):
+                    if any(k not in item for k in ['question', 'context']):
+                        raise KeyError('You need to provide a dictionary with keys {question:..., context:...}')
+
+                    inputs[i] = QuestionAnsweringPipeline.create_sample(**item)
+
+                elif not isinstance(item, SquadExample):
+                    raise ValueError(
+                        '{} argument needs to be of type (list[SquadExample | dict], SquadExample, dict)'
+                            .format('X' if 'X' in kwargs else 'data')
+                    )
+
+            # Tabular input
+        elif 'question' in kwargs and 'context' in kwargs:
+            if isinstance(kwargs['question'], str):
+                kwargs['question'] = [kwargs['question']]
+
+            if isinstance(kwargs['context'], str):
+                kwargs['context'] = [kwargs['context']]
+
+            inputs = [QuestionAnsweringPipeline.create_sample(q, c) for q, c in zip(kwargs['question'], kwargs['context'])]
+        else:
+            raise ValueError('Unknown arguments {}'.format(kwargs))
+
+        if not isinstance(inputs, list):
+            inputs = [inputs]
+
+        return inputs
+
+
+class QuestionAnsweringPipeline(Pipeline):
+    """
+    Question Answering pipeline using ModelForQuestionAnswering head.
+    """
+
+    default_input_names = 'question,context'
+
+    def __init__(self, model,
+                 tokenizer: Optional[PreTrainedTokenizer],
+                 modelcard: Optional[ModelCard],
+                 framework: Optional[str] = None,
+                 device: int = -1, **kwargs):
+        super().__init__(model=model,
+                         tokenizer=tokenizer,
+                         modelcard=modelcard,
+                         framework=framework,
+                         args_parser=QuestionAnsweringArgumentHandler(),
+                         device=device, **kwargs)
+
+    @staticmethod
+    def create_sample(question: Union[str, List[str]], context: Union[str, List[str]]) -> Union[SquadExample, List[SquadExample]]:
+        """
+        QuestionAnsweringPipeline leverages the SquadExample/SquadFeatures internally.
+        This helper method encapsulate all the logic for converting question(s) and context(s) to SquadExample(s).
+        We currently support extractive question answering.
+        Arguments:
+             question: (str, List[str]) The question to be ask for the associated context
+             context: (str, List[str]) The context in which we will look for the answer.
+
+        Returns:
+            SquadExample initialized with the corresponding question and context.
+        """
+        if isinstance(question, list):
+            return [SquadExample(None, q, c, None, None, None) for q, c in zip(question, context)]
+        else:
+            return SquadExample(None, question, context, None, None, None)
+
+    def __call__(self, *texts, **kwargs):
+        """
+        Args:
+            We support multiple use-cases, the following are exclusive:
+            X: sequence of SquadExample
+            data: sequence of SquadExample
+            question: (str, List[str]), batch of question(s) to map along with context
+            context: (str, List[str]), batch of context(s) associated with the provided question keyword argument
+        Returns:
+            dict: {'answer': str, 'score": float, 'start": int, "end": int}
+            answer: the textual answer in the intial context
+            score: the score the current answer scored for the model
+            start: the character index in the original string corresponding to the beginning of the answer' span
+            end: the character index in the original string corresponding to the ending of the answer' span
+        """
+        # Set defaults values
+        kwargs.setdefault('topk', 1)
+        kwargs.setdefault('doc_stride', 128)
+        kwargs.setdefault('max_answer_len', 15)
+        kwargs.setdefault('max_seq_len', 384)
+        kwargs.setdefault('max_question_len', 64)
+
+        if kwargs['topk'] < 1:
+            raise ValueError('topk parameter should be >= 1 (got {})'.format(kwargs['topk']))
+
+        if kwargs['max_answer_len'] < 1:
+            raise ValueError('max_answer_len parameter should be >= 1 (got {})'.format(kwargs['max_answer_len']))
+
+        # Convert inputs to features
+        examples = self._args_parser(*texts, **kwargs)
+        features = squad_convert_examples_to_features(examples, self.tokenizer, kwargs['max_seq_len'], kwargs['doc_stride'], kwargs['max_question_len'], False)
+        fw_args = self.inputs_for_model([f.__dict__ for f in features])
+
+        # Manage tensor allocation on correct device
+        with self.device_placement():
+            if self.framework == 'tf':
+                fw_args = {k: tf.constant(v) for (k, v) in fw_args.items()}
+                start, end = self.model(fw_args)
+                start, end = start.numpy(), end.numpy()
+            else:
+                with torch.no_grad():
+                    # Retrieve the score for the context tokens only (removing question tokens)
+                    fw_args = {k: torch.tensor(v) for (k, v) in fw_args.items()}
+                    start, end = self.model(**fw_args)
+                    start, end = start.cpu().numpy(), end.cpu().numpy()
+
+        answers = []
+        for (example, feature, start_, end_) in zip(examples, features, start, end):
+            # Normalize logits and spans to retrieve the answer
+            start_ = np.exp(start_) / np.sum(np.exp(start_))
+            end_ = np.exp(end_) / np.sum(np.exp(end_))
+
+            # Mask padding and question
+            start_, end_ = start_ * np.abs(np.array(feature.p_mask) - 1), end_ * np.abs(np.array(feature.p_mask) - 1)
+
+            # TODO : What happens if not possible
+            # Mask CLS
+            start_[0] = end_[0] = 0
+
+            starts, ends, scores = self.decode(start_, end_, kwargs['topk'], kwargs['max_answer_len'])
+            char_to_word = np.array(example.char_to_word_offset)
+
+            # Convert the answer (tokens) back to the original text
+            answers += [
+                {
+                    'score': score.item(),
+                    'start': np.where(char_to_word == feature.token_to_orig_map[s])[0][0].item(),
+                    'end': np.where(char_to_word == feature.token_to_orig_map[e])[0][-1].item(),
+                    'answer': ' '.join(example.doc_tokens[feature.token_to_orig_map[s]:feature.token_to_orig_map[e] + 1])
+                }
+                for s, e, score in zip(starts, ends, scores)
+            ]
+        if len(answers) == 1:
+            return answers[0]
+        return answers
+
+    def decode(self, start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int) -> Tuple:
+        """
+        Take the output of any QuestionAnswering head and will generate probalities for each span to be
+        the actual answer.
+        In addition, it filters out some unwanted/impossible cases like answer len being greater than
+        max_answer_len or answer end position being before the starting position.
+        The method supports output the k-best answer through the topk argument.
+
+        Args:
+            start: numpy array, holding individual start probabilities for each token
+            end: numpy array, holding individual end probabilities for each token
+            topk: int, indicates how many possible answer span(s) to extract from the model's output
+            max_answer_len: int, maximum size of the answer to extract from the model's output
+        """
+        # Ensure we have batch axis
+        if start.ndim == 1:
+            start = start[None]
+
+        if end.ndim == 1:
+            end = end[None]
+
+        # Compute the score of each tuple(start, end) to be the real answer
+        outer = np.matmul(np.expand_dims(start, -1), np.expand_dims(end, 1))
+
+        # Remove candidate with end < start and end - start > max_answer_len
+        candidates = np.tril(np.triu(outer), max_answer_len - 1)
+
+        #  Inspired by Chen & al. (https://github.com/facebookresearch/DrQA)
+        scores_flat = candidates.flatten()
+        if topk == 1:
+            idx_sort = [np.argmax(scores_flat)]
+        elif len(scores_flat) < topk:
+            idx_sort = np.argsort(-scores_flat)
+        else:
+            idx = np.argpartition(-scores_flat, topk)[0:topk]
+            idx_sort = idx[np.argsort(-scores_flat[idx])]
+
+        start, end = np.unravel_index(idx_sort, candidates.shape)[1:]
+        return start, end, candidates[0, start, end]
+
+    def span_to_answer(self, text: str, start: int, end: int):
+        """
+        When decoding from token probalities, this method maps token indexes to actual word in
+        the initial context.
+
+        Args:
+            text: str, the actual context to extract the answer from
+            start: int, starting answer token index
+            end: int, ending answer token index
+
+        Returns:
+            dict: {'answer': str, 'start': int, 'end': int}
+        """
+        words = []
+        token_idx = char_start_idx = char_end_idx = chars_idx = 0
+
+        for i, word in enumerate(text.split(" ")):
+            token = self.tokenizer.tokenize(word)
+
+            # Append words if they are in the span
+            if start <= token_idx <= end:
+                if token_idx == start:
+                    char_start_idx = chars_idx
+
+                if token_idx == end:
+                    char_end_idx = chars_idx + len(word)
+
+                words += [word]
+
+            # Stop if we went over the end of the answer
+            if token_idx > end:
+                break
+
+            # Append the subtokenization length to the running index
+            token_idx += len(token)
+            chars_idx += len(word) + 1
+
+        # Join text with spaces
+        return {'answer': ' '.join(words), 'start': max(0, char_start_idx), 'end': min(len(text), char_end_idx)}
+
+
+# Register all the supported task here
+SUPPORTED_TASKS = {
+    'feature-extraction': {
+        'impl': FeatureExtractionPipeline,
+        'tf': TFAutoModel if is_tf_available() else None,
+        'pt': AutoModel if is_torch_available() else None,
+        'default': {
+            'model': {
+                'pt': 'distilbert-base-uncased',
+                'tf': 'distilbert-base-uncased',
+            },
+            'config': None,
+            'tokenizer': 'distilbert-base-uncased'
+        }
+    },
+    'sentiment-analysis': {
+        'impl': TextClassificationPipeline,
+        'tf': TFAutoModelForSequenceClassification if is_tf_available() else None,
+        'pt': AutoModelForSequenceClassification if is_torch_available() else None,
+        'default': {
+            'model': {
+                'pt': 'https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-pytorch_model.bin',
+                'tf': 'https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-tf_model.h5',
+            },
+            'config': 'https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-config.json',
+            'tokenizer': 'distilbert-base-uncased'
+        }
+    },
+    'ner': {
+        'impl': NerPipeline,
+        'tf': TFAutoModelForTokenClassification if is_tf_available() else None,
+        'pt': AutoModelForTokenClassification if is_torch_available() else None,
+        'default': {
+            'model': {
+                'pt':'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-pytorch_model.bin',
+                'tf': 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-tf_model.h5',
+            },
+            'config': 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-config.json',
+            'tokenizer': 'bert-large-cased'
+        }
+    },
+    'question-answering': {
+        'impl': QuestionAnsweringPipeline,
+        'tf': TFAutoModelForQuestionAnswering if is_tf_available() else None,
+        'pt': AutoModelForQuestionAnswering if is_torch_available() else None,
+        'default': {
+            'model': {
+                'pt': 'distilbert-base-uncased-distilled-squad',
+                'tf': 'distilbert-base-uncased-distilled-squad',
+            },
+            'config': None,
+            'tokenizer': 'distilbert-base-uncased'
+        }
+    }
+}
+
+
+def pipeline(task: str, model: Optional = None,
+             config: Optional[Union[str, PretrainedConfig]] = None,
+             tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None,
+             modelcard: Optional[Union[str, ModelCard]] = None,
+             **kwargs) -> Pipeline:
+    """
+    Utility factory method to build a pipeline.
+    Pipeline are made of:
+        A Tokenizer instance in charge of mapping raw textual input to token
+        A Model instance
+        Some (optional) post processing for enhancing model's output
+
+    Examples:
+        pipeline('sentiment-analysis')
+        pipeline('question-answering', model='distilbert-base-uncased-distilled-squad', tokenizer='bert-base-cased')
+        pipeline('ner', model=AutoModel.from_pretrained(...), tokenizer=AutoTokenizer.from_pretrained(...)
+        pipeline('ner', model='https://...pytorch-model.bin', config='https://...config.json', tokenizer='bert-base-cased')
+    """
+    # Retrieve the task
+    if task not in SUPPORTED_TASKS:
+        raise KeyError("Unknown task {}, available tasks are {}".format(task, list(SUPPORTED_TASKS.keys())))
+
+    framework = get_framework(model)
+
+    targeted_task = SUPPORTED_TASKS[task]
+    task, model_class = targeted_task['impl'], targeted_task[framework]
+
+    # Use default model/config/tokenizer for the task if no model is provided
+    if model is None:
+        models, config, tokenizer = tuple(targeted_task['default'].values())
+        model = models[framework]
+
+    # Try to infer tokenizer from model or config name (if provided as str)
+    if tokenizer is None:
+        if isinstance(model, str) and model in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP:
+            tokenizer = model
+        elif isinstance(config, str) and config in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP:
+            tokenizer = config
+        else:
+            # Impossible to guest what is the right tokenizer here
+            raise Exception("Impossible to guess which tokenizer to use. "
+                            "Please provided a PretrainedTokenizer class or a path/url/shortcut name to a pretrained tokenizer.")
+
+    # Try to infer modelcard from model or config name (if provided as str)
+    if modelcard is None:
+        # Try to fallback on one of the provided string for model or config (will replace the suffix)
+        if isinstance(model, str):
+            modelcard = model
+        elif isinstance(config, str):
+            modelcard = config
+
+    # Instantiate tokenizer if needed
+    if isinstance(tokenizer, six.string_types):
+        tokenizer = AutoTokenizer.from_pretrained(tokenizer)
+
+    # Instantiate config if needed
+    if isinstance(config, str):
+        config = AutoConfig.from_pretrained(config)
+
+    # Instantiate modelcard if needed
+    if isinstance(modelcard, str):
+        modelcard = ModelCard.from_pretrained(modelcard)
+
+    # Instantiate model if needed
+    if isinstance(model, str):
+        # Handle transparent TF/PT model conversion
+        model_kwargs = {}
+        if framework == 'pt' and model.endswith('.h5'):
+            model_kwargs['from_tf'] = True
+            logger.warning('Model might be a TensorFlow model (ending with `.h5`) but TensorFlow is not available. '
+                           'Trying to load the model with PyTorch.')
+        elif framework == 'tf' and model.endswith('.bin'):
+            model_kwargs['from_pt'] = True
+            logger.warning('Model might be a PyTorch model (ending with `.bin`) but PyTorch is not available. '
+                           'Trying to load the model with Tensorflow.')
+        model = model_class.from_pretrained(model, config=config, **model_kwargs)
+
+    return task(model=model, tokenizer=tokenizer, modelcard=modelcard, framework=framework, **kwargs)
diff --git a/transformers/tests/configuration_common_test.py b/transformers/tests/configuration_common_test.py
index 8ee751153c..376d110d3c 100644
--- a/transformers/tests/configuration_common_test.py
+++ b/transformers/tests/configuration_common_test.py
@@ -16,15 +16,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import copy
 import os
-import shutil
 import json
-import random
-import uuid
+import tempfile
 
 import unittest
-import logging
+from .tokenization_tests_commons import TemporaryDirectory
 
 
 class ConfigTester(object):
@@ -48,16 +45,28 @@ class ConfigTester(object):
 
     def create_and_test_config_to_json_file(self):
         config_first = self.config_class(**self.inputs_dict)
-        json_file_path = os.path.join(os.getcwd(), "config_" + str(uuid.uuid4()) + ".json")
-        config_first.to_json_file(json_file_path)
-        config_second = self.config_class.from_json_file(json_file_path)
-        os.remove(json_file_path)
+
+        with TemporaryDirectory() as tmpdirname:
+            json_file_path = os.path.join(tmpdirname, "config.json")
+            config_first.to_json_file(json_file_path)
+            config_second = self.config_class.from_json_file(json_file_path)
+
+        self.parent.assertEqual(config_second.to_dict(), config_first.to_dict())
+
+    def create_and_test_config_from_and_save_pretrained(self):
+        config_first = self.config_class(**self.inputs_dict)
+
+        with TemporaryDirectory() as tmpdirname:
+            config_first.save_pretrained(tmpdirname)
+            config_second = self.config_class.from_pretrained(tmpdirname)
+
         self.parent.assertEqual(config_second.to_dict(), config_first.to_dict())
 
     def run_common_tests(self):
         self.create_and_test_config_common_properties()
         self.create_and_test_config_to_json_string()
         self.create_and_test_config_to_json_file()
+        self.create_and_test_config_from_and_save_pretrained()
 
 if __name__ == "__main__":
     unittest.main()
\ No newline at end of file
diff --git a/transformers/tests/fixtures/empty.txt b/transformers/tests/fixtures/empty.txt
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/transformers/tests/hf_api_test.py b/transformers/tests/hf_api_test.py
index 92d41b6dff..b45f5aceed 100644
--- a/transformers/tests/hf_api_test.py
+++ b/transformers/tests/hf_api_test.py
@@ -15,18 +15,30 @@
 from __future__ import absolute_import, division, print_function
 
 import os
-import six
 import time
 import unittest
 
-from transformers.hf_api import HfApi, S3Obj, PresignedUrl, HfFolder, HTTPError
+import requests
+import six
+
+from transformers.hf_api import HfApi, HfFolder, HTTPError, PresignedUrl, S3Obj
 
 USER = "__DUMMY_TRANSFORMERS_USER__"
 PASS = "__DUMMY_TRANSFORMERS_PASS__"
-FILE_KEY = "Test-{}.txt".format(int(time.time()))
-FILE_PATH = os.path.join(
-    os.path.dirname(os.path.abspath(__file__)), "fixtures/input.txt"
-)
+FILES = [
+    (
+        "Test-{}.txt".format(int(time.time())),
+        os.path.join(
+            os.path.dirname(os.path.abspath(__file__)), "fixtures/input.txt"
+        )
+    ),
+    (
+        "yoyo {}.txt".format(int(time.time())), # space is intentional
+        os.path.join(
+            os.path.dirname(os.path.abspath(__file__)), "fixtures/empty.txt"
+        )
+    ),
+]
 
 
 
@@ -57,15 +69,21 @@ class HfApiEndpointsTest(HfApiCommonTest):
         self.assertEqual(user, USER)
 
     def test_presign(self):
-        urls = self._api.presign(token=self._token, filename=FILE_KEY)
-        self.assertIsInstance(urls, PresignedUrl)
-        self.assertEqual(urls.type, "text/plain")
+        for FILE_KEY, FILE_PATH in FILES:
+            urls = self._api.presign(token=self._token, filename=FILE_KEY)
+            self.assertIsInstance(urls, PresignedUrl)
+            self.assertEqual(urls.type, "text/plain")
 
     def test_presign_and_upload(self):
-        access_url = self._api.presign_and_upload(
-            token=self._token, filename=FILE_KEY, filepath=FILE_PATH
-        )
-        self.assertIsInstance(access_url, six.string_types)
+        for FILE_KEY, FILE_PATH in FILES:
+            access_url = self._api.presign_and_upload(
+                token=self._token, filename=FILE_KEY, filepath=FILE_PATH
+            )
+            self.assertIsInstance(access_url, six.string_types)
+            with open(FILE_PATH, 'r') as f:
+                body = f.read()
+            r = requests.get(access_url)
+            self.assertEqual(r.text, body)
 
     def test_list_objs(self):
         objs = self._api.list_objs(token=self._token)
diff --git a/transformers/tests/model_card_test.py b/transformers/tests/model_card_test.py
new file mode 100644
index 0000000000..b293b5726a
--- /dev/null
+++ b/transformers/tests/model_card_test.py
@@ -0,0 +1,89 @@
+# coding=utf-8
+# Copyright 2019 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import os
+import json
+import unittest
+
+from transformers.modelcard import ModelCard
+from .tokenization_tests_commons import TemporaryDirectory
+
+class ModelCardTester(unittest.TestCase):
+
+    def setUp(self):
+        self.inputs_dict = {'model_details': {
+                                'Organization': 'testing',
+                                'Model date': 'today',
+                                'Model version': 'v2.1, Developed by Test Corp in 2019.',
+                                'Architecture': 'Convolutional Neural Network.',
+                                },
+                            'metrics': 'BLEU and ROUGE-1',
+                            'evaluation_data':{
+                                'Datasets':{
+                                    'BLEU': 'My-great-dataset-v1',
+                                    'ROUGE-1': 'My-short-dataset-v2.1',
+                                },
+                                'Preprocessing': 'See details on https://arxiv.org/pdf/1810.03993.pdf'
+                            },
+                            'training_data':{
+                                'Dataset': 'English Wikipedia dump dated 2018-12-01',
+                                'Preprocessing': 'Using SentencePiece vocabulary of size 52k tokens. See details on https://arxiv.org/pdf/1810.03993.pdf'
+                            },
+                            'quantitative_analyses': {
+                                'BLEU': 55.1,
+                                'ROUGE-1': 76,
+                            },
+                            }
+
+    def test_model_card_common_properties(self):
+        modelcard = ModelCard.from_dict(self.inputs_dict)
+        self.assertTrue(hasattr(modelcard, 'model_details'))
+        self.assertTrue(hasattr(modelcard, 'intended_use'))
+        self.assertTrue(hasattr(modelcard, 'factors'))
+        self.assertTrue(hasattr(modelcard, 'metrics'))
+        self.assertTrue(hasattr(modelcard, 'evaluation_data'))
+        self.assertTrue(hasattr(modelcard, 'training_data'))
+        self.assertTrue(hasattr(modelcard, 'quantitative_analyses'))
+        self.assertTrue(hasattr(modelcard, 'ethical_considerations'))
+        self.assertTrue(hasattr(modelcard, 'caveats_and_recommendations'))
+
+    def test_model_card_to_json_string(self):
+        modelcard = ModelCard.from_dict(self.inputs_dict)
+        obj = json.loads(modelcard.to_json_string())
+        for key, value in self.inputs_dict.items():
+            self.assertEqual(obj[key], value)
+
+    def test_model_card_to_json_file(self):
+        model_card_first = ModelCard.from_dict(self.inputs_dict)
+
+        with TemporaryDirectory() as tmpdirname:
+            filename = os.path.join(tmpdirname, u"modelcard.json")
+            model_card_first.to_json_file(filename)
+            model_card_second = ModelCard.from_json_file(filename)
+
+        self.assertEqual(model_card_second.to_dict(), model_card_first.to_dict())
+
+    def test_model_card_from_and_save_pretrained(self):
+        model_card_first = ModelCard.from_dict(self.inputs_dict)
+
+        with TemporaryDirectory() as tmpdirname:
+            model_card_first.save_pretrained(tmpdirname)
+            model_card_second = ModelCard.from_pretrained(tmpdirname)
+
+        self.assertEqual(model_card_second.to_dict(), model_card_first.to_dict())
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/transformers/tests/modeling_albert_test.py b/transformers/tests/modeling_albert_test.py
index a14d66ae8f..b726fd9278 100644
--- a/transformers/tests/modeling_albert_test.py
+++ b/transformers/tests/modeling_albert_test.py
@@ -17,13 +17,12 @@ from __future__ import division
 from __future__ import print_function
 
 import unittest
-import shutil
 
 from transformers import is_torch_available
 
 from .modeling_common_test import (CommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
-from .utils import require_torch, slow, torch_device
+from .utils import CACHE_DIR, require_torch, slow, torch_device
 
 if is_torch_available():
     from transformers import (AlbertConfig, AlbertModel, AlbertForMaskedLM,
@@ -110,7 +109,7 @@ class AlbertModelTest(CommonTestCases.CommonModelTester):
                 choice_labels = ids_tensor([self.batch_size], self.num_choices)
 
             config = AlbertConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 hidden_size=self.hidden_size,
                 num_hidden_layers=self.num_hidden_layers,
                 num_attention_heads=self.num_attention_heads,
@@ -230,10 +229,8 @@ class AlbertModelTest(CommonTestCases.CommonModelTester):
 
     @slow
     def test_model_from_pretrained(self):
-        cache_dir = "/tmp/transformers_test/"
         for model_name in list(ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = AlbertModel.from_pretrained(model_name, cache_dir=cache_dir)
-            shutil.rmtree(cache_dir)
+            model = AlbertModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
             self.assertIsNotNone(model)
 
 if __name__ == "__main__":
diff --git a/transformers/tests/modeling_auto_test.py b/transformers/tests/modeling_auto_test.py
index 9b7d920bc8..871a262fe8 100644
--- a/transformers/tests/modeling_auto_test.py
+++ b/transformers/tests/modeling_auto_test.py
@@ -22,7 +22,7 @@ import logging
 
 from transformers import is_torch_available
 
-from .utils import require_torch, slow
+from .utils import require_torch, slow, SMALL_MODEL_IDENTIFIER
 
 if is_torch_available():
     from transformers import (AutoConfig, BertConfig,
@@ -92,6 +92,11 @@ class AutoModelTest(unittest.TestCase):
             self.assertIsNotNone(model)
             self.assertIsInstance(model, BertForQuestionAnswering)
 
+    def test_from_pretrained_identifier(self):
+        logging.basicConfig(level=logging.INFO)
+        model = AutoModelWithLMHead.from_pretrained(SMALL_MODEL_IDENTIFIER)
+        self.assertIsInstance(model, BertForMaskedLM)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/transformers/tests/modeling_bert_test.py b/transformers/tests/modeling_bert_test.py
index 539f66cd3f..a5adff8f68 100644
--- a/transformers/tests/modeling_bert_test.py
+++ b/transformers/tests/modeling_bert_test.py
@@ -17,13 +17,12 @@ from __future__ import division
 from __future__ import print_function
 
 import unittest
-import shutil
 
 from transformers import is_torch_available
 
 from .modeling_common_test import (CommonTestCases, ids_tensor, floats_tensor)
 from .configuration_common_test import ConfigTester
-from .utils import require_torch, slow, torch_device
+from .utils import CACHE_DIR, require_torch, slow, torch_device
 
 if is_torch_available():
     from transformers import (BertConfig, BertModel, BertForMaskedLM,
@@ -109,7 +108,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
                 choice_labels = ids_tensor([self.batch_size], self.num_choices)
 
             config = BertConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 hidden_size=self.hidden_size,
                 num_hidden_layers=self.num_hidden_layers,
                 num_attention_heads=self.num_attention_heads,
@@ -360,10 +359,8 @@ class BertModelTest(CommonTestCases.CommonModelTester):
 
     @slow
     def test_model_from_pretrained(self):
-        cache_dir = "/tmp/transformers_test/"
         for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = BertModel.from_pretrained(model_name, cache_dir=cache_dir)
-            shutil.rmtree(cache_dir)
+            model = BertModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
             self.assertIsNotNone(model)
 
 
diff --git a/transformers/tests/modeling_common_test.py b/transformers/tests/modeling_common_test.py
index 80d5d95455..2116651f4a 100644
--- a/transformers/tests/modeling_common_test.py
+++ b/transformers/tests/modeling_common_test.py
@@ -18,7 +18,7 @@ from __future__ import print_function
 
 import copy
 import sys
-import os
+import os.path
 import shutil
 import tempfile
 import json
@@ -30,7 +30,7 @@ import logging
 
 from transformers import is_torch_available
 
-from .utils import require_torch, slow, torch_device
+from .utils import CACHE_DIR, require_torch, slow, torch_device
 
 if is_torch_available():
     import torch
@@ -58,7 +58,7 @@ else:
 def _config_zero_init(config):
     configs_no_init = copy.deepcopy(config)
     for key in configs_no_init.__dict__.keys():
-        if '_range' in key or '_std' in key:
+        if '_range' in key or '_std' in key or 'initializer_factor' in key:
             setattr(configs_no_init, key, 0.0)
     return configs_no_init
 
@@ -73,6 +73,7 @@ class CommonTestCases:
         test_pruning = True
         test_resize_embeddings = True
         test_head_masking = True
+        is_encoder_decoder = False
 
         def test_save_load(self):
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
@@ -83,6 +84,8 @@ class CommonTestCases:
                 model.eval()
                 with torch.no_grad():
                     outputs = model(**inputs_dict)
+                out_2 = outputs[0].numpy()
+                out_2[np.isnan(out_2)] = 0
 
                 with TemporaryDirectory() as tmpdirname:
                     model.save_pretrained(tmpdirname)
@@ -93,9 +96,7 @@ class CommonTestCases:
 
                     # Make sure we don't have nans
                     out_1 = after_outputs[0].cpu().numpy()
-                    out_2 = outputs[0].cpu().numpy()
-                    out_1 = out_1[~np.isnan(out_1)]
-                    out_2 = out_2[~np.isnan(out_2)]
+                    out_1[np.isnan(out_1)] = 0
                     max_diff = np.amax(np.abs(out_1 - out_2))
                     self.assertLessEqual(max_diff, 1e-5)
 
@@ -117,20 +118,32 @@ class CommonTestCases:
                 model = model_class(config)
                 model.to(torch_device)
                 model.eval()
-                first, second = model(inputs_dict["input_ids"])[0], model(inputs_dict["input_ids"])[0]
-                self.assertEqual(first.ne(second).sum().item(), 0)
-
+                with torch.no_grad():
+                    first = model(**inputs_dict)[0]
+                    second = model(**inputs_dict)[0]
+                out_1 = first.cpu().numpy()
+                out_2 = second.cpu().numpy()
+                out_1 = out_1[~np.isnan(out_1)]
+                out_2 = out_2[~np.isnan(out_2)]
+                max_diff = np.amax(np.abs(out_1 - out_2))
+                self.assertLessEqual(max_diff, 1e-5)
 
         def test_attention_outputs(self):
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
+            decoder_seq_length = self.model_tester.decoder_seq_length if hasattr(self.model_tester, 'decoder_seq_length') else self.model_tester.seq_length
+            encoder_seq_length = self.model_tester.encoder_seq_length if hasattr(self.model_tester, 'encoder_seq_length') else self.model_tester.seq_length
+            decoder_key_length = self.model_tester.key_length if hasattr(self.model_tester, 'key_length') else decoder_seq_length
+            encoder_key_length = self.model_tester.key_length if hasattr(self.model_tester, 'key_length') else encoder_seq_length
+
             for model_class in self.all_model_classes:
                 config.output_attentions = True
                 config.output_hidden_states = False
                 model = model_class(config)
                 model.to(torch_device)
                 model.eval()
-                outputs = model(**inputs_dict)
+                with torch.no_grad():
+                    outputs = model(**inputs_dict)
                 attentions = outputs[-1]
                 self.assertEqual(model.config.output_attentions, True)
                 self.assertEqual(model.config.output_hidden_states, False)
@@ -138,28 +151,42 @@ class CommonTestCases:
                 self.assertListEqual(
                     list(attentions[0].shape[-3:]),
                     [self.model_tester.num_attention_heads,
-                    self.model_tester.seq_length,
-                    self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length])
+                    encoder_seq_length ,
+                    encoder_key_length])
                 out_len = len(outputs)
 
+                if self.is_encoder_decoder:
+                    self.assertEqual(out_len % 2, 0)
+                    decoder_attentions = outputs[(out_len // 2)-1]
+                    self.assertEqual(model.config.output_attentions, True)
+                    self.assertEqual(model.config.output_hidden_states, False)
+                    self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
+                    self.assertListEqual(
+                        list(decoder_attentions[0].shape[-3:]),
+                        [self.model_tester.num_attention_heads,
+                         decoder_seq_length,
+                         decoder_key_length
+                         ])
+
                 # Check attention is always last and order is fine
                 config.output_attentions = True
                 config.output_hidden_states = True
                 model = model_class(config)
                 model.to(torch_device)
                 model.eval()
-                outputs = model(**inputs_dict)
-                self.assertEqual(out_len+1, len(outputs))
+                with torch.no_grad():
+                    outputs = model(**inputs_dict)
+                self.assertEqual(out_len + (2 if self.is_encoder_decoder else 1), len(outputs))
                 self.assertEqual(model.config.output_attentions, True)
                 self.assertEqual(model.config.output_hidden_states, True)
 
-                attentions = outputs[-1]
-                self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
+                self_attentions = outputs[-1]
+                self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
                 self.assertListEqual(
-                    list(attentions[0].shape[-3:]),
+                    list(self_attentions[0].shape[-3:]),
                     [self.model_tester.num_attention_heads,
-                    self.model_tester.seq_length,
-                    self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length])
+                    encoder_seq_length,
+                    encoder_key_length])
 
         def test_torchscript(self):
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
@@ -191,21 +218,22 @@ class CommonTestCases:
                 inputs = inputs_dict['input_ids']  # Let's keep only input_ids
 
                 try:
-                    torch.jit.trace(model, inputs)
+                    traced_gpt2 = torch.jit.trace(model, inputs)
                 except RuntimeError:
                     self.fail("Couldn't trace module.")
 
-                try:
-                    traced_gpt2 = torch.jit.trace(model, inputs)
-                    torch.jit.save(traced_gpt2, "traced_model.pt")
-                except RuntimeError:
-                    self.fail("Couldn't save module.")
+                with TemporaryDirectory() as tmp_dir_name:
+                    pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt")
 
-                try:
-                    loaded_model = torch.jit.load("traced_model.pt")
-                    os.remove("traced_model.pt")
-                except ValueError:
-                    self.fail("Couldn't load module.")
+                    try:
+                        torch.jit.save(traced_gpt2, pt_file_name)
+                    except Exception:
+                        self.fail("Couldn't save module.")
+
+                    try:
+                        loaded_model = torch.jit.load(pt_file_name)
+                    except Exception:
+                        self.fail("Couldn't load module.")
 
                 model.to(torch_device)
                 model.eval()
@@ -223,7 +251,6 @@ class CommonTestCases:
 
                 self.assertTrue(models_equal)
 
-
         def test_headmasking(self):
             if not self.test_head_masking:
                 return
@@ -278,7 +305,6 @@ class CommonTestCases:
                 self.assertNotEqual(
                     attentions[-1][..., -1, :, :].flatten().sum().item(), 0.0)
 
-
         def test_head_pruning(self):
             if not self.test_pruning:
                 return
@@ -297,7 +323,8 @@ class CommonTestCases:
                 heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)),
                                 -1: [0]}
                 model.prune_heads(heads_to_prune)
-                outputs = model(**inputs_dict)
+                with torch.no_grad():
+                    outputs = model(**inputs_dict)
 
                 attentions = outputs[-1]
 
@@ -326,20 +353,19 @@ class CommonTestCases:
                 heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)),
                                 -1: [0]}
                 model.prune_heads(heads_to_prune)
-                directory = "pruned_model"
-                if not os.path.exists(directory):
-                    os.makedirs(directory)
-                model.save_pretrained(directory)
-                model = model_class.from_pretrained(directory)
-                model.to(torch_device)
 
-                outputs = model(**inputs_dict)
+                with TemporaryDirectory() as temp_dir_name:
+                    model.save_pretrained(temp_dir_name)
+                    model = model_class.from_pretrained(temp_dir_name)
+                    model.to(torch_device)
+
+                with torch.no_grad():
+                    outputs = model(**inputs_dict)
                 attentions = outputs[-1]
                 self.assertEqual(attentions[0].shape[-3], 1)
                 self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
                 self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)
 
-                shutil.rmtree(directory)
 
         def test_head_pruning_save_load_from_config_init(self):
             if not self.test_pruning:
@@ -362,7 +388,8 @@ class CommonTestCases:
                 model.to(torch_device)
                 model.eval()
 
-                outputs = model(**inputs_dict)
+                with torch.no_grad():
+                    outputs = model(**inputs_dict)
                 attentions = outputs[-1]
 
                 self.assertEqual(attentions[0].shape[-3], 1)
@@ -389,7 +416,8 @@ class CommonTestCases:
                 model.to(torch_device)
                 model.eval()
 
-                outputs = model(**inputs_dict)
+                with torch.no_grad():
+                    outputs = model(**inputs_dict)
                 attentions = outputs[-1]
 
                 self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)
@@ -397,16 +425,13 @@ class CommonTestCases:
                 self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads)
                 self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads)
 
-                directory = "pruned_model"
+                with TemporaryDirectory() as temp_dir_name:
+                    model.save_pretrained(temp_dir_name)
+                    model = model_class.from_pretrained(temp_dir_name)
+                    model.to(torch_device)
 
-                if not os.path.exists(directory):
-                    os.makedirs(directory)
-                model.save_pretrained(directory)
-                model = model_class.from_pretrained(directory)
-                model.to(torch_device)
-                shutil.rmtree(directory)
-
-                outputs = model(**inputs_dict)
+                with torch.no_grad():
+                    outputs = model(**inputs_dict)
                 attentions = outputs[-1]
 
                 self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)
@@ -417,7 +442,8 @@ class CommonTestCases:
                 heads_to_prune = {0: [0], 2: [1, 2]}
                 model.prune_heads(heads_to_prune)
 
-                outputs = model(**inputs_dict)
+                with torch.no_grad():
+                    outputs = model(**inputs_dict)
                 attentions = outputs[-1]
 
                 self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads -1)
@@ -427,7 +453,6 @@ class CommonTestCases:
 
                 self.assertDictEqual(model.config.pruned_heads, {0: [0], 1: [1, 2], 2: [1, 2]})
 
-
         def test_hidden_states_output(self):
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
@@ -437,14 +462,16 @@ class CommonTestCases:
                 model = model_class(config)
                 model.to(torch_device)
                 model.eval()
-                outputs = model(**inputs_dict)
+                with torch.no_grad():
+                    outputs = model(**inputs_dict)
                 hidden_states = outputs[-1]
                 self.assertEqual(model.config.output_attentions, False)
                 self.assertEqual(model.config.output_hidden_states, True)
                 self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1)
                 self.assertListEqual(
                     list(hidden_states[0].shape[-2:]),
-                    [self.model_tester.seq_length, self.model_tester.hidden_size])
+                    [self.model_tester.encoder_seq_length if hasattr(self.model_tester, 'encoder_seq_length') else self.model_tester.seq_length,
+                     self.model_tester.hidden_size])
 
         def test_resize_tokens_embeddings(self):
             original_config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
@@ -550,8 +577,14 @@ class CommonTestCases:
 
         def test_inputs_embeds(self):
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-            input_ids = inputs_dict["input_ids"]
-            del inputs_dict["input_ids"]
+            if not self.is_encoder_decoder:
+                input_ids = inputs_dict["input_ids"]
+                del inputs_dict["input_ids"]
+            else:
+                encoder_input_ids = inputs_dict["encoder_input_ids"]
+                decoder_input_ids = inputs_dict["decoder_input_ids"]
+                del inputs_dict["encoder_input_ids"]
+                del inputs_dict["decoder_input_ids"]
 
             for model_class in self.all_model_classes:
                 model = model_class(config)
@@ -559,9 +592,14 @@ class CommonTestCases:
                 model.eval()
 
                 wte = model.get_input_embeddings()
-                inputs_dict["inputs_embeds"] = wte(input_ids)
-                outputs = model(**inputs_dict)
+                if not self.is_encoder_decoder:
+                    inputs_dict["inputs_embeds"] = wte(input_ids)
+                else:
+                    inputs_dict["encoder_inputs_embeds"] = wte(encoder_input_ids)
+                    inputs_dict["decoder_inputs_embeds"] = wte(decoder_input_ids)
 
+                with torch.no_grad():
+                    outputs = model(**inputs_dict)
 
     class GPTModelTester(CommonModelTester):
 
@@ -633,7 +671,7 @@ class CommonTestCases:
                 mc_token_ids = ids_tensor([self.batch_size, self.n_choices], self.seq_length)
 
             config = self.config_class(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 n_positions=self.n_positions,
                 n_embd=self.hidden_size,
                 n_layer=self.num_hidden_layers,
@@ -649,9 +687,10 @@ class CommonTestCases:
             model.to(torch_device)
             model.eval()
 
-            outputs = model(input_ids, position_ids, token_type_ids)
-            outputs = model(input_ids, position_ids)
-            outputs = model(input_ids)
+            with torch.no_grad():
+                outputs = model(input_ids, position_ids, token_type_ids)
+                outputs = model(input_ids, position_ids)
+                outputs = model(input_ids)
 
             hidden_state = outputs[0]
             self.parent.assertListEqual(
@@ -664,7 +703,8 @@ class CommonTestCases:
             model = self.lm_head_model_class(config)
             model.to(torch_device)
             model.eval()
-            outputs = model(input_ids, position_ids, token_type_ids, lm_labels)
+            with torch.no_grad():
+                outputs = model(input_ids, position_ids, token_type_ids, lm_labels)
             loss, lm_logits = outputs[:2]
 
             total_voc = self.vocab_size
@@ -681,7 +721,8 @@ class CommonTestCases:
                 model = model_class(config)
                 model.to(torch_device)
                 model.eval()
-                outputs = model(input_ids)
+                with torch.no_grad():
+                    outputs = model(input_ids)
                 presents = outputs[-1]
                 self.parent.assertEqual(self.num_hidden_layers, len(presents))
                 self.parent.assertListEqual(
@@ -694,7 +735,8 @@ class CommonTestCases:
             model = self.double_head_model_class(config)
             model.to(torch_device)
             model.eval()
-            outputs = model(input_ids, mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels,
+            with torch.no_grad():
+                outputs = model(input_ids, mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels,
                             token_type_ids=token_type_ids, position_ids=position_ids)
             lm_loss, mc_loss, lm_logits, mc_logits = outputs[:4]
             loss = [lm_loss, mc_loss]
@@ -711,10 +753,8 @@ class CommonTestCases:
                 [[], []])
 
         def create_and_check_model_from_pretrained(self):
-            cache_dir = "/tmp/transformers_test/"
             for model_name in list(self.base_model_class.pretrained_model_archive_map.keys())[:1]:
-                model = self.base_model_class.from_pretrained(model_name, cache_dir=cache_dir)
-                shutil.rmtree(cache_dir)
+                model = self.base_model_class.from_pretrained(model_name, cache_dir=CACHE_DIR)
                 self.parent.assertIsNotNone(model)
 
         def prepare_config_and_inputs_for_common(self):
diff --git a/transformers/tests/modeling_ctrl_test.py b/transformers/tests/modeling_ctrl_test.py
index 8c14578a5c..ed0d62d1e6 100644
--- a/transformers/tests/modeling_ctrl_test.py
+++ b/transformers/tests/modeling_ctrl_test.py
@@ -16,7 +16,6 @@ from __future__ import division
 from __future__ import print_function
 
 import unittest
-import shutil
 import pdb
 
 from transformers import is_torch_available
@@ -27,7 +26,7 @@ if is_torch_available():
 
 from .modeling_common_test import (CommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
-from .utils import require_torch, slow, torch_device
+from .utils import CACHE_DIR, require_torch, slow, torch_device
 
 
 @require_torch
@@ -114,7 +113,7 @@ class CTRLModelTest(CommonTestCases.CommonModelTester):
                 choice_labels = ids_tensor([self.batch_size], self.num_choices)
 
             config = CTRLConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 n_embd=self.hidden_size,
                 n_layer=self.num_hidden_layers,
                 n_head=self.num_attention_heads,
@@ -205,10 +204,8 @@ class CTRLModelTest(CommonTestCases.CommonModelTester):
 
     @slow
     def test_model_from_pretrained(self):
-        cache_dir = "/tmp/transformers_test/"
         for model_name in list(CTRL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = CTRLModel.from_pretrained(model_name, cache_dir=cache_dir)
-            shutil.rmtree(cache_dir)
+            model = CTRLModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
             self.assertIsNotNone(model)
 
 
diff --git a/transformers/tests/modeling_distilbert_test.py b/transformers/tests/modeling_distilbert_test.py
index 4b8f64327d..ac6f5d248e 100644
--- a/transformers/tests/modeling_distilbert_test.py
+++ b/transformers/tests/modeling_distilbert_test.py
@@ -27,7 +27,7 @@ if is_torch_available():
 
 from .modeling_common_test import (CommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
-from .utils import require_torch, slow, torch_device
+from .utils import CACHE_DIR, require_torch, slow, torch_device
 
 
 @require_torch
@@ -105,7 +105,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
                 choice_labels = ids_tensor([self.batch_size], self.num_choices)
 
             config = DistilBertConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 dim=self.hidden_size,
                 n_layers=self.num_hidden_layers,
                 n_heads=self.num_attention_heads,
@@ -235,10 +235,8 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
 
     # @slow
     # def test_model_from_pretrained(self):
-    #     cache_dir = "/tmp/transformers_test/"
     #     for model_name in list(DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-    #         model = DistilBertModel.from_pretrained(model_name, cache_dir=cache_dir)
-    #         shutil.rmtree(cache_dir)
+    #         model = DistilBertModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
     #         self.assertIsNotNone(model)
 
 if __name__ == "__main__":
diff --git a/transformers/tests/modeling_gpt2_test.py b/transformers/tests/modeling_gpt2_test.py
index ecaa2a4bd0..ad2ec1fd91 100644
--- a/transformers/tests/modeling_gpt2_test.py
+++ b/transformers/tests/modeling_gpt2_test.py
@@ -17,7 +17,6 @@ from __future__ import division
 from __future__ import print_function
 
 import unittest
-import shutil
 
 from transformers import is_torch_available
 
@@ -27,7 +26,7 @@ if is_torch_available():
 
 from .modeling_common_test import (CommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
-from .utils import require_torch, slow, torch_device
+from .utils import CACHE_DIR, require_torch, slow, torch_device
 
 
 @require_torch
@@ -110,7 +109,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
                 choice_labels = ids_tensor([self.batch_size], self.num_choices)
 
             config = GPT2Config(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 n_embd=self.hidden_size,
                 n_layer=self.num_hidden_layers,
                 n_head=self.num_attention_heads,
@@ -239,10 +238,8 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
 
     @slow
     def test_model_from_pretrained(self):
-        cache_dir = "/tmp/transformers_test/"
         for model_name in list(GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = GPT2Model.from_pretrained(model_name, cache_dir=cache_dir)
-            shutil.rmtree(cache_dir)
+            model = GPT2Model.from_pretrained(model_name, cache_dir=CACHE_DIR)
             self.assertIsNotNone(model)
 
 
diff --git a/transformers/tests/modeling_openai_test.py b/transformers/tests/modeling_openai_test.py
index 8e4d13438d..1880febcae 100644
--- a/transformers/tests/modeling_openai_test.py
+++ b/transformers/tests/modeling_openai_test.py
@@ -17,7 +17,6 @@ from __future__ import division
 from __future__ import print_function
 
 import unittest
-import shutil
 
 from transformers import is_torch_available
 
@@ -27,7 +26,7 @@ if is_torch_available():
 
 from .modeling_common_test import (CommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
-from .utils import require_torch, slow, torch_device
+from .utils import CACHE_DIR, require_torch, slow, torch_device
 
 
 @require_torch
@@ -98,7 +97,7 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
                 choice_labels = ids_tensor([self.batch_size], self.num_choices)
 
             config = OpenAIGPTConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 n_embd=self.hidden_size,
                 n_layer=self.num_hidden_layers,
                 n_head=self.num_attention_heads,
@@ -207,10 +206,8 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
 
     @slow
     def test_model_from_pretrained(self):
-        cache_dir = "/tmp/transformers_test/"
         for model_name in list(OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = OpenAIGPTModel.from_pretrained(model_name, cache_dir=cache_dir)
-            shutil.rmtree(cache_dir)
+            model = OpenAIGPTModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
             self.assertIsNotNone(model)
 
 
diff --git a/transformers/tests/modeling_roberta_test.py b/transformers/tests/modeling_roberta_test.py
index 7a3553b164..732e589cdf 100644
--- a/transformers/tests/modeling_roberta_test.py
+++ b/transformers/tests/modeling_roberta_test.py
@@ -17,7 +17,6 @@ from __future__ import division
 from __future__ import print_function
 
 import unittest
-import shutil
 
 from transformers import is_torch_available
 
@@ -25,11 +24,12 @@ if is_torch_available():
     import torch
     from transformers import (RobertaConfig, RobertaModel, RobertaForMaskedLM,
                               RobertaForSequenceClassification, RobertaForTokenClassification)
+    from transformers.modeling_roberta import RobertaEmbeddings
     from transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
 
 from .modeling_common_test import (CommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
-from .utils import require_torch, slow, torch_device
+from .utils import CACHE_DIR, require_torch, slow, torch_device
 
 
 @require_torch
@@ -106,7 +106,7 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
                 choice_labels = ids_tensor([self.batch_size], self.num_choices)
 
             config = RobertaConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 hidden_size=self.hidden_size,
                 num_hidden_layers=self.num_hidden_layers,
                 num_attention_heads=self.num_attention_heads,
@@ -199,12 +199,61 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
 
     @slow
     def test_model_from_pretrained(self):
-        cache_dir = "/tmp/transformers_test/"
         for model_name in list(ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = RobertaModel.from_pretrained(model_name, cache_dir=cache_dir)
-            shutil.rmtree(cache_dir)
+            model = RobertaModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
             self.assertIsNotNone(model)
 
+    def test_create_position_ids_respects_padding_index(self):
+        """ Ensure that the default position ids only assign a sequential . This is a regression
+        test for https://github.com/huggingface/transformers/issues/1761
+
+        The position ids should be masked with the embedding object's padding index. Therefore, the
+        first available non-padding position index is RobertaEmbeddings.padding_idx + 1
+        """
+        config = self.model_tester.prepare_config_and_inputs()[0]
+        model = RobertaEmbeddings(config=config)
+
+        input_ids = torch.as_tensor([[12, 31, 13, model.padding_idx]])
+        expected_positions = torch.as_tensor([[
+            0 + model.padding_idx + 1,
+            1 + model.padding_idx + 1,
+            2 + model.padding_idx + 1,
+            model.padding_idx
+        ]])
+
+        position_ids = model.create_position_ids_from_input_ids(input_ids)
+        self.assertEqual(
+            position_ids.shape,
+            expected_positions.shape
+        )
+        self.assertTrue(torch.all(torch.eq(position_ids, expected_positions)))
+
+    def test_create_position_ids_from_inputs_embeds(self):
+        """ Ensure that the default position ids only assign a sequential . This is a regression
+        test for https://github.com/huggingface/transformers/issues/1761
+
+        The position ids should be masked with the embedding object's padding index. Therefore, the
+        first available non-padding position index is RobertaEmbeddings.padding_idx + 1
+        """
+        config = self.model_tester.prepare_config_and_inputs()[0]
+        embeddings = RobertaEmbeddings(config=config)
+
+        inputs_embeds = torch.Tensor(2, 4, 30)
+        expected_single_positions = [
+            0 + embeddings.padding_idx + 1,
+            1 + embeddings.padding_idx + 1,
+            2 + embeddings.padding_idx + 1,
+            3 + embeddings.padding_idx + 1,
+        ]
+        expected_positions = torch.as_tensor([expected_single_positions, expected_single_positions])
+        position_ids = embeddings.create_position_ids_from_inputs_embeds(inputs_embeds)
+        self.assertEqual(
+            position_ids.shape,
+            expected_positions.shape
+        )
+        self.assertTrue(
+            torch.all(torch.eq(position_ids, expected_positions))
+        )
 
 
 class RobertaModelIntegrationTest(unittest.TestCase):
diff --git a/transformers/tests/modeling_t5_test.py b/transformers/tests/modeling_t5_test.py
new file mode 100644
index 0000000000..9fd9a4b304
--- /dev/null
+++ b/transformers/tests/modeling_t5_test.py
@@ -0,0 +1,182 @@
+# coding=utf-8
+# Copyright 2018 Google T5 Authors and HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import unittest
+
+from transformers import is_torch_available
+
+from .modeling_common_test import (CommonTestCases, ids_tensor, floats_tensor)
+from .configuration_common_test import ConfigTester
+from .utils import CACHE_DIR, require_torch, slow, torch_device
+
+if is_torch_available():
+    from transformers import (T5Config, T5Model, T5WithLMHeadModel)
+    from transformers.modeling_t5 import T5_PRETRAINED_MODEL_ARCHIVE_MAP
+
+
+@require_torch
+class T5ModelTest(CommonTestCases.CommonModelTester):
+
+    all_model_classes = (T5Model, T5WithLMHeadModel) if is_torch_available() else ()
+    test_pruning = False
+    test_torchscript = False
+    test_resize_embeddings = False
+    is_encoder_decoder = True
+
+    class T5ModelTester(object):
+
+        def __init__(self,
+                     parent,
+                     batch_size=13,
+                     encoder_seq_length=7,
+                     decoder_seq_length=9,
+                     is_training=True,
+                     use_attention_mask=True,
+                     use_labels=True,
+                     vocab_size=99,
+                     n_positions=14,
+                     hidden_size=32,
+                     num_hidden_layers=5,
+                     num_attention_heads=4,
+                     d_ff=37,
+                     relative_attention_num_buckets=8,
+                     dropout_rate=0.1,
+                     initializer_factor=0.002,
+                     scope=None,
+                    ):
+            self.parent = parent
+            self.batch_size = batch_size
+            self.encoder_seq_length = encoder_seq_length
+            self.decoder_seq_length = decoder_seq_length
+            self.is_training = is_training
+            self.use_attention_mask = use_attention_mask
+            self.use_labels = use_labels
+            self.vocab_size = vocab_size
+            self.n_positions = n_positions
+            self.hidden_size = hidden_size
+            self.num_hidden_layers = num_hidden_layers
+            self.num_attention_heads = num_attention_heads
+            self.d_ff = d_ff
+            self.relative_attention_num_buckets = relative_attention_num_buckets
+            self.dropout_rate = dropout_rate
+            self.initializer_factor = initializer_factor
+            self.scope = scope
+
+        def prepare_config_and_inputs(self):
+            encoder_input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size)
+            decoder_input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)
+
+            encoder_attention_mask = None
+            decoder_attention_mask = None
+            if self.use_attention_mask:
+                encoder_attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2)
+                decoder_attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2)
+
+            decoder_lm_labels = None
+            if self.use_labels:
+                decoder_lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)
+
+            config = T5Config(
+                vocab_size=self.vocab_size,
+                n_positions=self.n_positions,
+                d_model=self.hidden_size,
+                d_ff=self.d_ff,
+                d_kv=self.hidden_size // self.num_attention_heads,
+                num_layers=self.num_hidden_layers,
+                num_heads=self.num_attention_heads,
+                relative_attention_num_buckets=self.relative_attention_num_buckets,
+                dropout_rate=self.dropout_rate,
+                initializer_factor=self.initializer_factor)
+
+            return (config, encoder_input_ids, decoder_input_ids, encoder_attention_mask, decoder_attention_mask, decoder_lm_labels)
+
+        def check_loss_output(self, result):
+            self.parent.assertListEqual(
+                list(result["loss"].size()),
+                [])
+
+        def create_and_check_t5_model(self, config, encoder_input_ids, decoder_input_ids, encoder_attention_mask, decoder_attention_mask, decoder_lm_labels):
+            model = T5Model(config=config)
+            model.eval()
+            decoder_output, encoder_output = model(encoder_input_ids=encoder_input_ids,
+                                                   decoder_input_ids=decoder_input_ids,
+                                                   encoder_attention_mask=encoder_attention_mask,
+                                                   decoder_attention_mask=decoder_attention_mask)
+            decoder_output, encoder_output = model(encoder_input_ids=encoder_input_ids,
+                                                   decoder_input_ids=decoder_input_ids)
+
+            result = {
+                "encoder_output": encoder_output,
+                "decoder_output": decoder_output,
+            }
+            self.parent.assertListEqual(
+                list(result["encoder_output"].size()),
+                [self.batch_size, self.encoder_seq_length, self.hidden_size])
+            self.parent.assertListEqual(
+                list(result["decoder_output"].size()),
+                [self.batch_size, self.decoder_seq_length, self.hidden_size])
+
+
+        def create_and_check_t5_with_lm_head(self, config, encoder_input_ids, decoder_input_ids, encoder_attention_mask, decoder_attention_mask, decoder_lm_labels):
+            model = T5WithLMHeadModel(config=config)
+            model.eval()
+            outputs = model(encoder_input_ids=encoder_input_ids, decoder_input_ids=decoder_input_ids,
+                            decoder_attention_mask=decoder_attention_mask, decoder_lm_labels=decoder_lm_labels)
+            loss, prediction_scores = outputs[0], outputs[1]
+            result = {
+                "loss": loss,
+                "prediction_scores": prediction_scores,
+            }
+            self.parent.assertListEqual(
+                list(result["prediction_scores"].size()),
+                [self.batch_size, self.decoder_seq_length, self.vocab_size])
+            self.check_loss_output(result)
+
+        def prepare_config_and_inputs_for_common(self):
+            config_and_inputs = self.prepare_config_and_inputs()
+            (config, encoder_input_ids, decoder_input_ids, encoder_attention_mask,
+             decoder_attention_mask, decoder_lm_labels) = config_and_inputs
+            inputs_dict = {'encoder_input_ids': encoder_input_ids,
+                           'decoder_input_ids': decoder_input_ids,
+                           'decoder_attention_mask': decoder_attention_mask,
+                           'encoder_attention_mask': encoder_attention_mask}
+            return config, inputs_dict
+
+    def setUp(self):
+        self.model_tester = T5ModelTest.T5ModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=T5Config, d_model=37)
+
+    def test_config(self):
+        self.config_tester.run_common_tests()
+
+    def test_t5_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_t5_model(*config_and_inputs)
+
+    def test_with_lm_head(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_t5_with_lm_head(*config_and_inputs)
+
+    @slow
+    def test_model_from_pretrained(self):
+        for model_name in list(T5_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+            model = T5Model.from_pretrained(model_name, cache_dir=CACHE_DIR)
+            self.assertIsNotNone(model)
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/transformers/tests/modeling_tf_albert_test.py b/transformers/tests/modeling_tf_albert_test.py
index 7d3325b70b..374417cfe2 100644
--- a/transformers/tests/modeling_tf_albert_test.py
+++ b/transformers/tests/modeling_tf_albert_test.py
@@ -17,12 +17,11 @@ from __future__ import division
 from __future__ import print_function
 
 import unittest
-import shutil
 import sys
 
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
-from .utils import require_tf, slow
+from .utils import CACHE_DIR, require_tf, slow
 
 from transformers import AlbertConfig, is_tf_available
 
@@ -118,7 +117,7 @@ class TFAlbertModelTest(TFCommonTestCases.TFCommonModelTester):
                 choice_labels = ids_tensor([self.batch_size], self.num_choices)
 
             config = AlbertConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 hidden_size=self.hidden_size,
                 num_hidden_layers=self.num_hidden_layers,
                 num_attention_heads=self.num_attention_heads,
@@ -217,12 +216,8 @@ class TFAlbertModelTest(TFCommonTestCases.TFCommonModelTester):
 
     @slow
     def test_model_from_pretrained(self):
-        cache_dir = "/tmp/transformers_test/"
-        # for model_name in list(TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-        for model_name in ['albert-base-uncased']:
-            model = TFAlbertModel.from_pretrained(
-                model_name, cache_dir=cache_dir)
-            shutil.rmtree(cache_dir)
+        for model_name in list(TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+            model = TFAlbertModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
             self.assertIsNotNone(model)
 
 
diff --git a/transformers/tests/modeling_tf_auto_test.py b/transformers/tests/modeling_tf_auto_test.py
index 7ea48015d9..2ad39ddccf 100644
--- a/transformers/tests/modeling_tf_auto_test.py
+++ b/transformers/tests/modeling_tf_auto_test.py
@@ -22,7 +22,7 @@ import logging
 
 from transformers import is_tf_available
 
-from .utils import require_tf, slow
+from .utils import require_tf, slow, SMALL_MODEL_IDENTIFIER
 
 if is_tf_available():
     from transformers import (AutoConfig, BertConfig,
@@ -46,11 +46,11 @@ class TFAutoModelTest(unittest.TestCase):
         logging.basicConfig(level=logging.INFO)
         # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
         for model_name in ['bert-base-uncased']:
-            config = AutoConfig.from_pretrained(model_name, force_download=True)
+            config = AutoConfig.from_pretrained(model_name)
             self.assertIsNotNone(config)
             self.assertIsInstance(config, BertConfig)
 
-            model = TFAutoModel.from_pretrained(model_name, force_download=True)
+            model = TFAutoModel.from_pretrained(model_name)
             self.assertIsNotNone(model)
             self.assertIsInstance(model, TFBertModel)
 
@@ -59,11 +59,11 @@ class TFAutoModelTest(unittest.TestCase):
         logging.basicConfig(level=logging.INFO)
         # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
         for model_name in ['bert-base-uncased']:
-            config = AutoConfig.from_pretrained(model_name, force_download=True)
+            config = AutoConfig.from_pretrained(model_name)
             self.assertIsNotNone(config)
             self.assertIsInstance(config, BertConfig)
 
-            model = TFAutoModelWithLMHead.from_pretrained(model_name, force_download=True)
+            model = TFAutoModelWithLMHead.from_pretrained(model_name)
             self.assertIsNotNone(model)
             self.assertIsInstance(model, TFBertForMaskedLM)
 
@@ -72,11 +72,11 @@ class TFAutoModelTest(unittest.TestCase):
         logging.basicConfig(level=logging.INFO)
         # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
         for model_name in ['bert-base-uncased']:
-            config = AutoConfig.from_pretrained(model_name, force_download=True)
+            config = AutoConfig.from_pretrained(model_name)
             self.assertIsNotNone(config)
             self.assertIsInstance(config, BertConfig)
 
-            model = TFAutoModelForSequenceClassification.from_pretrained(model_name, force_download=True)
+            model = TFAutoModelForSequenceClassification.from_pretrained(model_name)
             self.assertIsNotNone(model)
             self.assertIsInstance(model, TFBertForSequenceClassification)
 
@@ -85,14 +85,19 @@ class TFAutoModelTest(unittest.TestCase):
         logging.basicConfig(level=logging.INFO)
         # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
         for model_name in ['bert-base-uncased']:
-            config = AutoConfig.from_pretrained(model_name, force_download=True)
+            config = AutoConfig.from_pretrained(model_name)
             self.assertIsNotNone(config)
             self.assertIsInstance(config, BertConfig)
 
-            model = TFAutoModelForQuestionAnswering.from_pretrained(model_name, force_download=True)
+            model = TFAutoModelForQuestionAnswering.from_pretrained(model_name)
             self.assertIsNotNone(model)
             self.assertIsInstance(model, TFBertForQuestionAnswering)
 
+    def test_from_pretrained_identifier(self):
+        logging.basicConfig(level=logging.INFO)
+        model = TFAutoModelWithLMHead.from_pretrained(SMALL_MODEL_IDENTIFIER)
+        self.assertIsInstance(model, TFBertForMaskedLM)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/transformers/tests/modeling_tf_bert_test.py b/transformers/tests/modeling_tf_bert_test.py
index d7a86fecb9..abf20b1514 100644
--- a/transformers/tests/modeling_tf_bert_test.py
+++ b/transformers/tests/modeling_tf_bert_test.py
@@ -17,12 +17,11 @@ from __future__ import division
 from __future__ import print_function
 
 import unittest
-import shutil
 import sys
 
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
-from .utils import require_tf, slow
+from .utils import CACHE_DIR, require_tf, slow
 
 from transformers import BertConfig, is_tf_available
 
@@ -114,7 +113,7 @@ class TFBertModelTest(TFCommonTestCases.TFCommonModelTester):
                 choice_labels = ids_tensor([self.batch_size], self.num_choices)
 
             config = BertConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 hidden_size=self.hidden_size,
                 num_hidden_layers=self.num_hidden_layers,
                 num_attention_heads=self.num_attention_heads,
@@ -310,11 +309,9 @@ class TFBertModelTest(TFCommonTestCases.TFCommonModelTester):
 
     @slow
     def test_model_from_pretrained(self):
-        cache_dir = "/tmp/transformers_test/"
         # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
         for model_name in ['bert-base-uncased']:
-            model = TFBertModel.from_pretrained(model_name, cache_dir=cache_dir)
-            shutil.rmtree(cache_dir)
+            model = TFBertModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
             self.assertIsNotNone(model)
 
 if __name__ == "__main__":
diff --git a/transformers/tests/modeling_tf_common_test.py b/transformers/tests/modeling_tf_common_test.py
index 439360ba35..5a5873e81b 100644
--- a/transformers/tests/modeling_tf_common_test.py
+++ b/transformers/tests/modeling_tf_common_test.py
@@ -69,6 +69,7 @@ class TFCommonTestCases:
         test_torchscript = True
         test_pruning = True
         test_resize_embeddings = True
+        is_encoder_decoder = False
 
         def test_initialization(self):
             pass
@@ -129,8 +130,12 @@ class TFCommonTestCases:
                                       for name, key in inputs_dict.items())
                 with torch.no_grad():
                     pto = pt_model(**pt_inputs_dict)
-                tfo = tf_model(inputs_dict)
-                max_diff = np.amax(np.abs(tfo[0].numpy() - pto[0].numpy()))
+                tfo = tf_model(inputs_dict, training=False)
+                tf_hidden_states = tfo[0].numpy()
+                pt_hidden_states = pto[0].numpy()
+                tf_hidden_states[np.isnan(tf_hidden_states)] = 0
+                pt_hidden_states[np.isnan(pt_hidden_states)] = 0
+                max_diff = np.amax(np.abs(tf_hidden_states - pt_hidden_states))
                 self.assertLessEqual(max_diff, 2e-2)
 
                 # Check we can load pt model in tf and vice-versa with checkpoint => model functions
@@ -150,13 +155,21 @@ class TFCommonTestCases:
                 with torch.no_grad():
                     pto = pt_model(**pt_inputs_dict)
                 tfo = tf_model(inputs_dict)
-                max_diff = np.amax(np.abs(tfo[0].numpy() - pto[0].numpy()))
+                tfo = tfo[0].numpy()
+                pto = pto[0].numpy()
+                tfo[np.isnan(tfo)] = 0
+                pto[np.isnan(pto)] = 0
+                max_diff = np.amax(np.abs(tfo - pto))
                 self.assertLessEqual(max_diff, 2e-2)
 
         def test_compile_tf_model(self):
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
-            input_ids = tf.keras.Input(batch_shape=(2, 2000), name='input_ids', dtype='int32')
+            if self.is_encoder_decoder:
+                input_ids = {'decoder_input_ids': tf.keras.Input(batch_shape=(2, 2000), name='decoder_input_ids', dtype='int32'),
+                             'encoder_input_ids': tf.keras.Input(batch_shape=(2, 2000), name='encoder_input_ids', dtype='int32')}
+            else:
+                input_ids = tf.keras.Input(batch_shape=(2, 2000), name='input_ids', dtype='int32')
             optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
             loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
             metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
@@ -189,7 +202,7 @@ class TFCommonTestCases:
                 outputs_dict = model(inputs_dict)
 
                 inputs_keywords = copy.deepcopy(inputs_dict)
-                input_ids = inputs_keywords.pop('input_ids')
+                input_ids = inputs_keywords.pop('input_ids' if not self.is_encoder_decoder else 'decoder_input_ids', None)
                 outputs_keywords = model(input_ids, **inputs_keywords)
 
                 output_dict = outputs_dict[0].numpy()
@@ -200,6 +213,11 @@ class TFCommonTestCases:
         def test_attention_outputs(self):
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
+            decoder_seq_length = self.model_tester.decoder_seq_length if hasattr(self.model_tester, 'decoder_seq_length') else self.model_tester.seq_length
+            encoder_seq_length = self.model_tester.encoder_seq_length if hasattr(self.model_tester, 'encoder_seq_length') else self.model_tester.seq_length
+            decoder_key_length = self.model_tester.key_length if hasattr(self.model_tester, 'key_length') else decoder_seq_length
+            encoder_key_length = self.model_tester.key_length if hasattr(self.model_tester, 'key_length') else encoder_seq_length
+
             for model_class in self.all_model_classes:
                 config.output_attentions = True
                 config.output_hidden_states = False
@@ -212,16 +230,28 @@ class TFCommonTestCases:
                 self.assertListEqual(
                     list(attentions[0].shape[-3:]),
                     [self.model_tester.num_attention_heads,
-                    self.model_tester.seq_length,
-                    self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length])
+                    encoder_seq_length,
+                    encoder_key_length])
                 out_len = len(outputs)
 
+                if self.is_encoder_decoder:
+                    self.assertEqual(out_len % 2, 0)
+                    decoder_attentions = outputs[(out_len // 2)-1]
+                    self.assertEqual(model.config.output_attentions, True)
+                    self.assertEqual(model.config.output_hidden_states, False)
+                    self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
+                    self.assertListEqual(
+                        list(decoder_attentions[0].shape[-3:]),
+                        [self.model_tester.num_attention_heads,
+                         decoder_seq_length,
+                         decoder_key_length])
+
                 # Check attention is always last and order is fine
                 config.output_attentions = True
                 config.output_hidden_states = True
                 model = model_class(config)
                 outputs = model(inputs_dict)
-                self.assertEqual(out_len+1, len(outputs))
+                self.assertEqual(out_len + (2 if self.is_encoder_decoder else 1), len(outputs))
                 self.assertEqual(model.config.output_attentions, True)
                 self.assertEqual(model.config.output_hidden_states, True)
 
@@ -230,8 +260,8 @@ class TFCommonTestCases:
                 self.assertListEqual(
                     list(attentions[0].shape[-3:]),
                     [self.model_tester.num_attention_heads,
-                    self.model_tester.seq_length,
-                    self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length])
+                    encoder_seq_length,
+                    encoder_key_length])
 
         def test_hidden_states_output(self):
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
@@ -264,35 +294,53 @@ class TFCommonTestCases:
             for model_class in self.all_model_classes:
                 model = model_class(config)
                 first, second = model(inputs_dict, training=False)[0], model(inputs_dict, training=False)[0]
-                self.assertTrue(tf.math.equal(first, second).numpy().all())
+                out_1 = first.numpy()
+                out_2 = second.numpy()
+                out_1 = out_1[~np.isnan(out_1)]
+                out_2 = out_2[~np.isnan(out_2)]
+                max_diff = np.amax(np.abs(out_1 - out_2))
+                self.assertLessEqual(max_diff, 1e-5)
+
+        def _get_embeds(self, wte, input_ids):
+            # ^^ In our TF models, the input_embeddings can take slightly different forms,
+            # so we try a few of them.
+            # We used to fall back to just synthetically creating a dummy tensor of ones:
+            try:
+                x = wte(input_ids, mode="embedding")
+            except:
+                try:
+                    x = wte([input_ids], mode="embedding")
+                except:
+                    try:
+                        x = wte([input_ids, None, None, None], mode="embedding")
+                    except:
+                        if hasattr(self.model_tester, "embedding_size"):
+                            x = tf.ones(input_ids.shape + [self.model_tester.embedding_size], dtype=tf.dtypes.float32)
+                        else:
+                            x = tf.ones(input_ids.shape + [self.model_tester.hidden_size], dtype=tf.dtypes.float32)
+            return x
 
         def test_inputs_embeds(self):
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-            input_ids = inputs_dict["input_ids"]
-            del inputs_dict["input_ids"]
+            if not self.is_encoder_decoder:
+                input_ids = inputs_dict["input_ids"]
+                del inputs_dict["input_ids"]
+            else:
+                encoder_input_ids = inputs_dict["encoder_input_ids"]
+                decoder_input_ids = inputs_dict["decoder_input_ids"]
+                del inputs_dict["encoder_input_ids"]
+                del inputs_dict["decoder_input_ids"]
 
             for model_class in self.all_model_classes:
                 model = model_class(config)
 
                 wte = model.get_input_embeddings()
-                try:
-                    x = wte(input_ids, mode="embedding")
-                except:
-                    try:
-                        x = wte([input_ids], mode="embedding")
-                    except:
-                        try:
-                            x = wte([input_ids, None, None, None], mode="embedding")
-                        except:
-                            if hasattr(self.model_tester, "embedding_size"):
-                                x = tf.ones(input_ids.shape + [self.model_tester.embedding_size], dtype=tf.dtypes.float32)
-                            else:
-                                x = tf.ones(input_ids.shape + [self.model_tester.hidden_size], dtype=tf.dtypes.float32)
-                # ^^ In our TF models, the input_embeddings can take slightly different forms,
-                # so we try a few of them.
-                # We used to fall back to just synthetically creating a dummy tensor of ones:
-                #
-                inputs_dict["inputs_embeds"] = x
+                if not self.is_encoder_decoder:
+                    inputs_dict["inputs_embeds"] = self._get_embeds(wte, input_ids)
+                else:
+                    inputs_dict["encoder_inputs_embeds"] = self._get_embeds(wte, encoder_input_ids)
+                    inputs_dict["decoder_inputs_embeds"] = self._get_embeds(wte, decoder_input_ids)
+
                 outputs = model(inputs_dict)
 
 
diff --git a/transformers/tests/modeling_tf_ctrl_test.py b/transformers/tests/modeling_tf_ctrl_test.py
index 0b421c20c9..93b231e517 100644
--- a/transformers/tests/modeling_tf_ctrl_test.py
+++ b/transformers/tests/modeling_tf_ctrl_test.py
@@ -17,12 +17,11 @@ from __future__ import division
 from __future__ import print_function
 
 import unittest
-import shutil
 import sys
 
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
-from .utils import require_tf, slow
+from .utils import CACHE_DIR, require_tf, slow
 
 from transformers import CTRLConfig, is_tf_available
 
@@ -112,7 +111,7 @@ class TFCTRLModelTest(TFCommonTestCases.TFCommonModelTester):
                 choice_labels = ids_tensor([self.batch_size], self.num_choices)
 
             config = CTRLConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 n_embd=self.hidden_size,
                 n_layer=self.num_hidden_layers,
                 n_head=self.num_attention_heads,
@@ -189,10 +188,8 @@ class TFCTRLModelTest(TFCommonTestCases.TFCommonModelTester):
 
     @slow
     def test_model_from_pretrained(self):
-        cache_dir = "/tmp/transformers_test/"
         for model_name in list(TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = TFCTRLModel.from_pretrained(model_name, cache_dir=cache_dir)
-            shutil.rmtree(cache_dir)
+            model = TFCTRLModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
             self.assertIsNotNone(model)
 
 if __name__ == "__main__":
diff --git a/transformers/tests/modeling_tf_distilbert_test.py b/transformers/tests/modeling_tf_distilbert_test.py
index 0ec45150ca..f28b5c397b 100644
--- a/transformers/tests/modeling_tf_distilbert_test.py
+++ b/transformers/tests/modeling_tf_distilbert_test.py
@@ -20,7 +20,7 @@ import unittest
 
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
-from .utils import require_tf, slow
+from .utils import CACHE_DIR, require_tf, slow
 
 from transformers import DistilBertConfig, is_tf_available
 
@@ -107,7 +107,7 @@ class TFDistilBertModelTest(TFCommonTestCases.TFCommonModelTester):
                 choice_labels = ids_tensor([self.batch_size], self.num_choices)
 
             config = DistilBertConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 dim=self.hidden_size,
                 n_layers=self.num_hidden_layers,
                 n_heads=self.num_attention_heads,
@@ -211,10 +211,8 @@ class TFDistilBertModelTest(TFCommonTestCases.TFCommonModelTester):
 
     # @slow
     # def test_model_from_pretrained(self):
-    #     cache_dir = "/tmp/transformers_test/"
     #     for model_name in list(DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-    #         model = DistilBertModel.from_pretrained(model_name, cache_dir=cache_dir)
-    #         shutil.rmtree(cache_dir)
+    #         model = DistilBertModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
     #         self.assertIsNotNone(model)
 
 if __name__ == "__main__":
diff --git a/transformers/tests/modeling_tf_gpt2_test.py b/transformers/tests/modeling_tf_gpt2_test.py
index e070b72e65..90920342ba 100644
--- a/transformers/tests/modeling_tf_gpt2_test.py
+++ b/transformers/tests/modeling_tf_gpt2_test.py
@@ -17,12 +17,11 @@ from __future__ import division
 from __future__ import print_function
 
 import unittest
-import shutil
 import sys
 
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
-from .utils import require_tf, slow
+from .utils import CACHE_DIR, require_tf, slow
 
 from transformers import GPT2Config, is_tf_available
 
@@ -115,7 +114,7 @@ class TFGPT2ModelTest(TFCommonTestCases.TFCommonModelTester):
                 choice_labels = ids_tensor([self.batch_size], self.num_choices)
 
             config = GPT2Config(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 n_embd=self.hidden_size,
                 n_layer=self.num_hidden_layers,
                 n_head=self.num_attention_heads,
@@ -220,10 +219,8 @@ class TFGPT2ModelTest(TFCommonTestCases.TFCommonModelTester):
 
     @slow
     def test_model_from_pretrained(self):
-        cache_dir = "/tmp/transformers_test/"
         for model_name in list(TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = TFGPT2Model.from_pretrained(model_name, cache_dir=cache_dir)
-            shutil.rmtree(cache_dir)
+            model = TFGPT2Model.from_pretrained(model_name, cache_dir=CACHE_DIR)
             self.assertIsNotNone(model)
 
 if __name__ == "__main__":
diff --git a/transformers/tests/modeling_tf_openai_gpt_test.py b/transformers/tests/modeling_tf_openai_gpt_test.py
index 675e806c12..065bf2acde 100644
--- a/transformers/tests/modeling_tf_openai_gpt_test.py
+++ b/transformers/tests/modeling_tf_openai_gpt_test.py
@@ -17,12 +17,11 @@ from __future__ import division
 from __future__ import print_function
 
 import unittest
-import shutil
 import sys
 
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
-from .utils import require_tf, slow
+from .utils import CACHE_DIR, require_tf, slow
 
 from transformers import OpenAIGPTConfig, is_tf_available
 
@@ -114,7 +113,7 @@ class TFOpenAIGPTModelTest(TFCommonTestCases.TFCommonModelTester):
                 choice_labels = ids_tensor([self.batch_size], self.num_choices)
 
             config = OpenAIGPTConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 n_embd=self.hidden_size,
                 n_layer=self.num_hidden_layers,
                 n_head=self.num_attention_heads,
@@ -219,10 +218,8 @@ class TFOpenAIGPTModelTest(TFCommonTestCases.TFCommonModelTester):
 
     @slow
     def test_model_from_pretrained(self):
-        cache_dir = "/tmp/transformers_test/"
         for model_name in list(TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = TFOpenAIGPTModel.from_pretrained(model_name, cache_dir=cache_dir)
-            shutil.rmtree(cache_dir)
+            model = TFOpenAIGPTModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
             self.assertIsNotNone(model)
 
 if __name__ == "__main__":
diff --git a/transformers/tests/modeling_tf_roberta_test.py b/transformers/tests/modeling_tf_roberta_test.py
index 42440bf1b7..93c478ae28 100644
--- a/transformers/tests/modeling_tf_roberta_test.py
+++ b/transformers/tests/modeling_tf_roberta_test.py
@@ -17,11 +17,10 @@ from __future__ import division
 from __future__ import print_function
 
 import unittest
-import shutil
 
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
-from .utils import require_tf, slow
+from .utils import CACHE_DIR, require_tf, slow
 
 from transformers import RobertaConfig, is_tf_available
 
@@ -109,7 +108,7 @@ class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester):
                 choice_labels = ids_tensor([self.batch_size], self.num_choices)
 
             config = RobertaConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 hidden_size=self.hidden_size,
                 num_hidden_layers=self.num_hidden_layers,
                 num_attention_heads=self.num_attention_heads,
@@ -192,10 +191,8 @@ class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester):
 
     @slow
     def test_model_from_pretrained(self):
-        cache_dir = "/tmp/transformers_test/"
         for model_name in list(TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = TFRobertaModel.from_pretrained(model_name, cache_dir=cache_dir)
-            shutil.rmtree(cache_dir)
+            model = TFRobertaModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
             self.assertIsNotNone(model)
 
 
diff --git a/transformers/tests/modeling_tf_t5_test.py b/transformers/tests/modeling_tf_t5_test.py
new file mode 100644
index 0000000000..da9ce6f89d
--- /dev/null
+++ b/transformers/tests/modeling_tf_t5_test.py
@@ -0,0 +1,169 @@
+# coding=utf-8
+# Copyright 2018 Google T5 Authors and HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import unittest
+import sys
+
+from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
+from .configuration_common_test import ConfigTester
+from .utils import CACHE_DIR, require_tf, slow
+
+from transformers import T5Config, is_tf_available
+
+if is_tf_available():
+    import tensorflow as tf
+    from transformers.modeling_tf_t5 import (TFT5Model, TFT5WithLMHeadModel,
+                                             TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP)
+
+
+@require_tf
+class TFT5ModelTest(TFCommonTestCases.TFCommonModelTester):
+
+    is_encoder_decoder = True
+    all_model_classes = (TFT5Model, TFT5WithLMHeadModel) if is_tf_available() else ()
+
+    class TFT5ModelTester(object):
+
+        def __init__(self,
+                     parent,
+                     batch_size=13,
+                     seq_length=7,
+                     is_training=True,
+                     use_input_mask=True,
+                     use_labels=True,
+                     vocab_size=99,
+                     n_positions=14,
+                     hidden_size=32,
+                     num_hidden_layers=5,
+                     num_attention_heads=4,
+                     d_ff=37,
+                     relative_attention_num_buckets=8,
+                     dropout_rate=0.1,
+                     initializer_factor=0.002,
+                     scope=None,
+                    ):
+            self.parent = parent
+            self.batch_size = batch_size
+            self.seq_length = seq_length
+            self.is_training = is_training
+            self.use_input_mask = use_input_mask
+            self.use_labels = use_labels
+            self.vocab_size = vocab_size
+            self.n_positions = n_positions
+            self.hidden_size = hidden_size
+            self.num_hidden_layers = num_hidden_layers
+            self.num_attention_heads = num_attention_heads
+            self.d_ff = d_ff
+            self.relative_attention_num_buckets = relative_attention_num_buckets
+            self.dropout_rate = dropout_rate
+            self.initializer_factor = initializer_factor
+            self.scope = scope
+
+        def prepare_config_and_inputs(self):
+            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+
+            input_mask = None
+            if self.use_input_mask:
+                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
+
+            token_labels = None
+            if self.use_labels:
+                token_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+
+            config = T5Config(
+                vocab_size=self.vocab_size,
+                n_positions=self.n_positions,
+                d_model=self.hidden_size,
+                d_ff=self.d_ff,
+                d_kv=self.hidden_size // self.num_attention_heads,
+                num_layers=self.num_hidden_layers,
+                num_heads=self.num_attention_heads,
+                relative_attention_num_buckets=self.relative_attention_num_buckets,
+                dropout_rate=self.dropout_rate,
+                initializer_factor=self.initializer_factor)
+
+            return (config, input_ids, input_mask, token_labels)
+
+        def create_and_check_t5_model(self, config, input_ids, input_mask, token_labels):
+            model = TFT5Model(config=config)
+            inputs = {'encoder_input_ids': input_ids,
+                      'decoder_input_ids': input_ids,
+                      'decoder_attention_mask': input_mask}
+            encoder_output, decoder_output = model(inputs)
+
+            encoder_output, decoder_output = model(input_ids,
+                                                   decoder_attention_mask=input_mask,
+                                                   encoder_input_ids=input_ids)
+
+            result = {
+                "encoder_output": encoder_output.numpy(),
+                "decoder_output": decoder_output.numpy(),
+            }
+            self.parent.assertListEqual(
+                list(result["encoder_output"].shape),
+                [self.batch_size, self.seq_length, self.hidden_size])
+            self.parent.assertListEqual(
+                list(result["decoder_output"].shape),
+                [self.batch_size, self.seq_length, self.hidden_size])
+
+
+        def create_and_check_t5_with_lm_head(self, config, input_ids, input_mask, token_labels):
+            model = TFT5WithLMHeadModel(config=config)
+            inputs = {'encoder_input_ids': input_ids,
+                      'decoder_input_ids': input_ids,
+                      'decoder_attention_mask': input_mask}
+            prediction_scores, decoder_output = model(inputs)
+            result = {
+                "prediction_scores": prediction_scores.numpy(),
+            }
+            self.parent.assertListEqual(
+                list(result["prediction_scores"].shape),
+                [self.batch_size, self.seq_length, self.vocab_size])
+
+
+        def prepare_config_and_inputs_for_common(self):
+            config_and_inputs = self.prepare_config_and_inputs()
+            (config, input_ids, input_mask, token_labels) = config_and_inputs
+            inputs_dict = {'encoder_input_ids': input_ids,
+                           'decoder_input_ids': input_ids,
+                           'decoder_attention_mask': input_mask}
+            return config, inputs_dict
+
+    def setUp(self):
+        self.model_tester = TFT5ModelTest.TFT5ModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=T5Config, d_model=37)
+
+    def test_config(self):
+        self.config_tester.run_common_tests()
+
+    def test_t5_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_t5_model(*config_and_inputs)
+
+    def test_with_lm_head(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_t5_with_lm_head(*config_and_inputs)
+
+    @slow
+    def test_model_from_pretrained(self):
+        for model_name in ['t5-small']:
+            model = TFT5Model.from_pretrained(model_name, cache_dir=CACHE_DIR)
+            self.assertIsNotNone(model)
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/transformers/tests/modeling_tf_transfo_xl_test.py b/transformers/tests/modeling_tf_transfo_xl_test.py
index 03e332bdc1..8225c09275 100644
--- a/transformers/tests/modeling_tf_transfo_xl_test.py
+++ b/transformers/tests/modeling_tf_transfo_xl_test.py
@@ -18,11 +18,10 @@ from __future__ import print_function
 
 import unittest
 import random
-import shutil
 
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
-from .utils import require_tf, slow
+from .utils import CACHE_DIR, require_tf, slow
 
 from transformers import TransfoXLConfig, is_tf_available
 
@@ -67,7 +66,7 @@ class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester):
             self.batch_size = batch_size
             self.seq_length = seq_length
             self.mem_len = mem_len
-            self.key_len = seq_length + mem_len
+            self.key_length = seq_length + mem_len
             self.clamp_len = clamp_len
             self.is_training = is_training
             self.use_labels = use_labels
@@ -92,7 +91,7 @@ class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester):
                 lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
 
             config = TransfoXLConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 mem_len=self.mem_len,
                 clamp_len=self.clamp_len,
                 cutoffs=self.cutoffs,
@@ -205,10 +204,8 @@ class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester):
 
     @slow
     def test_model_from_pretrained(self):
-        cache_dir = "/tmp/transformers_test/"
         for model_name in list(TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = TFTransfoXLModel.from_pretrained(model_name, cache_dir=cache_dir)
-            shutil.rmtree(cache_dir)
+            model = TFTransfoXLModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
             self.assertIsNotNone(model)
 
 
diff --git a/transformers/tests/modeling_tf_xlm_test.py b/transformers/tests/modeling_tf_xlm_test.py
index a680b70367..065d355b45 100644
--- a/transformers/tests/modeling_tf_xlm_test.py
+++ b/transformers/tests/modeling_tf_xlm_test.py
@@ -17,7 +17,6 @@ from __future__ import division
 from __future__ import print_function
 
 import unittest
-import shutil
 
 from transformers import is_tf_available
 
@@ -31,7 +30,7 @@ if is_tf_available():
 
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
-from .utils import require_tf, slow
+from .utils import CACHE_DIR, require_tf, slow
 
 
 @require_tf
@@ -125,7 +124,7 @@ class TFXLMModelTest(TFCommonTestCases.TFCommonModelTester):
                 is_impossible_labels = ids_tensor([self.batch_size], 2, dtype=tf.float32)
 
             config = XLMConfig(
-                 vocab_size_or_config_json_file=self.vocab_size,
+                 vocab_size=self.vocab_size,
                  n_special=self.n_special,
                  emb_dim=self.hidden_size,
                  n_layers=self.num_hidden_layers,
@@ -252,10 +251,8 @@ class TFXLMModelTest(TFCommonTestCases.TFCommonModelTester):
 
     @slow
     def test_model_from_pretrained(self):
-        cache_dir = "/tmp/transformers_test/"
         for model_name in list(TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = XLMModel.from_pretrained(model_name, cache_dir=cache_dir)
-            shutil.rmtree(cache_dir)
+            model = TFXLMModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
             self.assertIsNotNone(model)
 
 
diff --git a/transformers/tests/modeling_tf_xlnet_test.py b/transformers/tests/modeling_tf_xlnet_test.py
index 94864b86f2..15fd917481 100644
--- a/transformers/tests/modeling_tf_xlnet_test.py
+++ b/transformers/tests/modeling_tf_xlnet_test.py
@@ -20,7 +20,6 @@ import os
 import unittest
 import json
 import random
-import shutil
 
 from transformers import XLNetConfig, is_tf_available
 
@@ -35,7 +34,7 @@ if is_tf_available():
 
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
-from .utils import require_tf, slow
+from .utils import CACHE_DIR, require_tf, slow
 
 
 @require_tf
@@ -64,7 +63,6 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
                      num_attention_heads=4,
                      d_inner=128,
                      num_hidden_layers=5,
-                     max_position_embeddings=10,
                      type_sequence_label_size=2,
                      untie_r=True,
                      bi_data=False,
@@ -88,7 +86,6 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
             self.num_attention_heads = num_attention_heads
             self.d_inner = d_inner
             self.num_hidden_layers = num_hidden_layers
-            self.max_position_embeddings = max_position_embeddings
             self.bi_data = bi_data
             self.untie_r = untie_r
             self.same_length = same_length
@@ -122,13 +119,12 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
                 is_impossible_labels = ids_tensor([self.batch_size], 2, dtype=tf.float32)
 
             config = XLNetConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 d_model=self.hidden_size,
                 n_head=self.num_attention_heads,
                 d_inner=self.d_inner,
                 n_layer=self.num_hidden_layers,
                 untie_r=self.untie_r,
-                max_position_embeddings=self.max_position_embeddings,
                 mem_len=self.mem_len,
                 clamp_len=self.clamp_len,
                 same_length=self.same_length,
@@ -322,10 +318,8 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
 
     @slow
     def test_model_from_pretrained(self):
-        cache_dir = "/tmp/transformers_test/"
         for model_name in list(TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = TFXLNetModel.from_pretrained(model_name, cache_dir=cache_dir)
-            shutil.rmtree(cache_dir)
+            model = TFXLNetModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
             self.assertIsNotNone(model)
 
 
diff --git a/transformers/tests/modeling_transfo_xl_test.py b/transformers/tests/modeling_transfo_xl_test.py
index 647dd3724d..acbe95fe4a 100644
--- a/transformers/tests/modeling_transfo_xl_test.py
+++ b/transformers/tests/modeling_transfo_xl_test.py
@@ -18,7 +18,6 @@ from __future__ import print_function
 
 import unittest
 import random
-import shutil
 
 from transformers import is_torch_available
 
@@ -29,7 +28,7 @@ if is_torch_available():
 
 from .modeling_common_test import (CommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
-from .utils import require_torch, slow, torch_device
+from .utils import CACHE_DIR, require_torch, slow, torch_device
 
 
 @require_torch
@@ -66,7 +65,7 @@ class TransfoXLModelTest(CommonTestCases.CommonModelTester):
             self.batch_size = batch_size
             self.seq_length = seq_length
             self.mem_len = mem_len
-            self.key_len = seq_length + mem_len
+            self.key_length = seq_length + mem_len
             self.clamp_len = clamp_len
             self.is_training = is_training
             self.use_labels = use_labels
@@ -91,7 +90,7 @@ class TransfoXLModelTest(CommonTestCases.CommonModelTester):
                 lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
 
             config = TransfoXLConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 mem_len=self.mem_len,
                 clamp_len=self.clamp_len,
                 cutoffs=self.cutoffs,
@@ -208,10 +207,8 @@ class TransfoXLModelTest(CommonTestCases.CommonModelTester):
 
     @slow
     def test_model_from_pretrained(self):
-        cache_dir = "/tmp/transformers_test/"
         for model_name in list(TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = TransfoXLModel.from_pretrained(model_name, cache_dir=cache_dir)
-            shutil.rmtree(cache_dir)
+            model = TransfoXLModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
             self.assertIsNotNone(model)
 
 
diff --git a/transformers/tests/modeling_xlm_test.py b/transformers/tests/modeling_xlm_test.py
index f6b980767c..fcc2f4699b 100644
--- a/transformers/tests/modeling_xlm_test.py
+++ b/transformers/tests/modeling_xlm_test.py
@@ -17,7 +17,6 @@ from __future__ import division
 from __future__ import print_function
 
 import unittest
-import shutil
 
 from transformers import is_torch_available
 
@@ -28,7 +27,7 @@ if is_torch_available():
 
 from .modeling_common_test import (CommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
-from .utils import require_torch, slow, torch_device
+from .utils import CACHE_DIR, require_torch, slow, torch_device
 
 
 @require_torch
@@ -121,7 +120,7 @@ class XLMModelTest(CommonTestCases.CommonModelTester):
                 is_impossible_labels = ids_tensor([self.batch_size], 2).float()
 
             config = XLMConfig(
-                 vocab_size_or_config_json_file=self.vocab_size,
+                 vocab_size=self.vocab_size,
                  n_special=self.n_special,
                  emb_dim=self.hidden_size,
                  n_layers=self.num_hidden_layers,
@@ -318,10 +317,8 @@ class XLMModelTest(CommonTestCases.CommonModelTester):
 
     @slow
     def test_model_from_pretrained(self):
-        cache_dir = "/tmp/transformers_test/"
         for model_name in list(XLM_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = XLMModel.from_pretrained(model_name, cache_dir=cache_dir)
-            shutil.rmtree(cache_dir)
+            model = XLMModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
             self.assertIsNotNone(model)
 
 
diff --git a/transformers/tests/modeling_xlnet_test.py b/transformers/tests/modeling_xlnet_test.py
index 56b6bb3f4d..6d218d6ef4 100644
--- a/transformers/tests/modeling_xlnet_test.py
+++ b/transformers/tests/modeling_xlnet_test.py
@@ -20,7 +20,6 @@ import os
 import unittest
 import json
 import random
-import shutil
 
 from transformers import is_torch_available
 
@@ -33,7 +32,7 @@ if is_torch_available():
 
 from .modeling_common_test import (CommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
-from .utils import require_torch, slow, torch_device
+from .utils import CACHE_DIR, require_torch, slow, torch_device
 
 
 @require_torch
@@ -60,7 +59,6 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
                      num_attention_heads=4,
                      d_inner=128,
                      num_hidden_layers=5,
-                     max_position_embeddings=10,
                      type_sequence_label_size=2,
                      untie_r=True,
                      bi_data=False,
@@ -84,7 +82,6 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
             self.num_attention_heads = num_attention_heads
             self.d_inner = d_inner
             self.num_hidden_layers = num_hidden_layers
-            self.max_position_embeddings = max_position_embeddings
             self.bi_data = bi_data
             self.untie_r = untie_r
             self.same_length = same_length
@@ -116,13 +113,12 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
                 token_labels = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
 
             config = XLNetConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 d_model=self.hidden_size,
                 n_head=self.num_attention_heads,
                 d_inner=self.d_inner,
                 n_layer=self.num_hidden_layers,
                 untie_r=self.untie_r,
-                max_position_embeddings=self.max_position_embeddings,
                 mem_len=self.mem_len,
                 clamp_len=self.clamp_len,
                 same_length=self.same_length,
@@ -388,10 +384,8 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
 
     @slow
     def test_model_from_pretrained(self):
-        cache_dir = "/tmp/transformers_test/"
         for model_name in list(XLNET_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = XLNetModel.from_pretrained(model_name, cache_dir=cache_dir)
-            shutil.rmtree(cache_dir)
+            model = XLNetModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
             self.assertIsNotNone(model)
 
 
diff --git a/transformers/tests/pipelines_test.py b/transformers/tests/pipelines_test.py
new file mode 100644
index 0000000000..08a1507770
--- /dev/null
+++ b/transformers/tests/pipelines_test.py
@@ -0,0 +1,210 @@
+import unittest
+
+from typing import Iterable
+
+from transformers import pipeline
+from transformers.tests.utils import require_tf, require_torch
+
+QA_FINETUNED_MODELS = {
+    ('bert-base-uncased', 'bert-large-uncased-whole-word-masking-finetuned-squad', None),
+    ('bert-base-cased', 'bert-large-cased-whole-word-masking-finetuned-squad', None),
+    ('bert-base-uncased', 'distilbert-base-uncased-distilled-squad', None)
+}
+
+TF_QA_FINETUNED_MODELS = {
+    ('bert-base-uncased', 'bert-large-uncased-whole-word-masking-finetuned-squad', None),
+    ('bert-base-cased', 'bert-large-cased-whole-word-masking-finetuned-squad', None),
+    ('bert-base-uncased', 'distilbert-base-uncased-distilled-squad', None)
+}
+
+TF_NER_FINETUNED_MODELS = {
+    (
+        'bert-base-cased',
+        'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-tf_model.h5',
+        'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-config.json'
+    )
+}
+
+NER_FINETUNED_MODELS = {
+    (
+        'bert-base-cased',
+        'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-pytorch_model.bin',
+        'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-config.json'
+    )
+}
+
+FEATURE_EXTRACT_FINETUNED_MODELS = {
+   ('bert-base-cased', 'bert-base-cased', None),
+   # ('xlnet-base-cased', 'xlnet-base-cased', None), # Disabled for now as it crash for TF2
+   ('distilbert-base-uncased', 'distilbert-base-uncased', None)
+}
+
+TF_FEATURE_EXTRACT_FINETUNED_MODELS = {
+   ('bert-base-cased', 'bert-base-cased', None),
+   # ('xlnet-base-cased', 'xlnet-base-cased', None), # Disabled for now as it crash for TF2
+   ('distilbert-base-uncased', 'distilbert-base-uncased', None)
+}
+
+TF_TEXT_CLASSIF_FINETUNED_MODELS = {
+    (
+        'bert-base-uncased',
+        'https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-tf_model.h5',
+        'https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-config.json'
+    )
+}
+
+TEXT_CLASSIF_FINETUNED_MODELS = {
+    (
+        'bert-base-uncased',
+        'https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-pytorch_model.bin',
+        'https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-config.json'
+    )
+}
+
+
+class MonoColumnInputTestCase(unittest.TestCase):
+    def _test_mono_column_pipeline(self, nlp, valid_inputs: list, invalid_inputs: list, output_keys: Iterable[str]):
+        self.assertIsNotNone(nlp)
+
+        mono_result = nlp(valid_inputs[0])
+        self.assertIsInstance(mono_result, list)
+        self.assertIsInstance(mono_result[0], (dict, list))
+
+        if isinstance(mono_result[0], list):
+            mono_result = mono_result[0]
+
+        for key in output_keys:
+            self.assertIn(key, mono_result[0])
+
+        multi_result = nlp(valid_inputs)
+        self.assertIsInstance(multi_result, list)
+        self.assertIsInstance(multi_result[0], (dict, list))
+
+        if isinstance(multi_result[0], list):
+            multi_result = multi_result[0]
+
+        for result in multi_result:
+            for key in output_keys:
+                self.assertIn(key, result)
+
+        self.assertRaises(Exception, nlp, invalid_inputs)
+
+    @require_torch
+    def test_ner(self):
+        mandatory_keys = {'entity', 'word', 'score'}
+        valid_inputs = ['HuggingFace is solving NLP one commit at a time.', 'HuggingFace is based in New-York & Paris']
+        invalid_inputs = [None]
+        for tokenizer, model, config in NER_FINETUNED_MODELS:
+            nlp = pipeline(task='ner', model=model, config=config, tokenizer=tokenizer)
+            self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, mandatory_keys)
+
+    @require_tf
+    def test_tf_ner(self):
+        mandatory_keys = {'entity', 'word', 'score'}
+        valid_inputs = ['HuggingFace is solving NLP one commit at a time.', 'HuggingFace is based in New-York & Paris']
+        invalid_inputs = [None]
+        for tokenizer, model, config in TF_NER_FINETUNED_MODELS:
+            nlp = pipeline(task='ner', model=model, config=config, tokenizer=tokenizer)
+            self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, mandatory_keys)
+
+    @require_torch
+    def test_sentiment_analysis(self):
+        mandatory_keys = {'label'}
+        valid_inputs = ['HuggingFace is solving NLP one commit at a time.', 'HuggingFace is based in New-York & Paris']
+        invalid_inputs = [None]
+        for tokenizer, model, config in TEXT_CLASSIF_FINETUNED_MODELS:
+            nlp = pipeline(task='sentiment-analysis', model=model, config=config, tokenizer=tokenizer)
+            self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, mandatory_keys)
+
+    @require_tf
+    def test_tf_sentiment_analysis(self):
+        mandatory_keys = {'label'}
+        valid_inputs = ['HuggingFace is solving NLP one commit at a time.', 'HuggingFace is based in New-York & Paris']
+        invalid_inputs = [None]
+        for tokenizer, model, config in TF_TEXT_CLASSIF_FINETUNED_MODELS:
+            nlp = pipeline(task='sentiment-analysis', model=model, config=config, tokenizer=tokenizer)
+            self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, mandatory_keys)
+
+    @require_torch
+    def test_features_extraction(self):
+        valid_inputs = ['HuggingFace is solving NLP one commit at a time.', 'HuggingFace is based in New-York & Paris']
+        invalid_inputs = [None]
+        for tokenizer, model, config in FEATURE_EXTRACT_FINETUNED_MODELS:
+            nlp = pipeline(task='sentiment-analysis', model=model, config=config, tokenizer=tokenizer)
+            self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, {})
+
+    @require_tf
+    def test_tf_features_extraction(self):
+        valid_inputs = ['HuggingFace is solving NLP one commit at a time.', 'HuggingFace is based in New-York & Paris']
+        invalid_inputs = [None]
+        for tokenizer, model, config in TF_FEATURE_EXTRACT_FINETUNED_MODELS:
+            nlp = pipeline(task='sentiment-analysis', model=model, config=config, tokenizer=tokenizer)
+            self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, {})
+
+
+class MultiColumnInputTestCase(unittest.TestCase):
+    def _test_multicolumn_pipeline(self, nlp, valid_inputs: list, invalid_inputs: list, output_keys: Iterable[str]):
+        self.assertIsNotNone(nlp)
+
+        mono_result = nlp(valid_inputs[0])
+        self.assertIsInstance(mono_result, dict)
+
+        for key in output_keys:
+            self.assertIn(key, mono_result)
+
+        multi_result = nlp(valid_inputs)
+        self.assertIsInstance(multi_result, list)
+        self.assertIsInstance(multi_result[0], dict)
+
+        for result in multi_result:
+            for key in output_keys:
+                self.assertIn(key, result)
+
+        self.assertRaises(Exception, nlp, invalid_inputs[0])
+        self.assertRaises(Exception, nlp, invalid_inputs)
+
+    @require_torch
+    def test_question_answering(self):
+        mandatory_output_keys = {'score', 'answer', 'start', 'end'}
+        valid_samples = [
+            {'question': 'Where was HuggingFace founded ?', 'context': 'HuggingFace was founded in Paris.'},
+            {
+                'question': 'In what field is HuggingFace working ?',
+                'context': 'HuggingFace is a startup based in New-York founded in Paris which is trying to solve NLP.'
+            }
+        ]
+        invalid_samples = [
+            {'question': '', 'context': 'This is a test to try empty question edge case'},
+            {'question': None, 'context': 'This is a test to try empty question edge case'},
+            {'question': 'What is does with empty context ?', 'context': ''},
+            {'question': 'What is does with empty context ?', 'context': None},
+        ]
+
+        for tokenizer, model, config in QA_FINETUNED_MODELS:
+            nlp = pipeline(task='question-answering', model=model, config=config, tokenizer=tokenizer)
+            self._test_multicolumn_pipeline(nlp, valid_samples, invalid_samples, mandatory_output_keys)
+
+    @require_tf
+    def test_tf_question_answering(self):
+        mandatory_output_keys = {'score', 'answer', 'start', 'end'}
+        valid_samples = [
+            {'question': 'Where was HuggingFace founded ?', 'context': 'HuggingFace was founded in Paris.'},
+            {
+                'question': 'In what field is HuggingFace working ?',
+                'context': 'HuggingFace is a startup based in New-York founded in Paris which is trying to solve NLP.'
+            }
+        ]
+        invalid_samples = [
+            {'question': '', 'context': 'This is a test to try empty question edge case'},
+            {'question': None, 'context': 'This is a test to try empty question edge case'},
+            {'question': 'What is does with empty context ?', 'context': ''},
+            {'question': 'What is does with empty context ?', 'context': None},
+        ]
+
+        for tokenizer, model, config in TF_QA_FINETUNED_MODELS:
+            nlp = pipeline(task='question-answering', model=model, config=config, tokenizer=tokenizer)
+            self._test_multicolumn_pipeline(nlp, valid_samples, invalid_samples, mandatory_output_keys)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/transformers/tests/tokenization_auto_test.py b/transformers/tests/tokenization_auto_test.py
index 18346d2768..0a894cac04 100644
--- a/transformers/tests/tokenization_auto_test.py
+++ b/transformers/tests/tokenization_auto_test.py
@@ -23,7 +23,7 @@ import logging
 from transformers import AutoTokenizer, BertTokenizer, AutoTokenizer, GPT2Tokenizer
 from transformers import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
 
-from .utils import slow
+from .utils import slow, SMALL_MODEL_IDENTIFIER
 
 
 class AutoTokenizerTest(unittest.TestCase):
@@ -42,6 +42,11 @@ class AutoTokenizerTest(unittest.TestCase):
             self.assertIsInstance(tokenizer, GPT2Tokenizer)
             self.assertGreater(len(tokenizer), 0)
 
+    def test_tokenizer_from_pretrained_identifier(self):
+        logging.basicConfig(level=logging.INFO)
+        tokenizer = AutoTokenizer.from_pretrained(SMALL_MODEL_IDENTIFIER)
+        self.assertIsInstance(tokenizer, BertTokenizer)
+        self.assertEqual(len(tokenizer), 12)
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/transformers/tests/tokenization_bert_japanese_test.py b/transformers/tests/tokenization_bert_japanese_test.py
new file mode 100644
index 0000000000..545193c7cc
--- /dev/null
+++ b/transformers/tests/tokenization_bert_japanese_test.py
@@ -0,0 +1,191 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import os
+import unittest
+from io import open
+
+from transformers.tokenization_bert import WordpieceTokenizer
+from transformers.tokenization_bert_japanese import (BertJapaneseTokenizer,
+                                                     MecabTokenizer, CharacterTokenizer,
+                                                     VOCAB_FILES_NAMES)
+
+from .tokenization_tests_commons import CommonTestCases
+from .utils import slow, custom_tokenizers
+
+
+@custom_tokenizers
+class BertJapaneseTokenizationTest(CommonTestCases.CommonTokenizerTester):
+
+    tokenizer_class = BertJapaneseTokenizer
+
+    def setUp(self):
+        super(BertJapaneseTokenizationTest, self).setUp()
+
+        vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]",
+            u"こんにちは", u"こん", u"にちは", u"ばんは", u"##こん", u"##にちは", u"##ばんは",
+            u"世界", u"##世界", u"、", u"##、", u"。", u"##。"]
+
+        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
+            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
+
+    def get_tokenizer(self, **kwargs):
+        return BertJapaneseTokenizer.from_pretrained(self.tmpdirname, **kwargs)
+
+    def get_input_output_texts(self):
+        input_text = u"こんにちは、世界。 \nこんばんは、世界。"
+        output_text = u"こんにちは 、 世界 。 こんばんは 、 世界 。"
+        return input_text, output_text
+
+    def test_full_tokenizer(self):
+        tokenizer = self.tokenizer_class(self.vocab_file)
+
+        tokens = tokenizer.tokenize(u"こんにちは、世界。\nこんばんは、世界。")
+        self.assertListEqual(tokens,
+                             [u"こんにちは", u"、", u"世界", u"。",
+                              u"こん", u"##ばんは", u"、", u"世界", "。"])
+        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens),
+                             [3, 12, 10, 14, 4, 9, 12, 10, 14])
+
+    def test_mecab_tokenizer(self):
+        tokenizer = MecabTokenizer()
+
+        self.assertListEqual(
+            tokenizer.tokenize(u" \tｱｯﾌﾟﾙストアでiPhone８ が  \n 発売された　。  "),
+                               [u"アップルストア", u"で", u"iPhone", u"8", u"が",
+                                u"発売", u"さ", u"れ", u"た", u"。"])
+
+    def test_mecab_tokenizer_lower(self):
+        tokenizer = MecabTokenizer(do_lower_case=True)
+
+        self.assertListEqual(
+            tokenizer.tokenize(u" \tｱｯﾌﾟﾙストアでiPhone８ が  \n 発売された　。  "),
+                               [u"アップルストア", u"で", u"iphone", u"8", u"が",
+                                u"発売", u"さ", u"れ", u"た", u"。"])
+
+    def test_mecab_tokenizer_no_normalize(self):
+        tokenizer = MecabTokenizer(normalize_text=False)
+
+        self.assertListEqual(
+            tokenizer.tokenize(u" \tｱｯﾌﾟﾙストアでiPhone８ が  \n 発売された　。  "),
+                               [u"ｱｯﾌﾟﾙストア", u"で", u"iPhone", u"８", u"が",
+                                u"発売", u"さ", u"れ", u"た", u"　", u"。"])
+
+    def test_wordpiece_tokenizer(self):
+        vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]",
+            u"こんにちは", u"こん", u"にちは" u"ばんは", u"##こん", u"##にちは", u"##ばんは"]
+
+        vocab = {}
+        for (i, token) in enumerate(vocab_tokens):
+            vocab[token] = i
+        tokenizer = WordpieceTokenizer(vocab=vocab, unk_token=u"[UNK]")
+
+        self.assertListEqual(tokenizer.tokenize(u""), [])
+
+        self.assertListEqual(tokenizer.tokenize(u"こんにちは"),
+                             [u"こんにちは"])
+
+        self.assertListEqual(tokenizer.tokenize(u"こんばんは"),
+                             [u"こん", u"##ばんは"])
+
+        self.assertListEqual(tokenizer.tokenize(u"こんばんは こんばんにちは こんにちは"),
+                             [u"こん", u"##ばんは", u"[UNK]", u"こんにちは"])
+
+    @slow
+    def test_sequence_builders(self):
+        tokenizer = self.tokenizer_class.from_pretrained("bert-base-japanese")
+
+        text = tokenizer.encode(u"ありがとう。", add_special_tokens=False)
+        text_2 = tokenizer.encode(u"どういたしまして。", add_special_tokens=False)
+
+        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
+        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
+
+        # 2 is for "[CLS]", 3 is for "[SEP]"
+        assert encoded_sentence == [2] + text + [3]
+        assert encoded_pair == [2] + text + [3] + text_2 + [3]
+
+
+class BertJapaneseCharacterTokenizationTest(CommonTestCases.CommonTokenizerTester):
+
+    tokenizer_class = BertJapaneseTokenizer
+
+    def setUp(self):
+        super(BertJapaneseCharacterTokenizationTest, self).setUp()
+
+        vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]",
+            u"こ", u"ん", u"に", u"ち", u"は", u"ば", u"世", u"界", u"、", u"。"]
+
+        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
+            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
+
+    def get_tokenizer(self, **kwargs):
+        return BertJapaneseTokenizer.from_pretrained(self.tmpdirname,
+                                                     subword_tokenizer_type="character",
+                                                     **kwargs)
+
+    def get_input_output_texts(self):
+        input_text = u"こんにちは、世界。 \nこんばんは、世界。"
+        output_text = u"こ ん に ち は 、 世 界 。 こ ん ば ん は 、 世 界 。"
+        return input_text, output_text
+
+    def test_full_tokenizer(self):
+        tokenizer = self.tokenizer_class(self.vocab_file,
+                                         subword_tokenizer_type="character")
+
+        tokens = tokenizer.tokenize(u"こんにちは、世界。 \nこんばんは、世界。")
+        self.assertListEqual(tokens,
+            [u"こ", u"ん", u"に", u"ち", u"は", u"、", u"世", u"界", u"。",
+             u"こ", u"ん", u"ば", u"ん", u"は", u"、", u"世", u"界", u"。"])
+        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens),
+                             [3, 4, 5, 6, 7, 11, 9, 10, 12,
+                              3, 4, 8, 4, 7, 11, 9, 10, 12])
+
+    def test_character_tokenizer(self):
+        vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]",
+            u"こ", u"ん", u"に", u"ち", u"は", u"ば", u"世", u"界"u"、", u"。"]
+
+        vocab = {}
+        for (i, token) in enumerate(vocab_tokens):
+            vocab[token] = i
+        tokenizer = CharacterTokenizer(vocab=vocab, unk_token=u"[UNK]")
+
+        self.assertListEqual(tokenizer.tokenize(u""), [])
+
+        self.assertListEqual(tokenizer.tokenize(u"こんにちは"),
+                             [u"こ", u"ん", u"に", u"ち", u"は"])
+
+        self.assertListEqual(tokenizer.tokenize(u"こんにちほ"),
+                             [u"こ", u"ん", u"に", u"ち", u"[UNK]"])
+
+    @slow
+    def test_sequence_builders(self):
+        tokenizer = self.tokenizer_class.from_pretrained("bert-base-japanese-char")
+
+        text = tokenizer.encode(u"ありがとう。", add_special_tokens=False)
+        text_2 = tokenizer.encode(u"どういたしまして。", add_special_tokens=False)
+
+        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
+        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
+
+        # 2 is for "[CLS]", 3 is for "[SEP]"
+        assert encoded_sentence == [2] + text + [3]
+        assert encoded_pair == [2] + text + [3] + text_2 + [3]
+
+
+
diff --git a/transformers/tests/tokenization_bert_test.py b/transformers/tests/tokenization_bert_test.py
index f390248956..c503ea5e1e 100644
--- a/transformers/tests/tokenization_bert_test.py
+++ b/transformers/tests/tokenization_bert_test.py
@@ -139,5 +139,6 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
         assert encoded_sentence == [101] + text + [102]
         assert encoded_pair == [101] + text + [102] + text_2 + [102]
 
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/transformers/tests/tokenization_gpt2_test.py b/transformers/tests/tokenization_gpt2_test.py
index a77cc75ec2..5eae767bdf 100644
--- a/transformers/tests/tokenization_gpt2_test.py
+++ b/transformers/tests/tokenization_gpt2_test.py
@@ -67,6 +67,5 @@ class GPT2TokenizationTest(CommonTestCases.CommonTokenizerTester):
         self.assertListEqual(
             tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
 
-
 if __name__ == '__main__':
     unittest.main()
diff --git a/transformers/tests/tokenization_t5_test.py b/transformers/tests/tokenization_t5_test.py
new file mode 100644
index 0000000000..0b4f960e32
--- /dev/null
+++ b/transformers/tests/tokenization_t5_test.py
@@ -0,0 +1,77 @@
+# coding=utf-8
+# Copyright 2018 Google T5 Authors and HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import os
+import unittest
+
+from transformers.tokenization_t5 import (T5Tokenizer)
+from transformers.tokenization_xlnet import SPIECE_UNDERLINE
+
+from .tokenization_tests_commons import CommonTestCases
+
+SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)),
+                    'fixtures/test_sentencepiece.model')
+
+class T5TokenizationTest(CommonTestCases.CommonTokenizerTester):
+
+    tokenizer_class = T5Tokenizer
+
+    def setUp(self):
+        super(T5TokenizationTest, self).setUp()
+
+        # We have a SentencePiece fixture for testing
+        tokenizer = T5Tokenizer(SAMPLE_VOCAB)
+        tokenizer.save_pretrained(self.tmpdirname)
+
+    def get_tokenizer(self, **kwargs):
+        return T5Tokenizer.from_pretrained(self.tmpdirname, **kwargs)
+
+    def get_input_output_texts(self):
+        input_text = u"This is a test"
+        output_text = u"This is a test"
+        return input_text, output_text
+
+    def test_full_tokenizer(self):
+        tokenizer = T5Tokenizer(SAMPLE_VOCAB)
+
+        tokens = tokenizer.tokenize(u'This is a test')
+        self.assertListEqual(tokens, [u'▁This', u'▁is', u'▁a', u'▁t', u'est'])
+
+        self.assertListEqual(
+            tokenizer.convert_tokens_to_ids(tokens), [285, 46, 10, 170, 382])
+
+        tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.")
+        self.assertListEqual(tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b',
+                                    u'or', u'n', SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'',
+                                    u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this',
+                                    SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's', u'é', u'.'])
+        ids = tokenizer.convert_tokens_to_ids(tokens)
+        self.assertListEqual(
+            ids, [8, 21, 84, 55, 24, 19, 7, 0,
+                602, 347, 347, 347, 3, 12, 66,
+                46, 72, 80, 6, 0, 4])
+
+        back_tokens = tokenizer.convert_ids_to_tokens(ids)
+        self.assertListEqual(back_tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b',
+                                        u'or', u'n', SPIECE_UNDERLINE + u'in',
+                                        SPIECE_UNDERLINE + u'', u'<unk>', u'2', u'0', u'0', u'0', u',',
+                                        SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this',
+                                        SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's',
+                                        u'<unk>', u'.'])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/transformers/tests/tokenization_tests_commons.py b/transformers/tests/tokenization_tests_commons.py
index c009958135..c417d033dc 100644
--- a/transformers/tests/tokenization_tests_commons.py
+++ b/transformers/tests/tokenization_tests_commons.py
@@ -133,6 +133,13 @@ class CommonTestCases:
             self.assertNotEqual(len(toks), len(toks0))  # toks0 should be longer
             self.assertListEqual(toks, toks2)
 
+            # Check that none of the special tokens are lowercased
+            sequence_with_special_tokens = "A " + " yEs ".join(tokenizer.all_special_tokens) + " B"
+            tokenized_sequence = tokenizer.tokenize(sequence_with_special_tokens)
+
+            for special_token in tokenizer.all_special_tokens:
+                self.assertTrue(special_token in tokenized_sequence)
+
             tokenizer = self.get_tokenizer(do_lower_case=False)
 
             added = tokenizer.add_tokens(new_toks)
@@ -232,6 +239,15 @@ class CommonTestCases:
             self.assertNotEqual(len(tokens_2), 0)
             self.assertIsInstance(text_2, (str, unicode))
 
+        def test_encode_decode_with_spaces(self):
+            tokenizer = self.get_tokenizer()
+
+            new_toks = ['[ABC]', '[DEF]', 'GHI IHG']
+            tokenizer.add_tokens(new_toks)
+            input = "[ABC] [DEF] [ABC] GHI IHG [DEF]"
+            encoded = tokenizer.encode(input, add_special_tokens=False)
+            decoded = tokenizer.decode(encoded)
+            self.assertEqual(decoded, input)
 
         def test_pretrained_model_lists(self):
             weights_list = list(self.tokenizer_class.max_model_input_sizes.keys())
diff --git a/transformers/tests/utils.py b/transformers/tests/utils.py
index 7a51ab612b..ba0e19f420 100644
--- a/transformers/tests/utils.py
+++ b/transformers/tests/utils.py
@@ -1,23 +1,34 @@
 import os
 import unittest
+import tempfile
 
 from distutils.util import strtobool
 
 from transformers.file_utils import _tf_available, _torch_available
 
 
-try:
-    run_slow = os.environ["RUN_SLOW"]
-except KeyError:
-    # RUN_SLOW isn't set, default to skipping slow tests.
-    _run_slow_tests = False
-else:
-    # RUN_SLOW is set, convert it to True or False.
+CACHE_DIR = os.path.join(tempfile.gettempdir(), "transformers_test")
+
+SMALL_MODEL_IDENTIFIER = "julien-c/bert-xsmall-dummy"
+
+
+def parse_flag_from_env(key, default=False):
     try:
-        _run_slow_tests = strtobool(run_slow)
-    except ValueError:
-        # More values are supported, but let's keep the message simple.
-        raise ValueError("If set, RUN_SLOW must be yes or no.")
+        value = os.environ[key]
+    except KeyError:
+        # KEY isn't set, default to `default`.
+        _value = default
+    else:
+        # KEY is set, convert it to True or False.
+        try:
+            _value = strtobool(value)
+        except ValueError:
+            # More values are supported, but let's keep the message simple.
+            raise ValueError("If set, {} must be yes or no.".format(key))
+    return _value
+
+_run_slow_tests = parse_flag_from_env("RUN_SLOW", default=False)
+_run_custom_tokenizers = parse_flag_from_env("RUN_CUSTOM_TOKENIZERS", default=False)
 
 
 def slow(test_case):
@@ -33,6 +44,19 @@ def slow(test_case):
     return test_case
 
 
+def custom_tokenizers(test_case):
+    """
+    Decorator marking a test for a custom tokenizer.
+
+    Custom tokenizers require additional dependencies, and are skipped
+    by default. Set the RUN_CUSTOM_TOKENIZERS environment variable
+    to a truthy value to run them.
+    """
+    if not _run_custom_tokenizers:
+        test_case = unittest.skip("test of custom tokenizers")(test_case)
+    return test_case
+
+
 def require_torch(test_case):
     """
     Decorator marking a test that requires PyTorch.
@@ -59,6 +83,6 @@ def require_tf(test_case):
 
 if _torch_available:
     # Set the USE_CUDA environment variable to select a GPU.
-    torch_device = "cuda" if os.environ.get("USE_CUDA") else "cpu"
+    torch_device = "cuda" if parse_flag_from_env("USE_CUDA") else "cpu"
 else:
     torch_device = None
diff --git a/transformers/tokenization_auto.py b/transformers/tokenization_auto.py
index b7c5046961..5377bd48cb 100644
--- a/transformers/tokenization_auto.py
+++ b/transformers/tokenization_auto.py
@@ -19,6 +19,7 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 import logging
 
 from .tokenization_bert import BertTokenizer
+from .tokenization_bert_japanese import BertJapaneseTokenizer
 from .tokenization_openai import OpenAIGPTTokenizer
 from .tokenization_gpt2 import GPT2Tokenizer
 from .tokenization_ctrl import CTRLTokenizer
@@ -29,6 +30,8 @@ from .tokenization_roberta import RobertaTokenizer
 from .tokenization_distilbert import DistilBertTokenizer
 from .tokenization_camembert import CamembertTokenizer
 from .tokenization_albert import AlbertTokenizer
+from .tokenization_t5 import T5Tokenizer
+from .tokenization_xlm_roberta import XLMRobertaTokenizer
 
 logger = logging.getLogger(__name__)
 
@@ -43,9 +46,11 @@ class AutoTokenizer(object):
 
         The tokenizer class to instantiate is selected as the first pattern matching
         in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `t5`: T5Tokenizer (T5 model)
             - contains `distilbert`: DistilBertTokenizer (DistilBert model)
             - contains `albert`: AlbertTokenizer (ALBERT model)
             - contains `camembert`: CamembertTokenizer (CamemBERT model)
+            - contains `xlm-roberta`: XLMRobertaTokenizer (XLM-RoBERTa model)
             - contains `roberta`: RobertaTokenizer (RoBERTa model)
             - contains `bert`: BertTokenizer (Bert model)
             - contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model)
@@ -68,10 +73,13 @@ class AutoTokenizer(object):
 
         The tokenizer class to instantiate is selected as the first pattern matching
         in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `t5`: T5Tokenizer (T5 model)
             - contains `distilbert`: DistilBertTokenizer (DistilBert model)
             - contains `albert`: AlbertTokenizer (ALBERT model)
             - contains `camembert`: CamembertTokenizer (CamemBERT model)
+            - contains `xlm-roberta`: XLMRobertaTokenizer (XLM-RoBERTa model)
             - contains `roberta`: RobertaTokenizer (RoBERTa model)
+            - contains `bert-base-japanese`: BertJapaneseTokenizer (Bert model)
             - contains `bert`: BertTokenizer (Bert model)
             - contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model)
             - contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model)
@@ -84,6 +92,7 @@ class AutoTokenizer(object):
             pretrained_model_name_or_path: either:
 
                 - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a predefined tokenizer that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                 - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``.
                 - (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``.
 
@@ -106,18 +115,30 @@ class AutoTokenizer(object):
 
         Examples::
 
-            tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')    # Download vocabulary from S3 and cache.
-            tokenizer = AutoTokenizer.from_pretrained('./test/bert_saved_model/')  # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`
+            # Download vocabulary from S3 and cache.
+            tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
+
+            # Download vocabulary from S3 (user-uploaded) and cache.
+            tokenizer = AutoTokenizer.from_pretrained('dbmdz/bert-base-german-cased')
+
+            # If vocabulary files are in a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`)
+            tokenizer = AutoTokenizer.from_pretrained('./test/bert_saved_model/')
 
         """
-        if 'distilbert' in pretrained_model_name_or_path:
+        if 't5' in pretrained_model_name_or_path:
+            return T5Tokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
+        elif 'distilbert' in pretrained_model_name_or_path:
             return DistilBertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
         elif 'albert' in pretrained_model_name_or_path:
             return AlbertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
         elif 'camembert' in pretrained_model_name_or_path:
             return CamembertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
+        elif 'xlm-roberta' in pretrained_model_name_or_path:
+            return XLMRobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
         elif 'roberta' in pretrained_model_name_or_path:
             return RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
+        elif 'bert-base-japanese' in pretrained_model_name_or_path:
+            return BertJapaneseTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
         elif 'bert' in pretrained_model_name_or_path:
             return BertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
         elif 'openai-gpt' in pretrained_model_name_or_path:
@@ -134,4 +155,4 @@ class AutoTokenizer(object):
             return CTRLTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
         raise ValueError("Unrecognized model identifier in {}. Should contains one of "
                          "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
-                         "'xlm', 'roberta', 'distilbert,' 'camembert', 'ctrl', 'albert'".format(pretrained_model_name_or_path))
+                         "'xlm-roberta', 'xlm', 'roberta', 'distilbert,' 'camembert', 'ctrl', 'albert'".format(pretrained_model_name_or_path))
diff --git a/transformers/tokenization_bert.py b/transformers/tokenization_bert.py
index ded5072e58..edc26d88cf 100644
--- a/transformers/tokenization_bert.py
+++ b/transformers/tokenization_bert.py
@@ -46,6 +46,8 @@ PRETRAINED_VOCAB_FILES_MAP = {
         'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-vocab.txt",
         'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-vocab.txt",
         'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-vocab.txt",
+        'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/vocab.txt",
+        'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/vocab.txt",
     }
 }
 
@@ -65,6 +67,8 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
     'bert-base-cased-finetuned-mrpc': 512,
     'bert-base-german-dbmdz-cased': 512,
     'bert-base-german-dbmdz-uncased': 512,
+    'bert-base-finnish-cased-v1': 512,
+    'bert-base-finnish-uncased-v1': 512,
 }
 
 PRETRAINED_INIT_CONFIGURATION = {
@@ -83,6 +87,8 @@ PRETRAINED_INIT_CONFIGURATION = {
     'bert-base-cased-finetuned-mrpc': {'do_lower_case': False},
     'bert-base-german-dbmdz-cased': {'do_lower_case': False},
     'bert-base-german-dbmdz-uncased': {'do_lower_case': True},
+    'bert-base-finnish-cased-v1': {'do_lower_case': False},
+    'bert-base-finnish-uncased-v1': {'do_lower_case': True},
 }
 
 
@@ -113,12 +119,12 @@ class BertTokenizer(PreTrainedTokenizer):
 
     Args:
         vocab_file: Path to a one-wordpiece-per-line vocabulary file
-        do_lower_case: Whether to lower case the input. Only has an effect when do_wordpiece_only=False
+        do_lower_case: Whether to lower case the input. Only has an effect when do_basic_tokenize=True
         do_basic_tokenize: Whether to do basic tokenization before wordpiece.
         max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the
             minimum of this value (if specified) and the underlying BERT model's sequence length.
         never_split: List of tokens which will never be split during tokenization. Only has an effect when
-            do_wordpiece_only=False
+            do_basic_tokenize=True
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
diff --git a/transformers/tokenization_bert_japanese.py b/transformers/tokenization_bert_japanese.py
new file mode 100644
index 0000000000..0ff45cbfe7
--- /dev/null
+++ b/transformers/tokenization_bert_japanese.py
@@ -0,0 +1,253 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes."""
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import collections
+import logging
+import os
+import six
+import unicodedata
+from io import open
+
+from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer, load_vocab
+from .tokenization_utils import PreTrainedTokenizer
+
+logger = logging.getLogger(__name__)
+
+VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    'vocab_file':
+    {
+        'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-vocab.txt",
+        'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-vocab.txt",
+        'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-vocab.txt",
+        'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-vocab.txt"
+    }
+}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    'bert-base-japanese': 512,
+    'bert-base-japanese-whole-word-masking': 512,
+    'bert-base-japanese-char': 512,
+    'bert-base-japanese-char-whole-word-masking': 512
+}
+
+PRETRAINED_INIT_CONFIGURATION = {
+    'bert-base-japanese': {
+        'do_lower_case': False,
+        'word_tokenizer_type': 'mecab',
+        'subword_tokenizer_type': 'wordpiece'
+    },
+    'bert-base-japanese-whole-word-masking':{
+        'do_lower_case': False,
+        'word_tokenizer_type': 'mecab',
+        'subword_tokenizer_type': 'wordpiece'
+    },
+    'bert-base-japanese-char': {
+        'do_lower_case': False,
+        'word_tokenizer_type': 'mecab',
+        'subword_tokenizer_type': 'character'
+    },
+    'bert-base-japanese-char-whole-word-masking': {
+        'do_lower_case': False,
+        'word_tokenizer_type': 'mecab',
+        'subword_tokenizer_type': 'character'
+    }
+}
+
+
+class BertJapaneseTokenizer(BertTokenizer):
+    """BERT tokenizer for Japanese text"""
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+
+    def __init__(self, vocab_file, do_lower_case=False,
+                 do_word_tokenize=True, do_subword_tokenize=True,
+                 word_tokenizer_type='basic', subword_tokenizer_type='wordpiece',
+                 never_split=None, unk_token='[UNK]', sep_token='[SEP]',
+                 pad_token='[PAD]', cls_token='[CLS]', mask_token='[MASK]', **kwargs):
+        """Constructs a MecabBertTokenizer.
+
+        Args:
+            **vocab_file**: Path to a one-wordpiece-per-line vocabulary file.
+            **do_lower_case**: (`optional`) boolean (default True)
+                Whether to lower case the input.
+                Only has an effect when do_basic_tokenize=True.
+            **do_word_tokenize**: (`optional`) boolean (default True)
+                Whether to do word tokenization.
+            **do_subword_tokenize**: (`optional`) boolean (default True)
+                Whether to do subword tokenization.
+            **word_tokenizer_type**: (`optional`) string (default "basic")
+                Type of word tokenizer.
+            **subword_tokenizer_type**: (`optional`) string (default "wordpiece")
+                Type of subword tokenizer.
+        """
+        super(BertTokenizer, self).__init__(unk_token=unk_token, sep_token=sep_token,
+                                            pad_token=pad_token, cls_token=cls_token,
+                                            mask_token=mask_token, **kwargs)
+        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
+        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens
+
+        if not os.path.isfile(vocab_file):
+            raise ValueError(
+                "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
+                "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file))
+        self.vocab = load_vocab(vocab_file)
+        self.ids_to_tokens = collections.OrderedDict(
+            [(ids, tok) for tok, ids in self.vocab.items()])
+
+        self.do_word_tokenize = do_word_tokenize
+        if do_word_tokenize:
+            if word_tokenizer_type == 'basic':
+                self.word_tokenizer = BasicTokenizer(do_lower_case=do_lower_case,
+                                                     never_split=never_split,
+                                                     tokenize_chinese_chars=False)
+            elif word_tokenizer_type == 'mecab':
+                self.word_tokenizer = MecabTokenizer(do_lower_case=do_lower_case,
+                                                     never_split=never_split)
+            else:
+                raise ValueError(
+                    "Invalid word_tokenizer_type '{}' is specified.".format(word_tokenizer_type))
+
+        self.do_subword_tokenize = do_subword_tokenize
+        if do_subword_tokenize:
+            if subword_tokenizer_type == 'wordpiece':
+                self.subword_tokenizer = WordpieceTokenizer(vocab=self.vocab,
+                                                            unk_token=self.unk_token)
+            elif subword_tokenizer_type == 'character':
+                self.subword_tokenizer = CharacterTokenizer(vocab=self.vocab,
+                                                            unk_token=self.unk_token)
+            else:
+                raise ValueError(
+                    "Invalid subword_tokenizer_type '{}' is specified.".format(subword_tokenizer_type))
+
+
+    def _tokenize(self, text):
+        if self.do_word_tokenize:
+            tokens = self.word_tokenizer.tokenize(text,
+                                                  never_split=self.all_special_tokens)
+        else:
+            tokens = [text]
+
+        if self.do_subword_tokenize:
+            split_tokens = [sub_token for token in tokens
+                            for sub_token in self.subword_tokenizer.tokenize(token)]
+        else:
+            split_tokens = tokens
+
+        return split_tokens
+
+
+class MecabTokenizer(object):
+    """Runs basic tokenization with MeCab morphological parser."""
+
+    def __init__(self, do_lower_case=False, never_split=None, normalize_text=True):
+        """Constructs a MecabTokenizer.
+
+        Args:
+            **do_lower_case**: (`optional`) boolean (default True)
+                Whether to lower case the input.
+            **never_split**: (`optional`) list of str
+                Kept for backward compatibility purposes.
+                Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`)
+                List of token not to split.
+            **normalize_text**: (`optional`) boolean (default True)
+                Whether to apply unicode normalization to text before tokenization.
+        """
+        self.do_lower_case = do_lower_case
+        self.never_split = never_split if never_split is not None else []
+        self.normalize_text = normalize_text
+
+        import MeCab
+        self.mecab = MeCab.Tagger()
+
+    def tokenize(self, text, never_split=None, **kwargs):
+        """Tokenizes a piece of text."""
+        if self.normalize_text:
+            text = unicodedata.normalize('NFKC', text)
+
+        never_split = self.never_split + (never_split if never_split is not None else [])
+        tokens = []
+
+        if six.PY2:
+            mecab_output = self.mecab.parse(text.encode('utf-8')).decode('utf-8')
+        else:
+            mecab_output = self.mecab.parse(text)
+
+        cursor = 0
+        for line in mecab_output.split('\n'):
+            if line == 'EOS':
+                break
+
+            token, _ = line.split('\t')
+            token_start = text.index(token, cursor)
+            token_end = token_start + len(token)
+            if self.do_lower_case and token not in never_split:
+                token = token.lower()
+
+            tokens.append(token)
+            cursor = token_end
+
+        return tokens
+
+
+class CharacterTokenizer(object):
+    """Runs Character tokenziation."""
+
+    def __init__(self, vocab, unk_token, normalize_text=True):
+        """Constructs a CharacterTokenizer.
+
+        Args:
+            **vocab**:
+                Vocabulary object.
+            **unk_token**: str
+                A special symbol for out-of-vocabulary token.
+            **normalize_text**: (`optional`) boolean (default True)
+                Whether to apply unicode normalization to text before tokenization.
+        """
+        self.vocab = vocab
+        self.unk_token = unk_token
+        self.normalize_text = normalize_text
+
+    def tokenize(self, text):
+        """Tokenizes a piece of text into characters.
+
+        For example:
+            input = "apple"
+            output = ["a", "p", "p", "l", "e"]
+        Args:
+            text: A single token or whitespace separated tokens.
+                This should have already been passed through `BasicTokenizer`.
+        Returns:
+            A list of characters.
+        """
+        if self.normalize_text:
+            text = unicodedata.normalize('NFKC', text)
+
+        output_tokens = []
+        for i, char in enumerate(text):
+            if char not in self.vocab:
+                output_tokens.append(self.unk_token)
+                continue
+
+            output_tokens.append(char)
+
+        return output_tokens
diff --git a/transformers/tokenization_camembert.py b/transformers/tokenization_camembert.py
index b4091558e1..4c4615eb3d 100644
--- a/transformers/tokenization_camembert.py
+++ b/transformers/tokenization_camembert.py
@@ -22,6 +22,7 @@ from shutil import copyfile
 
 import sentencepiece as spm
 from transformers.tokenization_utils import PreTrainedTokenizer
+from .tokenization_xlnet import SPIECE_UNDERLINE
 
 logger = logging.getLogger(__name__)
 
@@ -145,6 +146,11 @@ class CamembertTokenizer(PreTrainedTokenizer):
             return self.fairseq_ids_to_tokens[index]
         return self.sp_model.IdToPiece(index - self.fairseq_offset)
 
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (strings for sub-words) in a single string."""
+        out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip()
+        return out_string
+
     def save_vocabulary(self, save_directory):
         """ Save the sentencepiece vocabulary (copy original file) and special tokens file
             to a directory.
diff --git a/transformers/tokenization_distilbert.py b/transformers/tokenization_distilbert.py
index f40bf2bd77..2f245d71dc 100644
--- a/transformers/tokenization_distilbert.py
+++ b/transformers/tokenization_distilbert.py
@@ -53,12 +53,12 @@ class DistilBertTokenizer(BertTokenizer):
 
     Args:
         vocab_file: Path to a one-wordpiece-per-line vocabulary file
-        do_lower_case: Whether to lower case the input. Only has an effect when do_wordpiece_only=False
+        do_lower_case: Whether to lower case the input. Only has an effect when do_basic_tokenize=True
         do_basic_tokenize: Whether to do basic tokenization before wordpiece.
         max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the
             minimum of this value (if specified) and the underlying BERT model's sequence length.
         never_split: List of tokens which will never be split during tokenization. Only has an effect when
-            do_wordpiece_only=False
+            do_basic_tokenize=True
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
diff --git a/transformers/tokenization_t5.py b/transformers/tokenization_t5.py
new file mode 100644
index 0000000000..9fd37b67c0
--- /dev/null
+++ b/transformers/tokenization_t5.py
@@ -0,0 +1,176 @@
+# coding=utf-8
+# Copyright 2018 T5 Authors and HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Tokenization class for model T5."""
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import logging
+import os
+import re
+import six
+from shutil import copyfile
+
+from .tokenization_utils import PreTrainedTokenizer
+
+logger = logging.getLogger(__name__)
+
+SPIECE_UNDERLINE = u'▁'
+
+####################################################
+# Mapping from the keyword arguments names of Tokenizer `__init__`
+# to file names for serializing Tokenizer instances
+####################################################
+VOCAB_FILES_NAMES = {'vocab_file': 'spiece.model'}
+
+####################################################
+# Mapping from the keyword arguments names of Tokenizer `__init__`
+# to pretrained vocabulary URL for all the model shortcut names.
+####################################################
+PRETRAINED_VOCAB_FILES_MAP = {
+    'vocab_file':
+    {
+        't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
+        't5-base': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
+        't5-large': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
+        't5-3b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
+        't5-11b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
+    }
+}
+
+####################################################
+# Mapping from model shortcut names to max length of inputs
+####################################################
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    't5-small': 512,
+    't5-base': 512,
+    't5-large': 512,
+    't5-3b': 512,
+    't5-11b': 512,
+}
+
+class T5Tokenizer(PreTrainedTokenizer):
+    """
+        SentencePiece based tokenizer. Peculiarities:
+
+            - requires `SentencePiece <https://github.com/google/sentencepiece>`_
+            - `extra_ids` add a number of extra ids added to the end of the vocabulary for use as sentinels.
+                These tokens are accessible as `<extra_id_{%d}>` where `{%d}` is a number between 0 and extra_ids-1.
+                Extra tokens are indexed from the end of the vocabulary up to beginnning (<extra_id_0> is the last token in the vocabulary)
+                (like in T5 preprocessing
+                see: https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117)
+    """
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+
+    def __init__(self, vocab_file, eos_token="</s>", unk_token="<unk>",
+                 pad_token="<pad>", extra_ids=100, additional_special_tokens=None, **kwargs):
+        # Add extra_ids to the special token list
+        if extra_ids > 0:
+            if additional_special_tokens is None:
+                additional_special_tokens = []
+            additional_special_tokens.extend([u"<extra_id_{}>".format(i) for i in range(extra_ids)])
+
+        super(T5Tokenizer, self).__init__(eos_token=eos_token, unk_token=unk_token,
+                                          pad_token=pad_token, additional_special_tokens=additional_special_tokens,
+                                          **kwargs)
+
+        try:
+            import sentencepiece as spm
+        except ImportError:
+            logger.warning("You need to install SentencePiece to use T5Tokenizer:"
+                           "https://github.com/google/sentencepiece"
+                           "pip install sentencepiece")
+
+        self.vocab_file = vocab_file
+        self._extra_ids = extra_ids
+
+        self.sp_model = spm.SentencePieceProcessor()
+        self.sp_model.Load(vocab_file)
+
+    @property
+    def vocab_size(self):
+        return self.sp_model.get_piece_size() + self._extra_ids
+
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        state["sp_model"] = None
+        return state
+
+    def __setstate__(self, d):
+        self.__dict__ = d
+        try:
+            import sentencepiece as spm
+        except ImportError:
+            logger.warning("You need to install SentencePiece to use XLNetTokenizer: https://github.com/google/sentencepiece"
+                           "pip install sentencepiece")
+        self.sp_model = spm.SentencePieceProcessor()
+        self.sp_model.Load(self.vocab_file)
+
+    def _tokenize(self, text, return_unicode=True, sample=False):
+        """ Take as input a string and return a list of strings (tokens) for words/sub-words
+        """
+        if not sample:
+            pieces = self.sp_model.EncodeAsPieces(text)
+        else:
+            pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
+
+        # convert back to unicode for py2
+        if six.PY2 and return_unicode:
+            ret_pieces = []
+            for piece in pieces:
+                if isinstance(piece, str):
+                    piece = piece.decode('utf-8')
+                ret_pieces.append(piece)
+            pieces = ret_pieces
+
+        return pieces
+
+    def _convert_token_to_id(self, token):
+        """ Converts a token (str/unicode) in an id using the vocab. """
+        if token.startswith(u"<extra_id_"):
+            l = re.match(r'<extra_id_(\d+)>', token)
+            num = int(l.group(1))
+            return self.vocab_size - num - 1
+        return self.sp_model.piece_to_id(token)
+
+    def _convert_id_to_token(self, index, return_unicode=True):
+        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+        if index < self.sp_model.get_piece_size():
+            token = self.sp_model.IdToPiece(index)
+        else:
+            token = u"<extra_id_{}>".format(self.vocab_size - 1 - index)
+        if six.PY2 and return_unicode and isinstance(token, str):
+            token = token.decode('utf-8')
+        return token
+
+    def convert_tokens_to_string(self, tokens):
+        """ Converts a sequence of tokens (string) in a single string. """
+        out_string = self.sp_model.decode_pieces(tokens)
+        return out_string
+
+    def save_vocabulary(self, save_directory):
+        """ Save the sentencepiece vocabulary (copy original file) and special tokens file
+            to a directory.
+        """
+        if not os.path.isdir(save_directory):
+            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
+            return
+        out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])
+
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+
+        return (out_vocab_file,)
diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py
index 2e0d6caef2..39dded7c92 100644
--- a/transformers/tokenization_utils.py
+++ b/transformers/tokenization_utils.py
@@ -25,7 +25,7 @@ import itertools
 import re
 from io import open
 
-from .file_utils import cached_path, is_tf_available, is_torch_available
+from .file_utils import cached_path, is_remote_url, hf_bucket_url, is_tf_available, is_torch_available
 
 if is_tf_available():
     import tensorflow as tf
@@ -226,7 +226,7 @@ class PreTrainedTokenizer(object):
 
         self.max_len = max_len if max_len is not None else int(1e12)
 
-        # Padding side is right by default and over-riden in subclsses. If specified in the kwargs, it is changed.
+        # Padding side is right by default and over-riden in subclasses. If specified in the kwargs, it is changed.
         self.padding_side = kwargs.pop('padding_side', self.padding_side)
         
         # Added tokens
@@ -255,6 +255,7 @@ class PreTrainedTokenizer(object):
             pretrained_model_name_or_path: either:
 
                 - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a predefined tokenizer that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                 - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``.
                 - (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``.
 
@@ -282,6 +283,9 @@ class PreTrainedTokenizer(object):
             # Download vocabulary from S3 and cache.
             tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
 
+            # Download vocabulary from S3 (user-uploaded) and cache.
+            tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-german-cased')
+
             # If vocabulary files are in a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`)
             tokenizer = BertTokenizer.from_pretrained('./test/saved_model/')
 
@@ -327,12 +331,15 @@ class PreTrainedTokenizer(object):
                 if os.path.isdir(pretrained_model_name_or_path):
                     # If a directory is provided we look for the standard filenames
                     full_file_name = os.path.join(pretrained_model_name_or_path, file_name)
-                else:
+                    if not os.path.exists(full_file_name):
+                        logger.info("Didn't find file {}. We won't load it.".format(full_file_name))
+                        full_file_name = None
+                elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
                     # If a path to a file is provided we use it (will only work for non-BPE tokenizer using a single vocabulary file)
                     full_file_name = pretrained_model_name_or_path
-                if not os.path.exists(full_file_name):
-                    logger.info("Didn't find file {}. We won't load it.".format(full_file_name))
-                    full_file_name = None
+                else:
+                    full_file_name = hf_bucket_url(pretrained_model_name_or_path, postfix=file_name)
+                
                 vocab_files[file_id] = full_file_name
 
             # Look for the additional tokens files
@@ -427,7 +434,11 @@ class PreTrainedTokenizer(object):
                     init_kwargs[key] = value
 
         # Instantiate tokenizer.
-        tokenizer = cls(*init_inputs, **init_kwargs)
+        try:
+            tokenizer = cls(*init_inputs, **init_kwargs)
+        except OSError:
+            OSError("Unable to load vocabulary from file. "
+                    "Please check that the provided vocabulary is accessible and not corrupted.")
 
         # Save inputs and kwargs for saving and re-loading with ``save_pretrained``
         tokenizer.init_inputs = init_inputs
@@ -628,13 +639,14 @@ class PreTrainedTokenizer(object):
             Take care of added tokens.
 
             text: The sequence to be encoded.
-            return_tokens_mapped_to_origin: (optional) Set to True to return the index of each token in the initial whitespace tokenization. (default False).
             **kwargs: passed to the child `self.tokenize()` method
         """
+        all_special_tokens = self.all_special_tokens
+
         def lowercase_text(t):
             # convert non-special tokens to lowercase
-            escaped_special_toks = [re.escape(s_tok) for s_tok in self.all_special_tokens]
-            pattern = r'(^' + r'|'.join(escaped_special_toks) + r')|' + \
+            escaped_special_toks = [re.escape(s_tok) for s_tok in all_special_tokens]
+            pattern = r'(' + r'|'.join(escaped_special_toks) + r')|' + \
                       r'(.+?)'
             return re.sub(
                 pattern,
@@ -663,7 +675,7 @@ class PreTrainedTokenizer(object):
             return result
 
         def split_on_tokens(tok_list, text):
-            if not text:
+            if not text.strip():
                 return []
             if not tok_list:
                 return self._tokenize(text, **kwargs)
@@ -674,17 +686,17 @@ class PreTrainedTokenizer(object):
                 tokenized_text = []
                 for sub_text in text_list:
                     if sub_text not in self.added_tokens_encoder \
-                            and sub_text not in self.all_special_tokens:
+                            and sub_text not in all_special_tokens:
                         tokenized_text += split_on_token(tok, sub_text)
                     else:
                         tokenized_text += [sub_text]
                 text_list = tokenized_text
 
             return list(itertools.chain.from_iterable((self._tokenize(token, **kwargs) if token not \
-                    in self.added_tokens_encoder and token not in self.all_special_tokens \
+                    in self.added_tokens_encoder and token not in all_special_tokens \
                     else [token] for token in tokenized_text)))
 
-        added_tokens = list(self.added_tokens_encoder.keys()) + self.all_special_tokens
+        added_tokens = list(self.added_tokens_encoder.keys()) + all_special_tokens
         tokenized_text = split_on_tokens(added_tokens, text)
         return tokenized_text
 
@@ -878,6 +890,92 @@ class PreTrainedTokenizer(object):
                                       return_overflowing_tokens=return_overflowing_tokens,
                                       return_special_tokens_mask=return_special_tokens_mask)
 
+    def batch_encode_plus(self,
+                          batch_text_or_text_pairs=None,
+                          add_special_tokens=False,
+                          max_length=None,
+                          stride=0,
+                          truncation_strategy='longest_first',
+                          return_tensors=None,
+                          return_input_lengths=False,
+                          return_attention_masks=False,
+                          **kwargs):
+        """
+        Returns a dictionary containing the encoded sequence or sequence pair and additional information:
+        the mask for sequence classification and the overflowing elements if a ``max_length`` is specified.
+
+        Args:
+            batch_text_or_text_pairs: Batch of sequences or pair of sequences to be encoded.
+                This can be a list of string/string-sequences/int-sequences or a list of pair of
+                string/string-sequences/int-sequence (see details in encode_plus)
+            add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative
+                to their model.
+            max_length: if set to a number, will limit the total sequence returned so that it has a maximum length.
+                If there are overflowing tokens, those will be added to the returned dictionary`
+            stride: if set to a number along with max_length, the overflowing tokens returned will contain some tokens
+                from the main sequence returned. The value of this argument defines the number of additional tokens.
+            truncation_strategy: string selected in the following options:
+                - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length
+                    starting from the longest one at each token (when there is a pair of input sequences)
+                - 'only_first': Only truncate the first sequence
+                - 'only_second': Only truncate the second sequence
+                - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
+            return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
+                or PyTorch torch.Tensor instead of a list of python integers.
+            **kwargs: passed to the `self.tokenize()` method
+        """
+        batch_outputs = {}
+        for ids_or_pair_ids in batch_text_or_text_pairs:
+            if isinstance(ids_or_pair_ids, (list, tuple)):
+                assert len(ids_or_pair_ids) == 2
+                ids, pair_ids = ids_or_pair_ids
+            else:
+                ids, pair_ids = ids_or_pair_ids, None
+            outputs = self.encode_plus(ids, pair_ids, add_special_tokens=add_special_tokens, max_length=max_length,
+                                       stride=stride, truncation_strategy=truncation_strategy, return_tensors=None)
+
+            # Append the non-padded length to the output
+            if return_input_lengths:
+                outputs['input_len'] = len(outputs['input_ids'])
+
+            for key, value in outputs.items():
+                if key not in batch_outputs:
+                    batch_outputs[key] = []
+                batch_outputs[key].append(value)
+
+        # Compute longest sequence size
+        max_seq_len = max(map(len, batch_outputs['input_ids']))
+
+        if return_attention_masks:
+            # Allow the model to not give any special attention to padded input
+            batch_outputs['attention_mask'] = [[0] * len(v) for v in batch_outputs['input_ids']]
+
+        if return_tensors is not None:
+
+            # Do the tensor conversion in batch
+            for key, value in batch_outputs.items():
+
+                padded_value = value
+                if key != 'input_len':
+                    # Padding handle
+                    padded_value = [v + [self.pad_token_id if key == 'input_ids' else 1] * (max_seq_len - len(v)) for v in padded_value]
+
+                if return_tensors == 'tf' and is_tf_available():
+                    batch_outputs[key] = tf.constant(padded_value)
+                elif return_tensors == 'pt' and is_torch_available():
+                    batch_outputs[key] = torch.tensor(padded_value)
+                elif return_tensors is not None:
+                    logger.warning("Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format(return_tensors))
+
+        # encoder_attention_mask requires 1 for real token, 0 for padding, just invert value
+        if return_attention_masks:
+            if is_tf_available():
+                batch_outputs['attention_mask'] = tf.abs(batch_outputs['attention_mask'] - 1)
+            else:
+                batch_outputs['attention_mask'] = torch.abs(batch_outputs['attention_mask'] - 1)
+
+        return batch_outputs
+
     def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=True, stride=0,
                           truncation_strategy='longest_first',
                           pad_to_max_length=False,
@@ -917,7 +1015,7 @@ class PreTrainedTokenizer(object):
             return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
                 or PyTorch torch.Tensor instead of a list of python integers.
             return_token_type_ids: (optional) Set to False to avoid returning token_type_ids (default True).
-            return_attention_mask: (optional) Set to False to avoir returning attention mask (default True)
+            return_attention_mask: (optional) Set to False to avoid returning attention mask (default True)
             return_overflowing_tokens: (optional) Set to True to return overflowing token information (default False).
             return_special_tokens_mask: (optional) Set to True to return special tokens mask information (default False).
 
@@ -962,24 +1060,13 @@ class PreTrainedTokenizer(object):
         if add_special_tokens:
             sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
             token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids)
-            special_tokens_mask = self.get_special_tokens_mask(ids, pair_ids)
         else:
             sequence = ids + pair_ids if pair else ids
             token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else [])
-            special_tokens_mask = [0] * (len(ids) + (len(pair_ids) if pair else 0))
+
         if return_special_tokens_mask:
             encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)
 
-        # Prepare inputs as tensors if asked
-        if return_tensors == 'tf' and is_tf_available():
-            sequence = tf.constant([sequence])
-            token_type_ids = tf.constant([token_type_ids])
-        elif return_tensors == 'pt' and is_torch_available():
-            sequence = torch.tensor([sequence])
-            token_type_ids = torch.tensor([token_type_ids])
-        elif return_tensors is not None:
-            logger.warning("Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format(return_tensors))
-
         encoded_inputs["input_ids"] = sequence
         if return_token_type_ids:
             encoded_inputs["token_type_ids"] = token_type_ids
@@ -1003,7 +1090,7 @@ class PreTrainedTokenizer(object):
         )
 
         if pad_to_max_length and max_length is None and self.max_len > 10000:
-            logger.warning("Sequence can't be padded as the maximum  ")
+            logger.warning("Sequence can't be padded as no maximum length is specified and the model maximum length is too high.")
 
         if needs_to_be_padded:
             difference = (max_length if max_length is not None else self.max_len) - len(encoded_inputs["input_ids"])
@@ -1016,10 +1103,9 @@ class PreTrainedTokenizer(object):
                 if return_special_tokens_mask:
                     encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
                 encoded_inputs["input_ids"] = encoded_inputs["input_ids"] + [self.pad_token_id] * difference
-
             elif self.padding_side == 'left':
                 if return_attention_mask:
-                    encoded_inputs["attention_mask"] =  [0] * difference + [1] * len(encoded_inputs["input_ids"])
+                    encoded_inputs["attention_mask"] = [0] * difference + [1] * len(encoded_inputs["input_ids"])
                 if return_token_type_ids:
                     encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs["token_type_ids"]
                 if return_special_tokens_mask:
@@ -1031,7 +1117,26 @@ class PreTrainedTokenizer(object):
             
         elif return_attention_mask:
             encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"])
-            
+
+        # Prepare inputs as tensors if asked
+        if return_tensors == 'tf' and is_tf_available():
+            encoded_inputs["input_ids"] = tf.constant([encoded_inputs["input_ids"]])
+            encoded_inputs["token_type_ids"] = tf.constant([encoded_inputs["token_type_ids"]])
+
+            if "attention_mask" in encoded_inputs:
+                encoded_inputs["attention_mask"] = tf.constant([encoded_inputs["attention_mask"]])
+
+        elif return_tensors == 'pt' and is_torch_available():
+            encoded_inputs["input_ids"] = torch.tensor([encoded_inputs["input_ids"]])
+            encoded_inputs["token_type_ids"] = torch.tensor([encoded_inputs["token_type_ids"]])
+
+            if "attention_mask" in encoded_inputs:
+                encoded_inputs["attention_mask"] = torch.tensor([encoded_inputs["attention_mask"]])
+        elif return_tensors is not None:
+            logger.warning(
+                "Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format(
+                    return_tensors))
+
         return encoded_inputs
 
     def truncate_sequences(self, ids, pair_ids=None, num_tokens_to_remove=0, truncation_strategy='longest_first', stride=0):
@@ -1122,6 +1227,7 @@ class PreTrainedTokenizer(object):
                 return self._convert_id_to_token(ids)
         tokens = []
         for index in ids:
+            index = int(index)
             if skip_special_tokens and index in self.all_special_ids:
                 continue
             if index in self.added_tokens_decoder:
@@ -1165,12 +1271,12 @@ class PreTrainedTokenizer(object):
                 if current_sub_text:
                     sub_texts.append(self.convert_tokens_to_string(current_sub_text))
                     current_sub_text = []
-                sub_texts.append(" " + token)
+                sub_texts.append(token)
             else:
                 current_sub_text.append(token)
         if current_sub_text:
             sub_texts.append(self.convert_tokens_to_string(current_sub_text))
-        text = ''.join(sub_texts)
+        text = ' '.join(sub_texts)
 
         if clean_up_tokenization_spaces:
             clean_text = self.clean_up_tokenization(text)
diff --git a/transformers/tokenization_xlm.py b/transformers/tokenization_xlm.py
index 6c9f8e5e5c..8def80bec4 100644
--- a/transformers/tokenization_xlm.py
+++ b/transformers/tokenization_xlm.py
@@ -549,6 +549,10 @@ class XLMTokenizer(PreTrainedTokenizer):
                                            additional_special_tokens=additional_special_tokens,
                                            **kwargs)
 
+
+        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
+        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens 
+
         # cache of sm.MosesPunctNormalizer instance
         self.cache_moses_punct_normalizer = dict()
         # cache of sm.MosesTokenizer instance
diff --git a/transformers/tokenization_xlm_roberta.py b/transformers/tokenization_xlm_roberta.py
new file mode 100644
index 0000000000..adbc8cd6c7
--- /dev/null
+++ b/transformers/tokenization_xlm_roberta.py
@@ -0,0 +1,182 @@
+# coding=utf-8
+# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License
+""" Tokenization classes for XLM-RoBERTa model."""
+from __future__ import (absolute_import, division, print_function,
+                        unicode_literals)
+
+import logging
+import os
+from shutil import copyfile
+
+import sentencepiece as spm
+from transformers.tokenization_utils import PreTrainedTokenizer
+from .tokenization_xlnet import SPIECE_UNDERLINE
+
+logger = logging.getLogger(__name__)
+
+VOCAB_FILES_NAMES = {'vocab_file': 'sentencepiece.bpe.model'}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    'vocab_file':
+    {
+    'xlm-roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-sentencepiece.bpe.model",
+    'xlm-roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-sentencepiece.bpe.model",
+    'xlm-roberta-large-finetuned-conll02-dutch': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-dutch-sentencepiece.bpe.model",
+    'xlm-roberta-large-finetuned-conll02-spanish': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-spanish-sentencepiece.bpe.model",
+    'xlm-roberta-large-finetuned-conll03-english': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-english-sentencepiece.bpe.model",
+    'xlm-roberta-large-finetuned-conll03-german': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-german-sentencepiece.bpe.model",
+    }
+}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    'xlm-roberta-base': 512,
+    'xlm-roberta-large': 512,
+    'xlm-roberta-large-finetuned-conll02-dutch': 512,
+    'xlm-roberta-large-finetuned-conll02-spanish': 512,
+    'xlm-roberta-large-finetuned-conll03-english': 512,
+    'xlm-roberta-large-finetuned-conll03-german': 512,
+}
+
+class XLMRobertaTokenizer(PreTrainedTokenizer):
+    """
+        Adapted from RobertaTokenizer and XLNetTokenizer
+        SentencePiece based tokenizer. Peculiarities:
+
+            - requires `SentencePiece <https://github.com/google/sentencepiece>`_
+    """
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+
+    def __init__(self, vocab_file, bos_token="<s>", eos_token="</s>", sep_token="</s>",
+                 cls_token="<s>", unk_token="<unk>", pad_token='<pad>', mask_token='<mask>',
+                 **kwargs):
+        super(XLMRobertaTokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token,
+                                                  sep_token=sep_token, cls_token=cls_token, pad_token=pad_token,
+                                                  mask_token=mask_token,
+                                                  **kwargs)
+        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
+        self.max_len_sentences_pair = self.max_len - 4  # take into account special tokens
+        self.sp_model = spm.SentencePieceProcessor()
+        self.sp_model.Load(str(vocab_file))
+        self.vocab_file = vocab_file
+
+        # Original fairseq vocab and spm vocab must be "aligned":
+        # Vocab    |    0    |    1    |   2    |    3    |  4  |  5  |  6  |   7   |   8   |  9
+        # -------- | ------- | ------- | ------ | ------- | --- | --- | --- | ----- | ----- | ----
+        # fairseq  | '<s>'   | '<pad>' | '</s>' | '<unk>' | ',' | '.' | '▁' | 's'   | '▁de' | '-'
+        # spm      | '<unk>' | '<s>'   | '</s>' | ','     | '.' | '▁' | 's' | '▁de' | '-'   | '▁a'
+
+        # Mimic fairseq token-to-id alignment for the first 4 token
+        self.fairseq_tokens_to_ids = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3}
+
+        # The first "real" token "," has position 4 in the original fairseq vocab and position 3 in the spm vocab
+        self.fairseq_offset = 1
+
+        self.fairseq_tokens_to_ids['<mask>'] = len(self.sp_model) + len(self.fairseq_tokens_to_ids)
+        self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
+
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
+        by concatenating and adding special tokens.
+        A RoBERTa sequence has the following format:
+            single sequence: <s> X </s>
+            pair of sequences: <s> A </s></s> B </s>
+        """
+        if token_ids_1 is None:
+            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+        cls = [self.cls_token_id]
+        sep = [self.sep_token_id]
+        return cls + token_ids_0 + sep + sep + token_ids_1 + sep
+
+    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
+        """
+        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
+
+        Args:
+            token_ids_0: list of ids (must not contain special tokens)
+            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
+                for sequence pairs
+            already_has_special_tokens: (default False) Set to True if the token list is already formated with
+                special tokens for the model
+
+        Returns:
+            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+        if already_has_special_tokens:
+            if token_ids_1 is not None:
+                raise ValueError("You should not supply a second sequence if the provided sequence of "
+                                 "ids is already formated with special tokens for the model.")
+            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+
+        if token_ids_1 is None:
+            return [1] + ([0] * len(token_ids_0)) + [1]
+        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
+
+    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
+        """
+        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
+        A RoBERTa sequence pair mask has the following format:
+        0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence
+
+        if token_ids_1 is None, only returns the first portion of the mask (0's).
+        """
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+
+        if token_ids_1 is None:
+            return len(cls + token_ids_0 + sep) * [0]
+        return len(cls + token_ids_0 + sep + sep) * [0] + len(token_ids_1 + sep) * [1]
+
+    @property
+    def vocab_size(self):
+        return len(self.sp_model) + len(self.fairseq_tokens_to_ids)
+
+    def _tokenize(self, text):
+        return self.sp_model.EncodeAsPieces(text)
+
+    def _convert_token_to_id(self, token):
+        """ Converts a token (str/unicode) in an id using the vocab. """
+        if token in self.fairseq_tokens_to_ids:
+            return self.fairseq_tokens_to_ids[token]
+        return self.sp_model.PieceToId(token) + self.fairseq_offset
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+        if index in self.fairseq_ids_to_tokens:
+            return self.fairseq_ids_to_tokens[index]
+        return self.sp_model.IdToPiece(index - self.fairseq_offset)
+
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (strings for sub-words) in a single string."""
+        out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip()
+        return out_string
+
+    def save_vocabulary(self, save_directory):
+        """ Save the sentencepiece vocabulary (copy original file) and special tokens file
+            to a directory.
+        """
+        if not os.path.isdir(save_directory):
+            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
+            return
+        out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])
+
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+
+        return (out_vocab_file,)
diff --git a/try.py b/try.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/utils/link_tester.py b/utils/link_tester.py
new file mode 100644
index 0000000000..fe3990d28c
--- /dev/null
+++ b/utils/link_tester.py
@@ -0,0 +1,79 @@
+""" Link tester.
+
+This little utility reads all the python files in the repository,
+scans for links pointing to S3 and tests the links one by one. Raises an error
+at the end of the scan if at least one link was reported broken.
+"""
+import os
+import re
+import sys
+
+import requests
+
+
+REGEXP_FIND_S3_LINKS = r"""([\"'])(https:\/\/s3)(.*)?\1"""
+
+
+def list_python_files_in_repository():
+    """ List all python files in the repository.
+
+    This function assumes that the script is executed in the root folder.
+    """
+    source_code_files = []
+    for path, subdirs, files in os.walk("."):
+        if "templates" in path:
+            continue
+        for name in files:
+            if ".py" in name and ".pyc" not in name:
+                path_to_files = os.path.join(path, name)
+                source_code_files.append(path_to_files)
+
+    return source_code_files
+
+
+def find_all_links(file_paths):
+    links = []
+    for path in file_paths:
+        links += scan_code_for_links(path)
+
+    return links
+
+
+def scan_code_for_links(source):
+    """ Scans the file to find links using a regular expression.
+    Returns a list of links.
+    """
+    with open(source, 'r') as content:
+        content = content.read()
+        raw_links = re.findall(REGEXP_FIND_S3_LINKS, content)
+        links = [prefix + suffix for _, prefix, suffix in raw_links]
+
+    return links
+
+
+def check_all_links(links):
+    """ Check that the provided links are valid.
+
+    Links are considered valid if a HEAD request to the server
+    returns a 200 status code.
+    """
+    broken_links = []
+    for link in links:
+        head = requests.head(link)
+        if head.status_code != 200:
+            broken_links.append(link)
+
+    return broken_links
+
+
+if __name__ == "__main__":
+    file_paths = list_python_files_in_repository()
+    links = find_all_links(file_paths)
+    broken_links = check_all_links(links)
+    print("Looking for broken links to pre-trained models/configs/tokenizers...")
+    if broken_links:
+        print("The following links did not respond:")
+        for link in broken_links:
+            print("- {}".format(link))
+        sys.exit(1)
+    print("All links are ok.")