Merge branch 'master' into master

2019-10-03 22:43:09 +00:00
parent 4b543c3007 ecc4f1bdfa
commit d3f24dfad7
163 changed files with 13371 additions and 1766 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -1,33 +1,77 @@
 version: 2
 jobs:
-    build_py3:
+    build_py3_torch_and_tf:
-        working_directory: ~/pytorch-transformers
+        working_directory: ~/transformers
        docker:
            - image: circleci/python:3.5
        resource_class: xlarge
        parallelism: 1
        steps:
            - checkout
            - run: sudo pip install torch
            - run: sudo pip install tensorflow==2.0.0-rc0
            - run: sudo pip install --progress-bar off .
            - run: sudo pip install pytest codecov pytest-cov
            - run: sudo pip install tensorboardX scikit-learn
-            - run: python -m pytest -sv ./pytorch_transformers/tests/ --cov
+            - run: python -m pytest -sv ./transformers/tests/ --cov
            - run: codecov
    build_py3_torch:
        working_directory: ~/transformers
        docker:
            - image: circleci/python:3.5
        resource_class: xlarge
        parallelism: 1
        steps:
            - checkout
            - run: sudo pip install torch
            - run: sudo pip install --progress-bar off .
            - run: sudo pip install pytest codecov pytest-cov
            - run: sudo pip install tensorboardX scikit-learn
            - run: python -m pytest -sv ./transformers/tests/ --cov
            - run: python -m pytest -sv ./examples/
            - run: codecov
-    build_py2:
+    build_py3_tf:
-        working_directory: ~/pytorch-transformers
+        working_directory: ~/transformers
        docker:
            - image: circleci/python:3.5
        resource_class: xlarge
        parallelism: 1
        steps:
            - checkout
            - run: sudo pip install tensorflow==2.0.0-rc0
            - run: sudo pip install --progress-bar off .
            - run: sudo pip install pytest codecov pytest-cov
            - run: sudo pip install tensorboardX scikit-learn
            - run: python -m pytest -sv ./transformers/tests/ --cov
            - run: codecov
    build_py2_torch:
        working_directory: ~/transformers
        resource_class: large
        parallelism: 1
        docker:
            - image: circleci/python:2.7
        steps:
            - checkout
            - run: sudo pip install torch
            - run: sudo pip install --progress-bar off .
            - run: sudo pip install pytest codecov pytest-cov
-            - run: python -m pytest -sv ./pytorch_transformers/tests/ --cov
+            - run: python -m pytest -sv ./transformers/tests/ --cov
            - run: codecov
    build_py2_tf:
        working_directory: ~/transformers
        resource_class: large
        parallelism: 1
        docker:
            - image: circleci/python:2.7
        steps:
            - checkout
            - run: sudo pip install tensorflow==2.0.0-rc0
            - run: sudo pip install --progress-bar off .
            - run: sudo pip install pytest codecov pytest-cov
            - run: python -m pytest -sv ./transformers/tests/ --cov
            - run: codecov
    deploy_doc:
-        working_directory: ~/pytorch-transformers
+        working_directory: ~/transformers
        docker:
            - image: circleci/python:3.5
        steps:
@@ -37,7 +81,6 @@ jobs:
            - checkout
            - run: sudo pip install --progress-bar off -r docs/requirements.txt
            - run: sudo pip install --progress-bar off -r requirements.txt
            - run: cd docs/source && ln -s ../../examples/README.md examples.md && cd -
            - run: cd docs && make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html/* $doc:$dir
 workflow_filters: &workflow_filters
    filters:
@@ -48,6 +91,9 @@ workflows:
    version: 2
    build_and_test:
        jobs:
-            - build_py3
+            - build_py3_torch_and_tf
-            - build_py2
+            - build_py3_torch
            - build_py3_tf
            - build_py2_torch
            - build_py2_tf
            - deploy_doc: *workflow_filters
--- a/.coveragerc
+++ b/.coveragerc
@@ -1,5 +1,5 @@
 [run]
-source=pytorch_transformers
+source=transformers
 omit =
    # skip convertion scripts from testing for now
    */convert_*
--- a/.github/ISSUE_TEMPLATE/migration.md
+++ b/.github/ISSUE_TEMPLATE/migration.md
@@ -1,6 +1,6 @@
 ---
 name: "\U0001F4DA Migration from PyTorch-pretrained-Bert"
-about: Report a problem when migrating from PyTorch-pretrained-Bert to PyTorch-Transformers
+about: Report a problem when migrating from PyTorch-pretrained-Bert to Transformers
 ---
 ## 📚 Migration
--- a/.gitignore
+++ b/.gitignore
@@ -130,5 +130,5 @@ runs
 examples/runs
 # data
-data
+/data
 serialization_dir
--- a/README.md
+++ b/README.md
@@ -1,47 +1,86 @@
-# 👾 PyTorch-Transformers
+<p align="center">
    <br>
    <img src="https://raw.githubusercontent.com/huggingface/transformers/master/docs/source/imgs/transformers_logo_name.png" width="400"/>
    <br>
 <p>
 <p align="center">
    <a href="https://circleci.com/gh/huggingface/transformers">
        <img alt="Build" src="https://img.shields.io/circleci/build/github/huggingface/transformers/master">
    </a>
    <a href="https://github.com/huggingface/transformers/blob/master/LICENSE">
        <img alt="GitHub" src="https://img.shields.io/github/license/huggingface/transformers.svg?color=blue">
    </a>
    <a href="https://huggingface.co/transformers/index.html">
        <img alt="Documentation" src="https://img.shields.io/website/http/huggingface.co/transformers/index.html.svg?down_color=red&down_message=offline&up_message=online">
    </a>
    <a href="https://github.com/huggingface/transformers/releases">
        <img alt="GitHub release" src="https://img.shields.io/github/release/huggingface/transformers.svg">
    </a>
 </p>
-[![CircleCI](https://circleci.com/gh/huggingface/pytorch-transformers.svg?style=svg)](https://circleci.com/gh/huggingface/pytorch-transformers)
+<h3 align="center">
 <p>State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch
 </h3>
-PyTorch-Transformers (formerly known as `pytorch-pretrained-bert`) is a library of state-of-the-art pre-trained models for Natural Language Processing (NLP).
+🤗 Transformers (formerly known as `pytorch-transformers` and `pytorch-pretrained-bert`) provides state-of-the-art general-purpose architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet...) for Natural Language Understanding (NLU) and Natural Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between TensorFlow 2.0 and PyTorch.
-The library currently contains PyTorch implementations, pre-trained model weights, usage scripts and conversion utilities for the following models:
+### Features
-1. **[BERT](https://github.com/google-research/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
+- As easy to use as pytorch-transformers
-2. **[GPT](https://github.com/openai/finetune-transformer-lm)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
+- As powerful and concise as Keras
-3. **[GPT-2](https://blog.openai.com/better-language-models/)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
+- High performance on NLU and NLG tasks
-4. **[Transformer-XL](https://github.com/kimiyoung/transformer-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
+- Low barrier to entry for educators and practitioners
-5. **[XLNet](https://github.com/zihangdai/xlnet/)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
+
-6. **[XLM](https://github.com/facebookresearch/XLM/)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
+State-of-the-art NLP for everyone
-7. **[RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/roberta)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
+- Deep learning researchers
-8. **[DistilBERT](https://github.com/huggingface/pytorch-transformers/tree/master/examples/distillation)** (from HuggingFace), released together with the blogpost [Smaller, faster, cheaper, lighter: Introducing DistilBERT, a distilled version of BERT](https://medium.com/huggingface/distilbert-8cf3380435b5
+- Hands-on practitioners
-) by Victor Sanh, Lysandre Debut and Thomas Wolf.
+- AI/ML/NLP teachers and educators
 Lower compute costs, smaller carbon footprint
 - Researchers can share trained models instead of always retraining
 - Practitioners can reduce compute time and production costs
 - 8 architectures with over 30 pretrained models, some in more than 100 languages
 Choose the right framework for every part of a model's lifetime
 - Train state-of-the-art models in 3 lines of code
 - Deep interoperability between TensorFlow 2.0 and PyTorch models
 - Move a single model between TF2.0/PyTorch frameworks at will
 - Seamlessly pick the right framework for training, evaluation, production
 These implementations have been tested on several datasets (see the example scripts) and should match the performances of the original implementations (e.g. ~93 F1 on SQuAD for BERT Whole-Word-Masking, ~88 F1 on RocStories for OpenAI GPT, ~18.3 perplexity on WikiText 103 for Transformer-XL, ~0.916 Peason R coefficient on STS-B for XLNet). You can find more details on the performances in the Examples section of the [documentation](https://huggingface.co/pytorch-transformers/examples.html).
 | Section | Description |
 |-|-|
 | [Installation](#installation) | How to install the package |
 | [Model architectures](#model-architectures) | Architectures (with pretrained weights) |
 | [Online demo](#online-demo) | Experimenting with this repo’s text generation capabilities |
 | [Quick tour: Usage](#quick-tour) | Tokenizers & models usage: Bert and GPT-2 |
 | [Quick tour: TF 2.0 and PyTorch ](#Quick-tour-TF-20-training-and-PyTorch-interoperability) | Train a TF 2.0 model in 10 lines of code, load it in PyTorch |
 | [Quick tour: Fine-tuning/usage scripts](#quick-tour-of-the-fine-tuningusage-scripts) | Using provided scripts: GLUE, SQuAD and Text generation |
-| [Migrating from pytorch-pretrained-bert to pytorch-transformers](#Migrating-from-pytorch-pretrained-bert-to-pytorch-transformers) | Migrating your code from pytorch-pretrained-bert to pytorch-transformers |
+| [Migrating from pytorch-transformers to transformers](#Migrating-from-pytorch-transformers-to-transformers) | Migrating your code from pytorch-pretrained-bert to transformers |
-| [Documentation](https://huggingface.co/pytorch-transformers/) | Full API documentation and more |
+| [Migrating from pytorch-pretrained-bert to pytorch-transformers](#Migrating-from-pytorch-pretrained-bert-to-transformers) | Migrating your code from pytorch-pretrained-bert to transformers |
 | [Documentation](https://huggingface.co/transformers/) | Full API documentation and more |
 ## Installation
-This repo is tested on Python 2.7 and 3.5+ (examples are tested only on python 3.5+) and PyTorch 1.0.0+
+This repo is tested on Python 2.7 and 3.5+ (examples are tested only on python 3.5+), PyTorch 1.0.0+ and TensorFlow 2.0.0-rc1
 ### With pip
-PyTorch-Transformers can be installed by pip as follows:
+First you need to install one of, or both, TensorFlow 2.0 and PyTorch.
 Please refere to [TensorFlow installation page](https://www.tensorflow.org/install/pip#tensorflow-2.0-rc-is-available) and/or [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) regarding the specific install command for your platform.
 When TensorFlow 2.0 and/or PyTorch has been installed, 🤗 Transformers can be installed using pip as follows:
 ```bash
-pip install pytorch-transformers
+pip install transformers
 ```
 ### From source
-Clone the repository and run:
+Here also, you first need to install one of, or both, TensorFlow 2.0 and PyTorch.
 Please refere to [TensorFlow installation page](https://www.tensorflow.org/install/pip#tensorflow-2.0-rc-is-available) and/or [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) regarding the specific install command for your platform.
 When TensorFlow 2.0 and/or PyTorch has been installed, you can install from source by cloning the repository and running:
 ```bash
 pip install [--editable] .
@@ -49,14 +88,16 @@ pip install [--editable] .
 ### Tests
-A series of tests is included for the library and the example scripts. Library tests can be found in the [tests folder](https://github.com/huggingface/pytorch-transformers/tree/master/pytorch_transformers/tests) and examples tests in the [examples folder](https://github.com/huggingface/pytorch-transformers/tree/master/examples).
+A series of tests are included for the library and the example scripts. Library tests can be found in the [tests folder](https://github.com/huggingface/transformers/tree/master/transformers/tests) and examples tests in the [examples folder](https://github.com/huggingface/transformers/tree/master/examples).
 These tests can be run using `pytest` (install pytest if needed with `pip install pytest`).
 Depending on which framework is installed (TensorFlow 2.0 and/or PyTorch), the irrelevant tests will be skipped. Ensure that both frameworks are installed if you want to execute all tests.
 You can run the tests from the root of the cloned repository with the commands:
 ```bash
-python -m pytest -sv ./pytorch_transformers/tests/
+python -m pytest -sv ./transformers/tests/
 python -m pytest -sv ./examples/
 ```
@@ -66,8 +107,22 @@ You should check out our [`swift-coreml-transformers`](https://github.com/huggin
 It contains an example of a conversion script from a Pytorch trained Transformer model (here, `GPT-2`) to a CoreML model that runs on iOS devices.
-At some point in the future, you'll be able to seamlessly move from pre-training or fine-tuning models in PyTorch to productizing them in CoreML,
+At some point in the future, you'll be able to seamlessly move from pre-training or fine-tuning models to productizing them in CoreML, or prototype a model or an app in CoreML then research its hyperparameters or architecture from TensorFlow 2.0 and/or PyTorch. Super exciting!
-or prototype a model or an app in CoreML then research its hyperparameters or architecture from PyTorch. Super exciting!
+
 ## Model architectures
 🤗 Transformers currently provides 8 NLU/NLG architectures:
 1. **[BERT](https://github.com/google-research/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
 2. **[GPT](https://github.com/openai/finetune-transformer-lm)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
 3. **[GPT-2](https://blog.openai.com/better-language-models/)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
 4. **[Transformer-XL](https://github.com/kimiyoung/transformer-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
 5. **[XLNet](https://github.com/zihangdai/xlnet/)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
 6. **[XLM](https://github.com/facebookresearch/XLM/)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
 7. **[RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/roberta)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
 8. **[DistilBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/distillation).
 These implementations have been tested on several datasets (see the example scripts) and should match the performances of the original implementations (e.g. ~93 F1 on SQuAD for BERT Whole-Word-Masking, ~88 F1 on RocStories for OpenAI GPT, ~18.3 perplexity on WikiText 103 for Transformer-XL, ~0.916 Peason R coefficient on STS-B for XLNet). You can find more details on the performances in the Examples section of the [documentation](https://huggingface.co/transformers/examples.html).
 ## Online demo
@@ -80,22 +135,25 @@ You can use it to experiment with completions generated by `GPT2Model`, `Transfo
 ## Quick tour
-Let's do a very quick overview of PyTorch-Transformers. Detailed examples for each model architecture (Bert, GPT, GPT-2, Transformer-XL, XLNet and XLM) can be found in the [full documentation](https://huggingface.co/pytorch-transformers/).
+Let's do a very quick overview of the model architectures in 🤗 Transformers. Detailed examples for each model architecture (Bert, GPT, GPT-2, Transformer-XL, XLNet and XLM) can be found in the [full documentation](https://huggingface.co/transformers/).
 ```python
 import torch
-from pytorch_transformers import *
+from transformers import *
-# PyTorch-Transformers has a unified API
+# Transformers has a unified API
-# for 7 transformer architectures and 30 pretrained weights.
+# for 8 transformer architectures and 30 pretrained weights.
 #          Model          | Tokenizer          | Pretrained weights shortcut
-MODELS = [(BertModel,       BertTokenizer,      'bert-base-uncased'),
+MODELS = [(BertModel,       BertTokenizer,       'bert-base-uncased'),
-          (OpenAIGPTModel,  OpenAIGPTTokenizer, 'openai-gpt'),
+          (OpenAIGPTModel,  OpenAIGPTTokenizer,  'openai-gpt'),
-          (GPT2Model,       GPT2Tokenizer,      'gpt2'),
+          (GPT2Model,       GPT2Tokenizer,       'gpt2'),
-          (TransfoXLModel,  TransfoXLTokenizer, 'transfo-xl-wt103'),
+          (TransfoXLModel,  TransfoXLTokenizer,  'transfo-xl-wt103'),
-          (XLNetModel,      XLNetTokenizer,     'xlnet-base-cased'),
+          (XLNetModel,      XLNetTokenizer,      'xlnet-base-cased'),
-          (XLMModel,        XLMTokenizer,       'xlm-mlm-enfr-1024'),
+          (XLMModel,        XLMTokenizer,        'xlm-mlm-enfr-1024'),
-          (RobertaModel,    RobertaTokenizer,   'roberta-base')]
+          (DistilBertModel, DistilBertTokenizer, 'distilbert-base-uncased'),
          (RobertaModel,    RobertaTokenizer,    'roberta-base')]
 # To use TensorFlow 2.0 versions of the models, simply prefix the class names with 'TF', e.g. `TFRobertaModel` is the TF 2.0 counterpart of the PyTorch model `RobertaModel`
 # Let's encode some text in a sequence of hidden-states using each model:
 for model_class, tokenizer_class, pretrained_weights in MODELS:
@@ -121,24 +179,71 @@ for model_class in BERT_MODEL_CLASSES:
    # Load pretrained model/tokenizer
    model = model_class.from_pretrained('bert-base-uncased')
-# Models can return full list of hidden-states & attentions weights at each layer
+    # Models can return full list of hidden-states & attentions weights at each layer
-model = model_class.from_pretrained(pretrained_weights,
+    model = model_class.from_pretrained(pretrained_weights,
-                                    output_hidden_states=True,
+                                        output_hidden_states=True,
-                                    output_attentions=True)
+                                        output_attentions=True)
-input_ids = torch.tensor([tokenizer.encode("Let's see all hidden-states and attentions on this text")])
+    input_ids = torch.tensor([tokenizer.encode("Let's see all hidden-states and attentions on this text")])
-all_hidden_states, all_attentions = model(input_ids)[-2:]
+    all_hidden_states, all_attentions = model(input_ids)[-2:]
-# Models are compatible with Torchscript
+    # Models are compatible with Torchscript
-model = model_class.from_pretrained(pretrained_weights, torchscript=True)
+    model = model_class.from_pretrained(pretrained_weights, torchscript=True)
-traced_model = torch.jit.trace(model, (input_ids,))
+    traced_model = torch.jit.trace(model, (input_ids,))
-# Simple serialization for models and tokenizers
+    # Simple serialization for models and tokenizers
-model.save_pretrained('./directory/to/save/')  # save
+    model.save_pretrained('./directory/to/save/')  # save
-model = model_class.from_pretrained('./directory/to/save/')  # re-load
+    model = model_class.from_pretrained('./directory/to/save/')  # re-load
-tokenizer.save_pretrained('./directory/to/save/')  # save
+    tokenizer.save_pretrained('./directory/to/save/')  # save
-tokenizer = tokenizer_class.from_pretrained('./directory/to/save/')  # re-load
+    tokenizer = BertTokenizer.from_pretrained('./directory/to/save/')  # re-load
-# SOTA examples for GLUE, SQUAD, text generation...
+    # SOTA examples for GLUE, SQUAD, text generation...
 ```
 ## Quick tour TF 2.0 training and PyTorch interoperability
 Let's do a quick example of how a TensorFlow 2.0 model can be trained in 12 lines of code with 🤗 Transformers and then loaded in PyTorch for fast inspection/tests.
 ```python
 import tensorflow as tf
 import tensorflow_datasets
 from transformers import *
 # Load dataset, tokenizer, model from pretrained model/vocabulary
 tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
 model = TFBertForSequenceClassification.from_pretrained('bert-base-cased')
 data = tensorflow_datasets.load('glue/mrpc')
 # Prepare dataset for GLUE as a tf.data.Dataset instance
 train_dataset = glue_convert_examples_to_features(data['train'], tokenizer, max_length=128, task='mrpc')
 valid_dataset = glue_convert_examples_to_features(data['validation'], tokenizer, max_length=128, task='mrpc')
 train_dataset = train_dataset.shuffle(100).batch(32).repeat(2)
 valid_dataset = valid_dataset.batch(64)
 # Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule 
 optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
 loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
 metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
 model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
 # Train and evaluate using tf.keras.Model.fit()
 history = model.fit(train_dataset, epochs=2, steps_per_epoch=115,
                    validation_data=valid_dataset, validation_steps=7)
 # Load the TensorFlow model in PyTorch for inspection
 model.save_pretrained('./save/')
 pytorch_model = BertForSequenceClassification.from_pretrained('./save/', from_tf=True)
 # Quickly test a few predictions - MRPC is a paraphrasing task, let's see if our model learned the task
 sentence_0 = "This research was consistent with his findings."
 sentence_1 = "His findings were compatible with this research."
 sentence_2 = "His findings were not compatible with this research."
 inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors='pt')
 inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors='pt')
 pred_1 = pytorch_model(**inputs_1)[0].argmax().item()
 pred_2 = pytorch_model(**inputs_2)[0].argmax().item()
 print("sentence_1 is", "a paraphrase" if pred_1 else "not a paraphrase", "of sentence_0")
 print("sentence_2 is", "a paraphrase" if pred_2 else "not a paraphrase", "of sentence_0")
 ```
 ## Quick tour of the fine-tuning/usage scripts
@@ -288,7 +393,7 @@ This is the model provided as `bert-large-uncased-whole-word-masking-finetuned-s
 ### `run_generation.py`: Text generation with GPT, GPT-2, Transformer-XL and XLNet
 A conditional generation script is also included to generate text from a prompt.
-The generation script includes the [tricks](https://github.com/rusiaaman/XLNet-gen#methodology) proposed by Aman Rusia to get high quality generation with memory models like Transformer-XL and XLNet (include a predefined text to make short inputs longer).
+The generation script includes the [tricks](https://github.com/rusiaaman/XLNet-gen#methodology) proposed by Aman Rusia to get high-quality generation with memory models like Transformer-XL and XLNet (include a predefined text to make short inputs longer).
 Here is how to run the script with the small version of OpenAI GPT-2 model:
@@ -299,19 +404,32 @@ python ./examples/run_generation.py \
    --model_name_or_path=gpt2 \
 ```
-## Migrating from pytorch-pretrained-bert to pytorch-transformers
+## Migrating from pytorch-transformers to transformers
-Here is a quick summary of what you should take care of when migrating from `pytorch-pretrained-bert` to `pytorch-transformers`
+Here is a quick summary of what you should take care of when migrating from `pytorch-transformers` to `transformers`.
 ### Positional order of some models' keywords inputs (`attention_mask`, `token_type_ids`...) changed
 To be able to use Torchscript (see #1010, #1204 and #1195) the specific order of some models **keywords inputs** (`attention_mask`, `token_type_ids`...) has been changed.
 If you used to call the models with keyword names for keyword arguments, e.g. `model(inputs_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)`, this should not cause any change.
 If you used to call the models with positional inputs for keyword arguments, e.g. `model(inputs_ids, attention_mask, token_type_ids)`, you may have to double check the exact order of input arguments.
 ## Migrating from pytorch-pretrained-bert to transformers
 Here is a quick summary of what you should take care of when migrating from `pytorch-pretrained-bert` to `transformers`.
 ### Models always output `tuples`
-The main breaking change when migrating from `pytorch-pretrained-bert` to `pytorch-transformers` is that the models forward method always outputs a `tuple` with various elements depending on the model and the configuration parameters.
+The main breaking change when migrating from `pytorch-pretrained-bert` to `transformers` is that the models forward method always outputs a `tuple` with various elements depending on the model and the configuration parameters.
-The exact content of the tuples for each model are detailed in the models' docstrings and the [documentation](https://huggingface.co/pytorch-transformers/).
+The exact content of the tuples for each model is detailed in the models' docstrings and the [documentation](https://huggingface.co/transformers/).
 In pretty much every case, you will be fine by taking the first element of the output as the output you previously used in `pytorch-pretrained-bert`.
-Here is a `pytorch-pretrained-bert` to `pytorch-transformers` conversion example for a `BertForSequenceClassification` classification model:
+Here is a `pytorch-pretrained-bert` to `transformers` conversion example for a `BertForSequenceClassification` classification model:
 ```python
 # Let's load our model
@@ -320,11 +438,11 @@ model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
 # If you used to have this line in pytorch-pretrained-bert:
 loss = model(input_ids, labels=labels)
-# Now just use this line in pytorch-transformers to extract the loss from the output tuple:
+# Now just use this line in transformers to extract the loss from the output tuple:
 outputs = model(input_ids, labels=labels)
 loss = outputs[0]
-# In pytorch-transformers you can also have access to the logits:
+# In transformers you can also have access to the logits:
 loss, logits = outputs[:2]
 # And even the attention weights if you configure the model to output them (and other outputs too, see the docstrings and documentation)
@@ -333,13 +451,17 @@ outputs = model(input_ids, labels=labels)
 loss, logits, attentions = outputs
 ```
 ### Using hidden states
 By enabling the configuration option `output_hidden_states`, it was possible to retrieve the last hidden states of the encoder. In `pytorch-transformers` as well as `transformers` the return value has changed slightly: `all_hidden_states` now also includes the hidden state of the embeddings in addition to those of the encoding layers. This allows users to easily access the embeddings final state.
 ### Serialization
-Breaking change in the `from_pretrained()`method:
+Breaking change in the `from_pretrained()` method:
 1. Models are now set in evaluation mode by default when instantiated with the `from_pretrained()` method. To train them don't forget to set them back in training mode (`model.train()`) to activate the dropout modules.
-2. The additional `*input` and `**kwargs` arguments supplied to the `from_pretrained()` method used to be directly passed to the underlying model's class `__init__()` method. They are now used to update the model configuration attribute instead which can break derived model classes build based on the previous `BertForSequenceClassification` examples. We are working on a way to mitigate this breaking change in [#866](https://github.com/huggingface/pytorch-transformers/pull/866) by forwarding the the model `__init__()` method (i) the provided positional arguments and (ii) the keyword arguments which do not match any configuration class attributes.
+2. The additional `*input` and `**kwargs` arguments supplied to the `from_pretrained()` method used to be directly passed to the underlying model's class `__init__()` method. They are now used to update the model configuration attribute instead which can break derived model classes build based on the previous `BertForSequenceClassification` examples. We are working on a way to mitigate this breaking change in [#866](https://github.com/huggingface/transformers/pull/866) by forwarding the the model `__init__()` method (i) the provided positional arguments and (ii) the keyword arguments which do not match any configuration class attributes.
 Also, while not a breaking change, the serialization methods have been standardized and you probably should switch to the new method `save_pretrained(save_directory)` if you were using any other serialization method before.
@@ -396,7 +518,7 @@ for batch in train_data:
    loss.backward()
    optimizer.step()
-### In PyTorch-Transformers, optimizer and schedules are splitted and instantiated like this:
+### In Transformers, optimizer and schedules are splitted and instantiated like this:
 optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False)  # To reproduce BertAdam specific behavior set correct_bias=False
 scheduler = WarmupLinearSchedule(optimizer, warmup_steps=num_warmup_steps, t_total=num_total_steps)  # PyTorch scheduler
 ### and used like this:
@@ -411,4 +533,4 @@ for batch in train_data:
 ## Citation
-At the moment, there is no paper associated to PyTorch-Transformers but we are working on preparing one. In the meantime, please include a mention of the library and a link to the present repository if you use this work in a published or open-source project.
+At the moment, there is no paper associated with Transformers but we are working on preparing one. In the meantime, please include a mention of the library and a link to the present repository if you use this work in a published or open-source project.
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -2,6 +2,6 @@ FROM pytorch/pytorch:latest
 RUN git clone https://github.com/NVIDIA/apex.git && cd apex && python setup.py install --cuda_ext --cpp_ext
-RUN pip install pytorch_transformers
+RUN pip install transformers
 WORKDIR /workspace
--- a/docs/README.md
+++ b/docs/README.md
@@ -34,11 +34,11 @@ pip install recommonmark
 ## Building the documentation
-Make sure that there is a symlink from the `example` file (in /examples) inside the source folder. Run the followig 
+Make sure that there is a symlink from the `example` file (in /examples) inside the source folder. Run the following 
 command to generate it:
 ```bash
-ln -s ../../examples/README.md source/examples.md
+ln -s ../../examples/README.md examples.md
 ```
 Once you have setup `sphinx`, you can build the documentation by running the following command in the `/docs` folder:
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -26,4 +26,7 @@ sphinxcontrib-jsmath==1.0.1
 sphinxcontrib-qthelp==1.0.2
 sphinxcontrib-serializinghtml==1.1.3
 urllib3==1.25.3
-sphinx-markdown-tables==0.0.9
+sphinx-markdown-tables==0.0.9
 numpy==1.17.2
 tensorflow==2.0.0rc2
 torch==1.2.0
--- a/docs/source/_static/js/custom.js
+++ b/docs/source/_static/js/custom.js
--- a/docs/source/bertology.rst
+++ b/docs/source/bertology.rst
@@ -15,4 +15,4 @@ In order to help this new field develop, we have included a few additional featu
 * accessing all the attention weights for each head of BERT/GPT/GPT-2,
 * retrieving heads output values and gradients to be able to compute head importance score and prune head as explained in https://arxiv.org/abs/1905.10650.
-To help you understand and use these features, we have added a specific example script: `bertology.py <https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_bertology.py>`_ while extract information and prune a model pre-trained on GLUE.
+To help you understand and use these features, we have added a specific example script: `bertology.py <https://github.com/huggingface/transformers/blob/master/examples/run_bertology.py>`_ while extract information and prune a model pre-trained on GLUE.
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -19,14 +19,14 @@ sys.path.insert(0, os.path.abspath('../..'))
 # -- Project information -----------------------------------------------------
-project = u'pytorch-transformers'
+project = u'transformers'
 copyright = u'2019, huggingface'
 author = u'huggingface'
 # The short X.Y version
 version = u''
 # The full version, including alpha/beta/rc tags
-release = u'1.2.0'
+release = u'2.0.0'
 # -- General configuration ---------------------------------------------------
@@ -109,7 +109,7 @@ html_static_path = ['_static']
 # -- Options for HTMLHelp output ---------------------------------------------
 # Output file base name for HTML help builder.
-htmlhelp_basename = 'pytorch-transformersdoc'
+htmlhelp_basename = 'transformersdoc'
 # -- Options for LaTeX output ------------------------------------------------
@@ -136,7 +136,7 @@ latex_elements = {
 # (source start file, target name, title,
 #  author, documentclass [howto, manual, or own class]).
 latex_documents = [
-    (master_doc, 'pytorch-transformers.tex', u'pytorch-transformers Documentation',
+    (master_doc, 'transformers.tex', u'transformers Documentation',
     u'huggingface', 'manual'),
 ]
@@ -146,7 +146,7 @@ latex_documents = [
 # One entry per manual page. List of tuples
 # (source start file, name, description, authors, manual section).
 man_pages = [
-    (master_doc, 'pytorch-transformers', u'pytorch-transformers Documentation',
+    (master_doc, 'transformers', u'transformers Documentation',
     [author], 1)
 ]
@@ -157,8 +157,8 @@ man_pages = [
 # (source start file, target name, title, author,
 #  dir menu entry, description, category)
 texinfo_documents = [
-    (master_doc, 'pytorch-transformers', u'pytorch-transformers Documentation',
+    (master_doc, 'transformers', u'transformers Documentation',
-     author, 'pytorch-transformers', 'One line description of project.',
+     author, 'transformers', 'One line description of project.',
     'Miscellaneous'),
 ]
--- a/docs/source/converting_tensorflow_models.rst
+++ b/docs/source/converting_tensorflow_models.rst
@@ -6,7 +6,7 @@ A command-line interface is provided to convert original Bert/GPT/GPT-2/Transfor
 BERT
 ^^^^
-You can convert any TensorFlow checkpoint for BERT (in particular `the pre-trained models released by Google <https://github.com/google-research/bert#pre-trained-models>`_\ ) in a PyTorch save file by using the `convert_tf_checkpoint_to_pytorch.py <https://github.com/huggingface/pytorch-transformers/blob/master/pytorch_transformers/convert_tf_checkpoint_to_pytorch.py>`_ script.
+You can convert any TensorFlow checkpoint for BERT (in particular `the pre-trained models released by Google <https://github.com/google-research/bert#pre-trained-models>`_\ ) in a PyTorch save file by using the `convert_tf_checkpoint_to_pytorch.py <https://github.com/huggingface/transformers/blob/master/transformers/convert_tf_checkpoint_to_pytorch.py>`_ script.
 This CLI takes as input a TensorFlow checkpoint (three files starting with ``bert_model.ckpt``\ ) and the associated configuration file (\ ``bert_config.json``\ ), and creates a PyTorch model for this configuration, loads the weights from the TensorFlow checkpoint in the PyTorch model and saves the resulting model in a standard PyTorch save file that can be imported using ``torch.load()`` (see examples in `run_bert_extract_features.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_extract_features.py>`_\ , `run_bert_classifier.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_classifier.py>`_ and `run_bert_squad.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_squad.py>`_\ ).
@@ -20,7 +20,7 @@ Here is an example of the conversion process for a pre-trained ``BERT-Base Uncas
   export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12
-   pytorch_transformers bert \
+   transformers bert \
     $BERT_BASE_DIR/bert_model.ckpt \
     $BERT_BASE_DIR/bert_config.json \
     $BERT_BASE_DIR/pytorch_model.bin
@@ -36,7 +36,7 @@ Here is an example of the conversion process for a pre-trained OpenAI GPT model,
   export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights
-   pytorch_transformers gpt \
+   transformers gpt \
     $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \
     $PYTORCH_DUMP_OUTPUT \
     [OPENAI_GPT_CONFIG]
@@ -50,7 +50,7 @@ Here is an example of the conversion process for a pre-trained OpenAI GPT-2 mode
   export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/gpt2/pretrained/weights
-   pytorch_transformers gpt2 \
+   transformers gpt2 \
     $OPENAI_GPT2_CHECKPOINT_PATH \
     $PYTORCH_DUMP_OUTPUT \
     [OPENAI_GPT2_CONFIG]
@@ -64,7 +64,7 @@ Here is an example of the conversion process for a pre-trained Transformer-XL mo
   export TRANSFO_XL_CHECKPOINT_FOLDER_PATH=/path/to/transfo/xl/checkpoint
-   pytorch_transformers transfo_xl \
+   transformers transfo_xl \
     $TRANSFO_XL_CHECKPOINT_FOLDER_PATH \
     $PYTORCH_DUMP_OUTPUT \
     [TRANSFO_XL_CONFIG]
@@ -80,7 +80,7 @@ Here is an example of the conversion process for a pre-trained XLNet model, fine
   export TRANSFO_XL_CHECKPOINT_PATH=/path/to/xlnet/checkpoint
   export TRANSFO_XL_CONFIG_PATH=/path/to/xlnet/config
-   pytorch_transformers xlnet \
+   transformers xlnet \
     $TRANSFO_XL_CHECKPOINT_PATH \
     $TRANSFO_XL_CONFIG_PATH \
     $PYTORCH_DUMP_OUTPUT \
@@ -96,6 +96,6 @@ Here is an example of the conversion process for a pre-trained XLM model:
   export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint
-   pytorch_transformers xlm \
+   transformers xlm \
     $XLM_CHECKPOINT_PATH \
     $PYTORCH_DUMP_OUTPUT \
--- a/docs/source/examples.md
+++ b/docs/source/examples.md
@@ -0,0 +1 @@
 ../../examples/README.md
--- a/docs/source/imgs/transformers_logo_name.png
+++ b/docs/source/imgs/transformers_logo_name.png
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -1,9 +1,43 @@
-Pytorch-Transformers
+Transformers
 ================================================================================================================================================
-PyTorch-Transformers is a library of state-of-the-art pre-trained models for Natural Language Processing (NLP).
+🤗 Transformers (formerly known as `pytorch-transformers` and `pytorch-pretrained-bert`) provides general-purpose architectures
 (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet...) for Natural Language Understanding (NLU) and Natural Language Generation
 (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between TensorFlow 2.0 and PyTorch.
-The library currently contains PyTorch implementations, pre-trained model weights, usage scripts and conversion utilities for the following models:
+This is the documentation of our repository `transformers <https://github.com/huggingface/transformers>`__.
 Features
 ---------------------------------------------------
 - As easy to use as pytorch-transformers
 - As powerful and concise as Keras
 - High performance on NLU and NLG tasks
 - Low barrier to entry for educators and practitioners
 State-of-the-art NLP for everyone:
 - Deep learning researchers
 - Hands-on practitioners
 - AI/ML/NLP teachers and educators
 Lower compute costs, smaller carbon footprint:
 - Researchers can share trained models instead of always retraining
 - Practitioners can reduce compute time and production costs
 - 8 architectures with over 30 pretrained models, some in more than 100 languages
 Choose the right framework for every part of a model's lifetime:
 - Train state-of-the-art models in 3 lines of code
 - Deep interoperability between TensorFlow 2.0 and PyTorch models
 - Move a single model between TF2.0/PyTorch frameworks at will
 - Seamlessly pick the right framework for training, evaluation, production
 Contents
 ---------------------------------
 The library currently contains PyTorch and Tensorflow implementations, pre-trained model weights, usage scripts and conversion utilities for the following models:
 1. `BERT <https://github.com/google-research/bert>`_ (from Google) released with the paper `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding <https://arxiv.org/abs/1810.04805>`_ by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
 2. `GPT <https://github.com/openai/finetune-transformer-lm>`_ (from OpenAI) released with the paper `Improving Language Understanding by Generative Pre-Training <https://blog.openai.com/language-unsupervised>`_ by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
@@ -12,7 +46,7 @@ The library currently contains PyTorch implementations, pre-trained model weight
 5. `XLNet <https://github.com/zihangdai/xlnet>`_ (from Google/CMU) released with the paper `XLNet: Generalized Autoregressive Pretraining for Language Understanding <https://arxiv.org/abs/1906.08237>`_ by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
 6. `XLM <https://github.com/facebookresearch/XLM>`_ (from Facebook) released together with the paper `Cross-lingual Language Model Pretraining <https://arxiv.org/abs/1901.07291>`_ by Guillaume Lample and Alexis Conneau.
 7. `RoBERTa <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`_ (from Facebook), released together with the paper a `Robustly Optimized BERT Pretraining Approach <https://arxiv.org/abs/1907.11692>`_ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
-8. `DistilBERT <https://huggingface.co/pytorch-transformers/model_doc/distilbert.html>`_ (from HuggingFace) released together with the blog post `Smaller, faster, cheaper, lighter: Introducing DistilBERT, a distilled version of BERT <https://medium.com/huggingface/distilbert-8cf3380435b5>`_ by Victor Sanh, Lysandre Debut and Thomas Wolf.
+8. `DistilBERT <https://huggingface.co/transformers/model_doc/distilbert.html>`_ (from HuggingFace) released together with the paper `DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter <https://arxiv.org/abs/1910.01108>`_ by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into `DistilGPT2 <https://github.com/huggingface/transformers/tree/master/examples/distillation>`_.
 .. toctree::
    :maxdepth: 2
@@ -37,6 +71,7 @@ The library currently contains PyTorch implementations, pre-trained model weight
    main_classes/model
    main_classes/tokenizer
    main_classes/optimizer_schedules
    main_classes/processors
 .. toctree::
    :maxdepth: 2
--- a/docs/source/installation.rst
+++ b/docs/source/installation.rst
@@ -1,7 +1,7 @@
 Installation
 ================================================
-PyTorch-Transformers is tested on Python 2.7 and 3.5+ (examples are tested only on python 3.5+) and PyTorch 1.1.0
+Transformers is tested on Python 2.7 and 3.5+ (examples are tested only on python 3.5+) and PyTorch 1.1.0
 With pip
 ^^^^^^^^
@@ -10,7 +10,7 @@ PyTorch Transformers can be installed using pip as follows:
 .. code-block:: bash
-   pip install pytorch-transformers
+   pip install transformers
 From source
 ^^^^^^^^^^^
@@ -19,15 +19,15 @@ To install from source, clone the repository and install with:
 .. code-block:: bash
-    git clone https://github.com/huggingface/pytorch-transformers.git
+    git clone https://github.com/huggingface/transformers.git
-    cd pytorch-transformers
+    cd transformers
    pip install [--editable] .
 Tests
 ^^^^^
-An extensive test suite is included to test the library behavior and several examples. Library tests can be found in the `tests folder <https://github.com/huggingface/pytorch-transformers/tree/master/pytorch_transformers/tests>`_ and examples tests in the `examples folder <https://github.com/huggingface/pytorch-transformers/tree/master/examples>`_.
+An extensive test suite is included to test the library behavior and several examples. Library tests can be found in the `tests folder <https://github.com/huggingface/transformers/tree/master/transformers/tests>`_ and examples tests in the `examples folder <https://github.com/huggingface/transformers/tree/master/examples>`_.
 Tests can be run using `pytest` (install pytest if needed with `pip install pytest`).
@@ -35,7 +35,7 @@ Run all the tests from the root of the cloned repository with the commands:
 .. code-block:: bash
-    python -m pytest -sv ./pytorch_transformers/tests/
+    python -m pytest -sv ./transformers/tests/
    python -m pytest -sv ./examples/
--- a/docs/source/main_classes/configuration.rst
+++ b/docs/source/main_classes/configuration.rst
@@ -6,5 +6,5 @@ The base class ``PretrainedConfig`` implements the common methods for loading/sa
 ``PretrainedConfig``
 ~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.PretrainedConfig
+.. autoclass:: transformers.PretrainedConfig
    :members:
--- a/docs/source/main_classes/model.rst
+++ b/docs/source/main_classes/model.rst
@@ -11,5 +11,11 @@ The base class ``PreTrainedModel`` implements the common methods for loading/sav
 ``PreTrainedModel``
 ~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.PreTrainedModel
+.. autoclass:: transformers.PreTrainedModel
    :members:
 ``TFPreTrainedModel``
 ~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFPreTrainedModel
    :members:
--- a/docs/source/main_classes/optimizer_schedules.rst
+++ b/docs/source/main_classes/optimizer_schedules.rst
@@ -9,7 +9,7 @@ The ``.optimization`` module provides:
 ``AdamW``
 ~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.AdamW
+.. autoclass:: transformers.AdamW
    :members:
 Schedules
@@ -18,11 +18,11 @@ Schedules
 Learning Rate Schedules
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-.. autoclass:: pytorch_transformers.ConstantLRSchedule
+.. autoclass:: transformers.ConstantLRSchedule
    :members:
-.. autoclass:: pytorch_transformers.WarmupConstantSchedule
+.. autoclass:: transformers.WarmupConstantSchedule
    :members:
 .. image:: /imgs/warmup_constant_schedule.png
@@ -30,7 +30,7 @@ Learning Rate Schedules
    :alt:
-.. autoclass:: pytorch_transformers.WarmupCosineSchedule
+.. autoclass:: transformers.WarmupCosineSchedule
    :members:
 .. image:: /imgs/warmup_cosine_schedule.png
@@ -38,7 +38,7 @@ Learning Rate Schedules
    :alt:
-.. autoclass:: pytorch_transformers.WarmupCosineWithHardRestartsSchedule
+.. autoclass:: transformers.WarmupCosineWithHardRestartsSchedule
    :members:
 .. image:: /imgs/warmup_cosine_hard_restarts_schedule.png
@@ -47,7 +47,7 @@ Learning Rate Schedules
-.. autoclass:: pytorch_transformers.WarmupLinearSchedule
+.. autoclass:: transformers.WarmupLinearSchedule
    :members:
 .. image:: /imgs/warmup_linear_schedule.png
--- a/docs/source/main_classes/processors.rst
+++ b/docs/source/main_classes/processors.rst
@@ -0,0 +1,58 @@
 Processors
 ----------------------------------------------------
 This library includes processors for several traditional tasks. These processors can be used to process a dataset into
 examples that can be fed to a model.
 Processors
 ~~~~~~~~~~~~~~~~~~~~~
 All processors follow the same architecture which is that of the
 :class:`~transformers.data.processors.utils.DataProcessor`. The processor returns a list
 of :class:`~transformers.data.processors.utils.InputExample`. These
 :class:`~transformers.data.processors.utils.InputExample` can be converted to
 :class:`~transformers.data.processors.utils.InputFeatures` in order to be fed to the model.
 .. autoclass:: transformers.data.processors.utils.DataProcessor
    :members:
 .. autoclass:: transformers.data.processors.utils.InputExample
    :members:
 .. autoclass:: transformers.data.processors.utils.InputFeatures
    :members:
 GLUE
 ~~~~~~~~~~~~~~~~~~~~~
 `General Language Understanding Evaluation (GLUE) <https://gluebenchmark.com/>`__ is a benchmark that evaluates
 the performance of models across a diverse set of existing NLU tasks. It was released together with the paper
 `GLUE: A multi-task benchmark and analysis platform for natural language understanding <https://openreview.net/pdf?id=rJ4km2R5t7>`__
 This library hosts a total of 10 processors for the following tasks: MRPC, MNLI, MNLI (mismatched),
 CoLA, SST2, STSB, QQP, QNLI, RTE and WNLI.
 Those processors are:
    - :class:`~transformers.data.processors.utils.MrpcProcessor`
    - :class:`~transformers.data.processors.utils.MnliProcessor`
    - :class:`~transformers.data.processors.utils.MnliMismatchedProcessor`
    - :class:`~transformers.data.processors.utils.Sst2Processor`
    - :class:`~transformers.data.processors.utils.StsbProcessor`
    - :class:`~transformers.data.processors.utils.QqpProcessor`
    - :class:`~transformers.data.processors.utils.QnliProcessor`
    - :class:`~transformers.data.processors.utils.RteProcessor`
    - :class:`~transformers.data.processors.utils.WnliProcessor`
 Additionally, the following method  can be used to load values from a data file and convert them to a list of
 :class:`~transformers.data.processors.utils.InputExample`.
 .. automethod:: transformers.data.processors.glue.glue_convert_examples_to_features
 Example usage
 ^^^^^^^^^^^^^^^^^^^^^^^^^
 An example using these processors is given in the
 `run_glue.py <https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_glue.py>`__ script.
--- a/docs/source/main_classes/tokenizer.rst
+++ b/docs/source/main_classes/tokenizer.rst
@@ -12,5 +12,5 @@ The base class ``PreTrainedTokenizer`` implements the common methods for loading
 ``PreTrainedTokenizer``
 ~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.PreTrainedTokenizer
+.. autoclass:: transformers.PreTrainedTokenizer
    :members:
--- a/docs/source/migration.md
+++ b/docs/source/migration.md
@@ -1,17 +1,17 @@
 # Migrating from pytorch-pretrained-bert
-Here is a quick summary of what you should take care of when migrating from `pytorch-pretrained-bert` to `pytorch-transformers`
+Here is a quick summary of what you should take care of when migrating from `pytorch-pretrained-bert` to `transformers`
 ### Models always output `tuples`
-The main breaking change when migrating from `pytorch-pretrained-bert` to `pytorch-transformers` is that the models forward method always outputs a `tuple` with various elements depending on the model and the configuration parameters.
+The main breaking change when migrating from `pytorch-pretrained-bert` to `transformers` is that the models forward method always outputs a `tuple` with various elements depending on the model and the configuration parameters.
-The exact content of the tuples for each model are detailled in the models' docstrings and the [documentation](https://huggingface.co/pytorch-transformers/).
+The exact content of the tuples for each model are detailled in the models' docstrings and the [documentation](https://huggingface.co/transformers/).
 In pretty much every case, you will be fine by taking the first element of the output as the output you previously used in `pytorch-pretrained-bert`.
-Here is a `pytorch-pretrained-bert` to `pytorch-transformers` conversion example for a `BertForSequenceClassification` classification model:
+Here is a `pytorch-pretrained-bert` to `transformers` conversion example for a `BertForSequenceClassification` classification model:
 ```python
 # Let's load our model
@@ -20,11 +20,11 @@ model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
 # If you used to have this line in pytorch-pretrained-bert:
 loss = model(input_ids, labels=labels)
-# Now just use this line in pytorch-transformers to extract the loss from the output tuple:
+# Now just use this line in transformers to extract the loss from the output tuple:
 outputs = model(input_ids, labels=labels)
 loss = outputs[0]
-# In pytorch-transformers you can also have access to the logits:
+# In transformers you can also have access to the logits:
 loss, logits = outputs[:2]
 # And even the attention weigths if you configure the model to output them (and other outputs too, see the docstrings and documentation)
@@ -96,7 +96,7 @@ for batch in train_data:
    loss.backward()
    optimizer.step()
-### In PyTorch-Transformers, optimizer and schedules are splitted and instantiated like this:
+### In Transformers, optimizer and schedules are splitted and instantiated like this:
 optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False)  # To reproduce BertAdam specific behavior set correct_bias=False
 scheduler = WarmupLinearSchedule(optimizer, warmup_steps=num_warmup_steps, t_total=num_total_steps)  # PyTorch scheduler
 ### and used like this:
--- a/docs/source/model_doc/auto.rst
+++ b/docs/source/model_doc/auto.rst
@@ -11,19 +11,19 @@ Instantiating one of ``AutoModel``, ``AutoConfig`` and ``AutoTokenizer`` will di
 ``AutoConfig``
 ~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.AutoConfig
+.. autoclass:: transformers.AutoConfig
    :members:
 ``AutoModel``
 ~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.AutoModel
+.. autoclass:: transformers.AutoModel
    :members:
 ``AutoTokenizer``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.AutoTokenizer
+.. autoclass:: transformers.AutoTokenizer
    :members:
--- a/docs/source/model_doc/bert.rst
+++ b/docs/source/model_doc/bert.rst
@@ -4,69 +4,125 @@ BERT
 ``BertConfig``
 ~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.BertConfig
+.. autoclass:: transformers.BertConfig
    :members:
 ``BertTokenizer``
 ~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.BertTokenizer
+.. autoclass:: transformers.BertTokenizer
    :members:
 ``BertModel``
 ~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.BertModel
+.. autoclass:: transformers.BertModel
    :members:
 ``BertForPreTraining``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.BertForPreTraining
+.. autoclass:: transformers.BertForPreTraining
    :members:
 ``BertForMaskedLM``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.BertForMaskedLM
+.. autoclass:: transformers.BertForMaskedLM
    :members:
 ``BertForNextSentencePrediction``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.BertForNextSentencePrediction
+.. autoclass:: transformers.BertForNextSentencePrediction
    :members:
 ``BertForSequenceClassification``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.BertForSequenceClassification
+.. autoclass:: transformers.BertForSequenceClassification
    :members:
 ``BertForMultipleChoice``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.BertForMultipleChoice
+.. autoclass:: transformers.BertForMultipleChoice
    :members:
 ``BertForTokenClassification``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.BertForTokenClassification
+.. autoclass:: transformers.BertForTokenClassification
    :members:
 ``BertForQuestionAnswering``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.BertForQuestionAnswering
+.. autoclass:: transformers.BertForQuestionAnswering
    :members:
 ``TFBertModel``
 ~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFBertModel
    :members:
 ``TFBertForPreTraining``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFBertForPreTraining
    :members:
 ``TFBertForMaskedLM``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFBertForMaskedLM
    :members:
 ``TFBertForNextSentencePrediction``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFBertForNextSentencePrediction
    :members:
 ``TFBertForSequenceClassification``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFBertForSequenceClassification
    :members:
 ``TFBertForMultipleChoice``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFBertForMultipleChoice
    :members:
 ``TFBertForTokenClassification``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFBertForTokenClassification
    :members:
 ``TFBertForQuestionAnswering``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFBertForQuestionAnswering
    :members:
--- a/docs/source/model_doc/distilbert.rst
+++ b/docs/source/model_doc/distilbert.rst
@@ -4,40 +4,67 @@ DistilBERT
 ``DistilBertConfig``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.DistilBertConfig
+.. autoclass:: transformers.DistilBertConfig
    :members:
 ``DistilBertTokenizer``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.DistilBertTokenizer
+.. autoclass:: transformers.DistilBertTokenizer
    :members:
 ``DistilBertModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.DistilBertModel
+.. autoclass:: transformers.DistilBertModel
    :members:
 ``DistilBertForMaskedLM``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.DistilBertForMaskedLM
+.. autoclass:: transformers.DistilBertForMaskedLM
    :members:
 ``DistilBertForSequenceClassification``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.DistilBertForSequenceClassification
+.. autoclass:: transformers.DistilBertForSequenceClassification
    :members:
 ``DistilBertForQuestionAnswering``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.DistilBertForQuestionAnswering
+.. autoclass:: transformers.DistilBertForQuestionAnswering
    :members:
 ``TFDistilBertModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFDistilBertModel
    :members:
 ``TFDistilBertForMaskedLM``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFDistilBertForMaskedLM
    :members:
 ``TFDistilBertForSequenceClassification``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFDistilBertForSequenceClassification
    :members:
 ``TFDistilBertForQuestionAnswering``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFDistilBertForQuestionAnswering
    :members:
--- a/docs/source/model_doc/gpt.rst
+++ b/docs/source/model_doc/gpt.rst
@@ -4,33 +4,54 @@ OpenAI GPT
 ``OpenAIGPTConfig``
 ~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.OpenAIGPTConfig
+.. autoclass:: transformers.OpenAIGPTConfig
    :members:
 ``OpenAIGPTTokenizer``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.OpenAIGPTTokenizer
+.. autoclass:: transformers.OpenAIGPTTokenizer
    :members:
 ``OpenAIGPTModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.OpenAIGPTModel
+.. autoclass:: transformers.OpenAIGPTModel
    :members:
 ``OpenAIGPTLMHeadModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.OpenAIGPTLMHeadModel
+.. autoclass:: transformers.OpenAIGPTLMHeadModel
    :members:
 ``OpenAIGPTDoubleHeadsModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.OpenAIGPTDoubleHeadsModel
+.. autoclass:: transformers.OpenAIGPTDoubleHeadsModel
    :members:
 ``TFOpenAIGPTModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFOpenAIGPTModel
    :members:
 ``TFOpenAIGPTLMHeadModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFOpenAIGPTLMHeadModel
    :members:
 ``TFOpenAIGPTDoubleHeadsModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFOpenAIGPTDoubleHeadsModel
    :members:
--- a/docs/source/model_doc/gpt2.rst
+++ b/docs/source/model_doc/gpt2.rst
@@ -4,33 +4,54 @@ OpenAI GPT2
 ``GPT2Config``
 ~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.GPT2Config
+.. autoclass:: transformers.GPT2Config
    :members:
 ``GPT2Tokenizer``
 ~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.GPT2Tokenizer
+.. autoclass:: transformers.GPT2Tokenizer
    :members:
 ``GPT2Model``
 ~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.GPT2Model
+.. autoclass:: transformers.GPT2Model
    :members:
 ``GPT2LMHeadModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.GPT2LMHeadModel
+.. autoclass:: transformers.GPT2LMHeadModel
    :members:
 ``GPT2DoubleHeadsModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.GPT2DoubleHeadsModel
+.. autoclass:: transformers.GPT2DoubleHeadsModel
    :members:
 ``TFGPT2Model``
 ~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFGPT2Model
    :members:
 ``TFGPT2LMHeadModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFGPT2LMHeadModel
    :members:
 ``TFGPT2DoubleHeadsModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFGPT2DoubleHeadsModel
    :members:
--- a/docs/source/model_doc/roberta.rst
+++ b/docs/source/model_doc/roberta.rst
@@ -4,33 +4,54 @@ RoBERTa
 ``RobertaConfig``
 ~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.RobertaConfig
+.. autoclass:: transformers.RobertaConfig
    :members:
 ``RobertaTokenizer``
 ~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.RobertaTokenizer
+.. autoclass:: transformers.RobertaTokenizer
    :members:
 ``RobertaModel``
 ~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.RobertaModel
+.. autoclass:: transformers.RobertaModel
    :members:
 ``RobertaForMaskedLM``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.RobertaForMaskedLM
+.. autoclass:: transformers.RobertaForMaskedLM
    :members:
 ``RobertaForSequenceClassification``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.RobertaForSequenceClassification
+.. autoclass:: transformers.RobertaForSequenceClassification
    :members:
 ``TFRobertaModel``
 ~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFRobertaModel
    :members:
 ``TFRobertaForMaskedLM``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFRobertaForMaskedLM
    :members:
 ``TFRobertaForSequenceClassification``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFRobertaForSequenceClassification
    :members:
--- a/docs/source/model_doc/transformerxl.rst
+++ b/docs/source/model_doc/transformerxl.rst
@@ -5,26 +5,40 @@ Transformer XL
 ``TransfoXLConfig``
 ~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.TransfoXLConfig
+.. autoclass:: transformers.TransfoXLConfig
    :members:
 ``TransfoXLTokenizer``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.TransfoXLTokenizer
+.. autoclass:: transformers.TransfoXLTokenizer
    :members:
 ``TransfoXLModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.TransfoXLModel
+.. autoclass:: transformers.TransfoXLModel
    :members:
 ``TransfoXLLMHeadModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.TransfoXLLMHeadModel
+.. autoclass:: transformers.TransfoXLLMHeadModel
    :members:
 ``TFTransfoXLModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFTransfoXLModel
    :members:
 ``TFTransfoXLLMHeadModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFTransfoXLLMHeadModel
    :members:
--- a/docs/source/model_doc/xlm.rst
+++ b/docs/source/model_doc/xlm.rst
@@ -4,38 +4,66 @@ XLM
 ``XLMConfig``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.XLMConfig
+.. autoclass:: transformers.XLMConfig
    :members:
 ``XLMTokenizer``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.XLMTokenizer
+.. autoclass:: transformers.XLMTokenizer
    :members:
 ``XLMModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.XLMModel
+.. autoclass:: transformers.XLMModel
    :members:
 ``XLMWithLMHeadModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.XLMWithLMHeadModel
+.. autoclass:: transformers.XLMWithLMHeadModel
    :members:
 ``XLMForSequenceClassification``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.XLMForSequenceClassification
+.. autoclass:: transformers.XLMForSequenceClassification
    :members:
 ``XLMForQuestionAnswering``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.XLMForQuestionAnswering
+.. autoclass:: transformers.XLMForQuestionAnswering
    :members:
 ``TFXLMModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFXLMModel
    :members:
 ``TFXLMWithLMHeadModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFXLMWithLMHeadModel
    :members:
 ``TFXLMForSequenceClassification``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFXLMForSequenceClassification
    :members:
 ``TFXLMForQuestionAnsweringSimple``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFXLMForQuestionAnsweringSimple
    :members:
--- a/docs/source/model_doc/xlnet.rst
+++ b/docs/source/model_doc/xlnet.rst
@@ -4,40 +4,68 @@ XLNet
 ``XLNetConfig``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.XLNetConfig
+.. autoclass:: transformers.XLNetConfig
    :members:
 ``XLNetTokenizer``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.XLNetTokenizer
+.. autoclass:: transformers.XLNetTokenizer
    :members:
 ``XLNetModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.XLNetModel
+.. autoclass:: transformers.XLNetModel
    :members:
 ``XLNetLMHeadModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.XLNetLMHeadModel
+.. autoclass:: transformers.XLNetLMHeadModel
    :members:
 ``XLNetForSequenceClassification``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.XLNetForSequenceClassification
+.. autoclass:: transformers.XLNetForSequenceClassification
    :members:
 ``XLNetForQuestionAnswering``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.XLNetForQuestionAnswering
+.. autoclass:: transformers.XLNetForQuestionAnswering
    :members:
 ``TFXLNetModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFXLNetModel
    :members:
 ``TFXLNetLMHeadModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFXLNetLMHeadModel
    :members:
 ``TFXLNetForSequenceClassification``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFXLNetForSequenceClassification
    :members:
 ``TFXLNetForQuestionAnsweringSimple``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFXLNetForQuestionAnsweringSimple
    :members:
--- a/docs/source/notebooks.rst
+++ b/docs/source/notebooks.rst
@@ -1,16 +1,16 @@
 Notebooks
 ================================================
-We include `three Jupyter Notebooks <https://github.com/huggingface/pytorch-transformers/tree/master/notebooks>`_ that can be used to check that the predictions of the PyTorch model are identical to the predictions of the original TensorFlow model.
+We include `three Jupyter Notebooks <https://github.com/huggingface/transformers/tree/master/notebooks>`_ that can be used to check that the predictions of the PyTorch model are identical to the predictions of the original TensorFlow model.
 *
-  The first NoteBook (\ `Comparing-TF-and-PT-models.ipynb <https://github.com/huggingface/pytorch-transformers/blob/master/notebooks/Comparing-TF-and-PT-models.ipynb>`_\ ) extracts the hidden states of a full sequence on each layers of the TensorFlow and the PyTorch models and computes the standard deviation between them. In the given example, we get a standard deviation of 1.5e-7 to 9e-7 on the various hidden state of the models.
+  The first NoteBook (\ `Comparing-TF-and-PT-models.ipynb <https://github.com/huggingface/transformers/blob/master/notebooks/Comparing-TF-and-PT-models.ipynb>`_\ ) extracts the hidden states of a full sequence on each layers of the TensorFlow and the PyTorch models and computes the standard deviation between them. In the given example, we get a standard deviation of 1.5e-7 to 9e-7 on the various hidden state of the models.
 *
-  The second NoteBook (\ `Comparing-TF-and-PT-models-SQuAD.ipynb <https://github.com/huggingface/pytorch-transformers/blob/master/notebooks/Comparing-TF-and-PT-models-SQuAD.ipynb>`_\ ) compares the loss computed by the TensorFlow and the PyTorch models for identical initialization of the fine-tuning layer of the ``BertForQuestionAnswering`` and computes the standard deviation between them. In the given example, we get a standard deviation of 2.5e-7 between the models.
+  The second NoteBook (\ `Comparing-TF-and-PT-models-SQuAD.ipynb <https://github.com/huggingface/transformers/blob/master/notebooks/Comparing-TF-and-PT-models-SQuAD.ipynb>`_\ ) compares the loss computed by the TensorFlow and the PyTorch models for identical initialization of the fine-tuning layer of the ``BertForQuestionAnswering`` and computes the standard deviation between them. In the given example, we get a standard deviation of 2.5e-7 between the models.
 *
-  The third NoteBook (\ `Comparing-TF-and-PT-models-MLM-NSP.ipynb <https://github.com/huggingface/pytorch-transformers/blob/master/notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb>`_\ ) compares the predictions computed by the TensorFlow and the PyTorch models for masked token language modeling using the pre-trained masked language modeling model.
+  The third NoteBook (\ `Comparing-TF-and-PT-models-MLM-NSP.ipynb <https://github.com/huggingface/transformers/blob/master/notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb>`_\ ) compares the predictions computed by the TensorFlow and the PyTorch models for masked token language modeling using the pre-trained masked language modeling model.
 Please follow the instructions given in the notebooks to run and modify them.
--- a/docs/source/pretrained_models.rst
+++ b/docs/source/pretrained_models.rst
@@ -44,15 +44,15 @@ Here is the full list of the currently provided pretrained models together with
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``bert-large-uncased-whole-word-masking-finetuned-squad``  | | 24-layer, 1024-hidden, 16-heads, 340M parameters.                                                                                   |
 |                   |                                                            | | The ``bert-large-uncased-whole-word-masking`` model fine-tuned on SQuAD                                                             |
-|                   |                                                            | (see details of fine-tuning in the `example section <https://github.com/huggingface/pytorch-transformers/tree/master/examples>`__).   |
+|                   |                                                            | (see details of fine-tuning in the `example section <https://github.com/huggingface/transformers/tree/master/examples>`__).           |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``bert-large-cased-whole-word-masking-finetuned-squad``    | | 24-layer, 1024-hidden, 16-heads, 340M parameters                                                                                    |
 |                   |                                                            | | The ``bert-large-cased-whole-word-masking`` model fine-tuned on SQuAD                                                               |
-|                   |                                                            | (see `details of fine-tuning in the example section <https://huggingface.co/pytorch-transformers/examples.html>`__)                   |
+|                   |                                                            | (see `details of fine-tuning in the example section <https://huggingface.co/transformers/examples.html>`__)                           |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``bert-base-cased-finetuned-mrpc``                         | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
 |                   |                                                            | | The ``bert-base-cased`` model fine-tuned on MRPC                                                                                    |
-|                   |                                                            | (see `details of fine-tuning in the example section <https://huggingface.co/pytorch-transformers/examples.html>`__)                   |
+|                   |                                                            | (see `details of fine-tuning in the example section <https://huggingface.co/transformers/examples.html>`__)                           |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 | GPT               | ``openai-gpt``                                             | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
 |                   |                                                            | | OpenAI GPT English model                                                                                                            |
@@ -98,6 +98,12 @@ Here is the full list of the currently provided pretrained models together with
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``xlm-clm-ende-1024``                                      | | 6-layer, 1024-hidden, 8-heads                                                                                                       |
 |                   |                                                            | | XLM English-German model trained with CLM (Causal Language Modeling) on the concatenation of English and German wikipedia           |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``xlm-mlm-17-1280``                                        | | 16-layer, 1280-hidden, 16-heads                                                                                                     |
 |                   |                                                            | | XLM model trained with MLM (Masked Language Modeling) on 17 languages.                                                              |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``xlm-mlm-100-1280``                                       | | 16-layer, 1280-hidden, 16-heads                                                                                                     |
 |                   |                                                            | | XLM model trained with MLM (Masked Language Modeling) on 100 languages.                                                             |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 | RoBERTa           | ``roberta-base``                                           | | 12-layer, 768-hidden, 12-heads, 125M parameters                                                                                     |
 |                   |                                                            | | RoBERTa using the BERT-base architecture                                                                                            |
@@ -113,11 +119,14 @@ Here is the full list of the currently provided pretrained models together with
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 | DistilBERT        | ``distilbert-base-uncased``                                | | 6-layer, 768-hidden, 12-heads, 66M parameters                                                                                       |
 |                   |                                                            | | The DistilBERT model distilled from the BERT model `bert-base-uncased` checkpoint                                                   |
-|                   |                                                            | (see `details <https://medium.com/huggingface/distilbert-8cf3380435b5>`__)                                                            |
+|                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``distilbert-base-uncased-distilled-squad``                | | 6-layer, 768-hidden, 12-heads, 66M parameters                                                                                       |
 |                   |                                                            | | The DistilBERT model distilled from the BERT model `bert-base-uncased` checkpoint, with an additional linear layer.                 |
-|                   |                                                            | (see `details <https://medium.com/huggingface/distilbert-8cf3380435b5>`__)                                                            |
+|                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``distilgpt2``                                             | | 6-layer, 768-hidden, 12-heads, 82M parameters                                                                                       |
 |                   |                                                            | | The DistilGPT2 model distilled from the GPT2 model `gpt2` checkpoint.                                                               |
 |                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-
+.. <https://huggingface.co/transformers/examples.html>`__
 .. <https://huggingface.co/pytorch-transformers/examples.html>`__
--- a/docs/source/quickstart.md
+++ b/docs/source/quickstart.md
@@ -2,7 +2,7 @@
 ## Philosophy
-PyTorch-Transformers is an opinionated library built for NLP researchers seeking to use/study/extend large-scale transformers models.
+Transformers is an opinionated library built for NLP researchers seeking to use/study/extend large-scale transformers models.
 The library was designed with two strong goals in mind:
@@ -19,12 +19,12 @@ The library was designed with two strong goals in mind:
 A few other goals:
- expose the models internals as consistently as possible:
+- expose the models' internals as consistently as possible:
  - we give access, using a single API to the full hidden-states and attention weights,
  - tokenizer and base model's API are standardized to easily switch between models.
- incorporate a subjective selection of promising tools for fine-tuning/investiguating these models:
+- incorporate a subjective selection of promising tools for fine-tuning/investigating these models:
  - a simple/consistent way to add new tokens to the vocabulary and embeddings for fine-tuning,
  - simple ways to mask and prune transformer heads.
@@ -33,13 +33,13 @@ A few other goals:
 The library is build around three type of classes for each models:
- **model classes** which are PyTorch models (`torch.nn.Modules`) of the 6 models architectures currently provided in the library, e.g. `BertModel`
+- **model classes** which are PyTorch models (`torch.nn.Modules`) of the 8 models architectures currently provided in the library, e.g. `BertModel`
 - **configuration classes** which store all the parameters required to build a model, e.g. `BertConfig`. You don't always need to instantiate these your-self, in particular if you are using a pretrained model without any modification, creating the model will automatically take care of instantiating the configuration (which is part of the model)
 - **tokenizer classes** which store the vocabulary for each model and provide methods for encoding/decoding strings in list of token embeddings indices to be fed to a model, e.g. `BertTokenizer`
 All these classes can be instantiated from pretrained instances and saved locally using two methods:
- `from_pretrained()` let you instantiate a model/configuration/tokenizer from a pretrained version either provided by the library itself (currently 27 models are provided as listed [here](https://huggingface.co/pytorch-transformers/pretrained_models.html)) or stored locally (or on a server) by the user,
+- `from_pretrained()` let you instantiate a model/configuration/tokenizer from a pretrained version either provided by the library itself (currently 27 models are provided as listed [here](https://huggingface.co/transformers/pretrained_models.html)) or stored locally (or on a server) by the user,
 - `save_pretrained()` let you save a model/configuration/tokenizer locally so that it can be reloaded using `from_pretrained()`.
 We'll finish this quickstart tour by going through a few simple quick-start examples to see how we can instantiate and use these classes. The rest of the documentation is organized in two parts:
@@ -51,7 +51,7 @@ We'll finish this quickstart tour by going through a few simple quick-start exam
 Here are two examples showcasing a few `Bert` and `GPT2` classes and pre-trained models.
-See full API reference for examples for each model classe.
+See full API reference for examples for each model class.
 ### BERT example
@@ -59,7 +59,7 @@ Let's start by preparing a tokenized input (a list of token embeddings indices t
 ```python
 import torch
-from pytorch_transformers import BertTokenizer, BertModel, BertForMaskedLM
+from transformers import BertTokenizer, BertModel, BertForMaskedLM
 # OPTIONAL: if you want to have more information on what's happening under the hood, activate the logger as follows
 import logging
@@ -93,8 +93,8 @@ Let's see how we can use `BertModel` to encode our inputs in hidden-states:
 # Load pre-trained model (weights)
 model = BertModel.from_pretrained('bert-base-uncased')
-# Set the model in evaluation mode to desactivate the DropOut modules
+# Set the model in evaluation mode to deactivate the DropOut modules
-# This is IMPORTANT to have reproductible results during evaluation!
+# This is IMPORTANT to have reproducible results during evaluation!
 model.eval()
 # If you have a GPU, put everything on cuda
@@ -106,7 +106,7 @@ model.to('cuda')
 with torch.no_grad():
    # See the models docstrings for the detail of the inputs
    outputs = model(tokens_tensor, token_type_ids=segments_tensors)
-    # PyTorch-Transformers models always output tuples.
+    # Transformers models always output tuples.
    # See the models docstrings for the detail of all the outputs
    # In our case, the first element is the hidden state of the last layer of the Bert model
    encoded_layers = outputs[0]
@@ -145,7 +145,7 @@ First let's prepare a tokenized input from our text string using `GPT2Tokenizer`
 ```python
 import torch
-from pytorch_transformers import GPT2Tokenizer, GPT2LMHeadModel
+from transformers import GPT2Tokenizer, GPT2LMHeadModel
 # OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
 import logging
@@ -168,8 +168,8 @@ Let's see how to use `GPT2LMHeadModel` to generate the next token following our
 # Load pre-trained model (weights)
 model = GPT2LMHeadModel.from_pretrained('gpt2')
-# Set the model in evaluation mode to desactivate the DropOut modules
+# Set the model in evaluation mode to deactivate the DropOut modules
-# This is IMPORTANT to have reproductible results during evaluation!
+# This is IMPORTANT to have reproducible results during evaluation!
 model.eval()
 # If you have a GPU, put everything on cuda
--- a/docs/source/serialization.rst
+++ b/docs/source/serialization.rst
@@ -45,7 +45,7 @@ where
    * ``bert_config.json`` or ``openai_gpt_config.json`` a configuration file for the model, and
    * ``pytorch_model.bin`` a PyTorch dump of a pre-trained instance of ``BertForPreTraining``\ , ``OpenAIGPTModel``\ , ``TransfoXLModel``\ , ``GPT2LMHeadModel`` (saved with the usual ``torch.save()``\ )
-  If ``PRE_TRAINED_MODEL_NAME_OR_PATH`` is a shortcut name, the pre-trained weights will be downloaded from AWS S3 (see the links `here <https://github.com/huggingface/pytorch-transformers/blob/master/pytorch_transformers/modeling_bert.py>`__\ ) and stored in a cache folder to avoid future download (the cache folder can be found at ``~/.pytorch_pretrained_bert/``\ ).
+  If ``PRE_TRAINED_MODEL_NAME_OR_PATH`` is a shortcut name, the pre-trained weights will be downloaded from AWS S3 (see the links `here <https://github.com/huggingface/transformers/blob/master/transformers/modeling_bert.py>`__\ ) and stored in a cache folder to avoid future download (the cache folder can be found at ``~/.pytorch_pretrained_bert/``\ ).
 *
  ``cache_dir`` can be an optional path to a specific directory to download and cache the pre-trained model weights. This option is useful in particular when you are using distributed training: to avoid concurrent access to the same weights you can set for example ``cache_dir='./pretrained_model_{}'.format(args.local_rank)`` (see the section on distributed training for more information).
@@ -122,7 +122,7 @@ Here is the recommended way of saving the model, configuration and vocabulary to
 .. code-block:: python
-   from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
+   from transformers import WEIGHTS_NAME, CONFIG_NAME
   output_dir = "./models/"
--- a/docs/source/torchscript.rst
+++ b/docs/source/torchscript.rst
@@ -12,7 +12,7 @@ According to Pytorch's documentation: "TorchScript is a way to create serializab
 Pytorch's two modules `JIT and TRACE <https://pytorch.org/docs/stable/jit.html>`_ allow the developer to export
 their model to be re-used in other programs, such as efficiency-oriented C++ programs.
-We have provided an interface that allows the export of `pytorch-transformers` models to TorchScript so that they can
+We have provided an interface that allows the export of `transformers` models to TorchScript so that they can
 be reused in a different environment than a Pytorch-based python program. Here we explain how to use our models so that
 they can be exported, and what to be mindful of when using these models with TorchScript.
@@ -74,7 +74,7 @@ according to a ``BertConfig`` class and then saved to disk under the filename ``
 .. code-block:: python
-    from pytorch_transformers import BertModel, BertTokenizer, BertConfig
+    from transformers import BertModel, BertTokenizer, BertConfig
    import torch
    enc = BertTokenizer.from_pretrained("bert-base-uncased")
--- a/examples/README.md
+++ b/examples/README.md
@@ -13,7 +13,7 @@ similar API between the different models.
 ## Language model fine-tuning
-Based on the script [`run_lm_finetuning.py`](https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_lm_finetuning.py).
+Based on the script [`run_lm_finetuning.py`](https://github.com/huggingface/transformers/blob/master/examples/run_lm_finetuning.py).
 Fine-tuning the library models for language modeling on a text dataset for GPT, GPT-2, BERT and RoBERTa (DistilBERT 
 to be added soon). GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa 
@@ -75,7 +75,7 @@ python run_lm_finetuning.py \
 ## Language generation
-Based on the script [`run_generation.py`](https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_generation.py).
+Based on the script [`run_generation.py`](https://github.com/huggingface/transformers/blob/master/examples/run_generation.py).
 Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, Transformer-XL and XLNet.
 A similar script is used for our official demo [Write With Transfomer](https://transformer.huggingface.co), where you
@@ -91,7 +91,7 @@ python run_generation.py \
 ## GLUE
-Based on the script [`run_glue.py`](https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_glue.py).
+Based on the script [`run_glue.py`](https://github.com/huggingface/transformers/blob/master/examples/run_glue.py).
 Fine-tuning the library models for sequence classification on the GLUE benchmark: [General Language Understanding 
 Evaluation](https://gluebenchmark.com/). This script can fine-tune the following models: BERT, XLM, XLNet and RoBERTa. 
@@ -319,7 +319,7 @@ eval_loss = 0.44457291918821606
 ## SQuAD
-Based on the script [`run_squad.py`](https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_squad.py).
+Based on the script [`run_squad.py`](https://github.com/huggingface/transformers/blob/master/examples/run_squad.py).
 #### Fine-tuning on SQuAD
--- a/examples/contrib/run_openai_gpt.py
+++ b/examples/contrib/run_openai_gpt.py
@@ -39,7 +39,7 @@ import torch
 from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
-from pytorch_transformers import (OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer,
+from transformers import (OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer,
                                     AdamW, cached_path, WEIGHTS_NAME, CONFIG_NAME,
                                     WarmupLinearSchedule)
--- a/examples/contrib/run_swag.py
+++ b/examples/contrib/run_swag.py
@@ -35,10 +35,10 @@ from tqdm import tqdm, trange
 from tensorboardX import SummaryWriter
-from pytorch_transformers import (WEIGHTS_NAME, BertConfig,
+from transformers import (WEIGHTS_NAME, BertConfig,
                                  BertForMultipleChoice, BertTokenizer)
-from pytorch_transformers import AdamW, WarmupLinearSchedule
+from transformers import AdamW, WarmupLinearSchedule
 logger = logging.getLogger(__name__)
@@ -365,7 +365,7 @@ def train(args, train_dataset, model, tokenizer):
            #     inputs.update({'cls_index': batch[5],
            #                    'p_mask':       batch[6]})
            outputs = model(**inputs)
-            loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)
+            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
            if args.n_gpu > 1:
                loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training
@@ -647,7 +647,7 @@ def main():
        if args.eval_all_checkpoints:
            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
-            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce model loading logs
+            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce model loading logs
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
--- a/examples/contrib/run_transfo_xl.py
+++ b/examples/contrib/run_transfo_xl.py
@@ -28,7 +28,7 @@ import math
 import torch
-from pytorch_transformers import TransfoXLLMHeadModel, TransfoXLCorpus, TransfoXLTokenizer
+from transformers import TransfoXLLMHeadModel, TransfoXLCorpus, TransfoXLTokenizer
 logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
--- a/examples/distillation/README.md
+++ b/examples/distillation/README.md
@@ -1,35 +1,40 @@
-# DistilBERT
+# Distil*
-This folder contains the original code used to train DistilBERT as well as examples showcasing how to use DistilBERT.
+This folder contains the original code used to train Distil* as well as examples showcasing how to use DistilBERT and DistilGPT2.
 **2019, October 3rd - Update** We release our [NeurIPS workshop paper](https://arxiv.org/abs/1910.01108) explaining our approach on **DistilBERT**. It includes updated results and further experiments. We applied the same method to GPT2 and release the weights of **DistilGPT2**. DistilGPT2 is two times faster and 33% smaller than GPT2.
 **2019, September 19th - Update:** We fixed bugs in the code and released an upadted version of the weights trained with a modification of the distillation loss. DistilBERT now reaches 97% of `BERT-base`'s performance on GLUE, and 86.9 F1 score on SQuAD v1.1 dev set (compared to 88.5 for `BERT-base`). We will publish a formal write-up of our approach in the near future!
-## What is DistilBERT
+## What is Distil*
-DistilBERT stands for Distillated-BERT. DistilBERT is a small, fast, cheap and light Transformer model based on Bert architecture. It has 40% less parameters than `bert-base-uncased`, runs 60% faster while preserving 97% of BERT's performances as measured on the GLUE language understanding benchmark. DistilBERT is trained using knowledge distillation, a technique to compress a large model called the teacher into a smaller model called the student. By distillating Bert, we obtain a smaller Transformer model that bears a lot of similarities with the original BERT model while being lighter, smaller and faster to run. DistilBERT is thus an interesting option to put large-scaled trained Transformer model into production.
+Distil* is a class of compressed models that started with DistilBERT. DistilBERT stands for Distillated-BERT. DistilBERT is a small, fast, cheap and light Transformer model based on Bert architecture. It has 40% less parameters than `bert-base-uncased`, runs 60% faster while preserving 97% of BERT's performances as measured on the GLUE language understanding benchmark. DistilBERT is trained using knowledge distillation, a technique to compress a large model called the teacher into a smaller model called the student. By distillating Bert, we obtain a smaller Transformer model that bears a lot of similarities with the original BERT model while being lighter, smaller and faster to run. DistilBERT is thus an interesting option to put large-scaled trained Transformer model into production.
-For more information on DistilBERT, please refer to our [detailed blog post](https://medium.com/huggingface/smaller-faster-cheaper-lighter-introducing-distilbert-a-distilled-version-of-bert-8cf3380435b5
+We have applied the same method to GPT2 and release the weights of the compressed model. On the [WikiText-103](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/) benchmark, GPT2 reaches a perplexity on the test of 15.8 compared to 19.3 for DistilGPT2 (after fine-tuning on the train set).
 ). *Please note that we will publish a formal write-up with updated and more complete results in the near future (September 19th).*
-Here's the updated results on the dev sets of GLUE:
+For more information on DistilBERT, please refer to our [NeurIPS workshop paper](https://arxiv.org/abs/1910.01108). The paper superseeds our [previous blogpost](https://medium.com/huggingface/distilbert-8cf3380435b5) with a different distillation loss and better performances.
-| Model      | Macro-score | CoLA | MNLI | MRPC | QNLI | QQP | RTE | SST-2 | STS-B | WNLI |
+Here are the results on the dev sets of GLUE:
 | Model      | Macro-score | CoLA | MNLI | MRPC | QNLI | QQP  | RTE  | SST-2| STS-B| WNLI |
 | :---:      |    :---:    | :---:| :---:| :---:| :---:| :---:| :---:| :---:| :---:| :---:|
 | BERT-base  |  **77.6**   | 48.9 | 84.3 | 88.6 | 89.3 | 89.5 | 71.3 | 91.7 | 91.2 | 43.7 |
-| DistilBERT |  **75.2**   | 49.1 | 81.8 | 90.2 | 87.0 | 89.2 | 62.9 | 92.7 | 90.7 | 44.4 |
+| DistilBERT |  **76.8**   | 49.1 | 81.8 | 90.2 | 90.2 | 89.2 | 62.9 | 92.7 | 90.7 | 44.4 |
 ## Setup
 This part of the library has only be tested with Python3.6+. There are few specific dependencies to install before launching a distillation, you can install them with the command `pip install -r requirements.txt`. 
-**Important note:** The training scripts have been updated to support PyTorch v1.2.0 (there are breakings changes compared to v1.1.0). It is important to note that there is a small internal bug in the current version of PyTorch available on pip that causes a memory leak in our training/distillation. It has been recently fixed and will likely be integrated into the next release. For the moment, we recommend to [compile PyTorch from source](https://github.com/pytorch/pytorch#from-source). Please refer to [issue 1179](https://github.com/huggingface/pytorch-transformers/issues/1179) for more details.
+**Important note:** The training scripts have been updated to support PyTorch v1.2.0 (there are breakings changes compared to v1.1.0). It is important to note that there is a small internal bug in the current version of PyTorch available on pip that causes a memory leak in our training/distillation. It has been recently fixed and will likely be integrated into the next release. For the moment, we recommend to [compile PyTorch from source](https://github.com/pytorch/pytorch#from-source). Please refer to [issue 1179](https://github.com/huggingface/transformers/issues/1179) for more details.
 ## How to use DistilBERT
-PyTorch-Transformers includes two pre-trained DistilBERT models, currently only provided for English (we are investigating the possibility to train and release a multilingual version of DistilBERT):
+Transformers includes two pre-trained Distil* models, currently only provided for English (we are investigating the possibility to train and release a multilingual version of DistilBERT):
 - `distilbert-base-uncased`: DistilBERT English language model pretrained on the same data used to pretrain Bert (concatenation of the Toronto Book Corpus and full English Wikipedia) using distillation with the supervision of the `bert-base-uncased` version of Bert. The model has 6 layers, 768 dimension and 12 heads, totalizing 66M parameters.
 - `distilbert-base-uncased-distilled-squad`: A finetuned version of `distilbert-base-uncased` finetuned using (a second step of) knwoledge distillation on SQuAD 1.0. This model reaches a F1 score of 86.9 on the dev set (for comparison, Bert `bert-base-uncased` version reaches a 88.5 F1 score).
 - `distilgpt2`: DistilGPT2 English language model pretrained with the supervision of `gpt2` (the smallest version of GPT2) on [OpenWebTextCorpus](https://skylion007.github.io/OpenWebTextCorpus/), a reproduction of OpenAI's WebText dataset and . The model has 6 layers, 768 dimension and 12 heads, totalizing 82M (compared to 124M parameters for GPT2). On average, DistilGPT2 is two times faster than GPT2.
 - and more to come! 🤗🤗🤗
 Using DistilBERT is very similar to using BERT. DistilBERT share the same tokenizer as BERT's `bert-base-uncased` even though we provide a link to this tokenizer under the `DistilBertTokenizer` name to have a consistent naming between the library models.
@@ -42,9 +47,11 @@ outputs = model(input_ids)
 last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 ```
-## How to train DistilBERT
+Similarly, using DistilGPT2 simply consists in calling the GPT2 classes from a different pretrained checkpoint: `model = GPT2Model.from_pretrained('distilgpt2')`.
-In the following, we will explain how you can train your own compressed model.
+## How to train Distil*
 In the following, we will explain how you can train DistilBERT.
 ### A. Preparing the data
@@ -57,7 +64,8 @@ First, we will binarize the data, i.e. tokenize the data and convert each token
 ```bash
 python scripts/binarized_data.py \
    --file_path data/dump.txt \
-    --bert_tokenizer bert-base-uncased \
+    --tokenizer_type bert \
    --tokenizer_name bert-base-uncased \
    --dump_file data/binarized_text
 ```
@@ -66,7 +74,8 @@ Our implementation of masked language modeling loss follows [XLM](https://github
 ```bash
 python scripts/token_counts.py \
    --data_file data/binarized_text.bert-base-uncased.pickle \
-    --token_counts_dump data/token_counts.bert-base-uncased.pickle
+    --token_counts_dump data/token_counts.bert-base-uncased.pickle \
    --vocab_size 30522
 ```
 ### B. Training
@@ -75,6 +84,12 @@ Training with distillation is really simple once you have pre-processed the data
 ```bash
 python train.py \
    --student_type distilbert \
    --student_config training_configs/distilbert-base-uncased.json \
    --teacher_type bert \
    --teacher_name bert-base-uncased \
    --alpha_ce 5.0 --alpha_mlm 2.0 --alpha_cos 1.0 --mlm \
    --freeze_pos_embs \
    --dump_path serialization_dir/my_first_training \
    --data_file data/binarized_text.bert-base-uncased.pickle \
    --token_counts data/token_counts.bert-base-uncased.pickle \
@@ -83,7 +98,7 @@ python train.py \
 By default, this will launch a training on a single GPU (even if more are available on the cluster). Other parameters are available in the command line, please look in `train.py` or run `python train.py --help` to list them.
-We highly encourage you to use distributed training for training DistilBert as the training corpus is quite large. Here's an example that runs a distributed training on a single node having 4 GPUs:
+We highly encourage you to use distributed training for training DistilBERT as the training corpus is quite large. Here's an example that runs a distributed training on a single node having 4 GPUs:
 ```bash
 export NODE_RANK=0
@@ -105,11 +120,17 @@ python -m torch.distributed.launch \
    train.py \
        --force \
        --n_gpu $WORLD_SIZE \
        --student_type distilbert \
        --student_config training_configs/distilbert-base-uncased.json \
        --teacher_type bert \
        --teacher_name bert-base-uncased \
        --alpha_ce 0.33 --alpha_mlm 0.33 --alpha_cos 0.33 --mlm \
        --freeze_pos_embs \
        --dump_path serialization_dir/my_first_training \
        --data_file data/binarized_text.bert-base-uncased.pickle \
-        --token_counts data/token_counts.bert-base-uncased.pickle \
+        --token_counts data/token_counts.bert-base-uncased.pickle
        --dump_path serialization_dir/my_first_distillation
 ```
-**Tips:** Starting distillated training with good initialization of the model weights is crucial to reach decent performance. In our experiments, we initialized our model from a few layers of the teacher (Bert) itself! Please refer to `scripts/extract_for_distil.py` to create a valid initialization checkpoint and use `--from_pretrained_weights` and `--from_pretrained_config` arguments to use this initialization for the distilled training!
+**Tips:** Starting distillated training with good initialization of the model weights is crucial to reach decent performance. In our experiments, we initialized our model from a few layers of the teacher (Bert) itself! Please refer to `scripts/extract.py` and `scripts/extract_distilbert.py` to create a valid initialization checkpoint and use `--student_pretrained_weights` argument to use this initialization for the distilled training!
 Happy distillation!
--- a/examples/distillation/distiller.py
+++ b/examples/distillation/distiller.py
@@ -12,8 +12,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" The distiller to distil DistilBERT
+""" The distiller to distil the student.
-    adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM)
+    Adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM)
 """
 import os
 import math
@@ -28,16 +28,19 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from torch.optim import AdamW
 from torch.utils.data.distributed import DistributedSampler
 from torch.utils.data import RandomSampler, BatchSampler, DataLoader
-from pytorch_transformers import WarmupLinearSchedule
+from transformers import WarmupLinearSchedule
 from utils import logger
-from dataset import Dataset
+from lm_seqs_dataset import LmSeqsDataset
 from grouped_batch_sampler import GroupedBatchSampler, create_lengths_groups
 class Distiller:
    def __init__(self,
                 params: dict,
-                 dataloader: Dataset,
+                 dataset: LmSeqsDataset,
                 token_probs: torch.tensor,
                 student: nn.Module,
                 teacher: nn.Module):
@@ -50,33 +53,47 @@ class Distiller:
        self.student = student
        self.teacher = teacher
-        self.dataloader = dataloader
+        self.student_config = student.config
-        if self.params.n_gpu > 1:
+        self.vocab_size = student.config.vocab_size
-            self.dataloader.split()
+
-        self.get_iterator(seed=params.seed)
+        if params.n_gpu <= 1:
            sampler = RandomSampler(dataset)
        else:
            sampler = DistributedSampler(dataset)
        if params.group_by_size:
            groups = create_lengths_groups(lengths=dataset.lengths, k=params.max_model_input_size)
            sampler = GroupedBatchSampler(sampler=sampler, group_ids=groups, batch_size=params.batch_size)
        else:
            sampler = BatchSampler(sampler=sampler, batch_size=params.batch_size, drop_last=False)
        self.dataloader = DataLoader(dataset=dataset,
                                     batch_sampler=sampler,
                                     collate_fn=dataset.batch_sequences)
        self.temperature = params.temperature
        assert self.temperature > 0.
        self.alpha_ce = params.alpha_ce
        self.alpha_mlm = params.alpha_mlm
        self.alpha_clm = params.alpha_clm
        self.alpha_mse = params.alpha_mse
        self.alpha_cos = params.alpha_cos
        assert self.alpha_ce >= 0.
        assert self.alpha_mlm >= 0.
        assert self.alpha_mse >= 0.
        assert self.alpha_cos >= 0.
        assert self.alpha_ce + self.alpha_mlm + self.alpha_mse + self.alpha_cos > 0.
-        self.mlm_mask_prop = params.mlm_mask_prop
+        self.mlm = params.mlm
-        assert 0.0 <= self.mlm_mask_prop <= 1.0
+        if self.mlm:
-        assert params.word_mask + params.word_keep + params.word_rand == 1.0
+            logger.info(f'Using MLM loss for LM step.')
-        self.pred_probs = torch.FloatTensor([params.word_mask, params.word_keep, params.word_rand])
+            self.mlm_mask_prop = params.mlm_mask_prop
-        self.pred_probs = self.pred_probs.to(f'cuda:{params.local_rank}') if params.n_gpu > 0 else self.pred_probs
+            assert 0.0 <= self.mlm_mask_prop <= 1.0
-        self.token_probs = token_probs.to(f'cuda:{params.local_rank}') if params.n_gpu > 0 else token_probs
+            assert params.word_mask + params.word_keep + params.word_rand == 1.0
-        if self.fp16:
+            self.pred_probs = torch.FloatTensor([params.word_mask, params.word_keep, params.word_rand])
-            self.pred_probs = self.pred_probs.half()
+            self.pred_probs = self.pred_probs.to(f'cuda:{params.local_rank}') if params.n_gpu > 0 else self.pred_probs
-            self.token_probs = self.token_probs.half()
+            self.token_probs = token_probs.to(f'cuda:{params.local_rank}') if params.n_gpu > 0 else token_probs
            if self.fp16:
                self.pred_probs = self.pred_probs.half()
                self.token_probs = self.token_probs.half()
        else:
            logger.info(f'Using CLM loss for LM step.')
        self.epoch = 0
        self.n_iter = 0
@@ -86,12 +103,13 @@ class Distiller:
        self.last_loss = 0
        self.last_loss_ce = 0
        self.last_loss_mlm = 0
        self.last_loss_clm = 0
        if self.alpha_mse > 0.: self.last_loss_mse = 0
        if self.alpha_cos > 0.: self.last_loss_cos = 0
        self.last_log = 0
        self.ce_loss_fct = nn.KLDivLoss(reduction='batchmean')
-        self.mlm_loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
+        self.lm_loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
        if self.alpha_mse > 0.:
            self.mse_loss_fct = nn.MSELoss(reduction='sum')
        if self.alpha_cos > 0.:
@@ -99,7 +117,7 @@ class Distiller:
        logger.info('--- Initializing model optimizer')
        assert params.gradient_accumulation_steps >= 1
-        self.num_steps_epoch = int(len(self.dataloader) / params.batch_size) + 1
+        self.num_steps_epoch = len(self.dataloader)
        num_train_optimization_steps = int(self.num_steps_epoch / params.gradient_accumulation_steps * params.n_epoch) + 1
        no_decay = ['bias', 'LayerNorm.weight']
@@ -140,43 +158,18 @@ class Distiller:
                logger.info("Using nn.parallel.DistributedDataParallel for distributed training.")
                self.student = DistributedDataParallel(self.student,
                                                       device_ids=[params.local_rank],
-                                                       output_device=params.local_rank)
+                                                       output_device=params.local_rank,
                                                       find_unused_parameters=True)
        self.is_master = params.is_master
        if self.is_master:
            logger.info('--- Initializing Tensorboard')
            self.tensorboard = SummaryWriter(log_dir=os.path.join(self.dump_path, 'log', 'train'))
-            self.tensorboard.add_text(tag='config', text_string=str(self.params), global_step=0)
+            self.tensorboard.add_text(tag='config/training', text_string=str(self.params), global_step=0)
            self.tensorboard.add_text(tag='config/student', text_string=str(self.student_config), global_step=0)
-    def get_iterator(self,
+    def prepare_batch_mlm(self,
-                     seed: int = None):
+                          batch):
        """
        Initialize the data iterator.
        Each process has its own data iterator (iterating on his own random portion of the dataset).
        Input:
        ------
            seed: `int` - The random seed.
        """
        logger.info('--- Initializing Data Iterator')
        self.data_iterator = self.dataloader.get_iterator(seed=seed)
    def get_batch(self):
        """
        Call the data iterator to output a new batch.
        If the data iterator went through the whole dataset, create a new iterator.
        """
        assert hasattr(self, 'data_iterator')
        try:
            x = next(self.data_iterator)
        except StopIteration:
            logger.warning('--- Went through the whole dataset. Creating new data iterator.')
            self.data_iterator = self.dataloader.get_iterator()
            x = next(self.data_iterator)
        return x
    def prepare_batch(self,
                      batch):
        """
        Prepare the batch: from the token_ids and the lenghts, compute the attention mask and the masked label for MLM.
@@ -222,7 +215,7 @@ class Distiller:
                assert pred_mask.sum().item() % 8 == 0, pred_mask.sum().item()
        _token_ids_real = token_ids[pred_mask]
-        _token_ids_rand = _token_ids_real.clone().random_(self.params.vocab_size)
+        _token_ids_rand = _token_ids_real.clone().random_(self.vocab_size)
        _token_ids_mask = _token_ids_real.clone().fill_(self.params.special_tok_ids['mask_token'])
        probs = torch.multinomial(self.pred_probs, len(_token_ids_real), replacement=True)
        _token_ids = _token_ids_mask * (probs == 0).long() + _token_ids_real * (probs == 1).long() + _token_ids_rand * (probs == 2).long()
@@ -230,8 +223,41 @@ class Distiller:
        mlm_labels[~pred_mask] = -1 # previously `mlm_labels[1-pred_mask] = -1`, cf pytorch 1.2.0 compatibility
        # sanity checks
        assert 0 <= token_ids.min() <= token_ids.max() < self.vocab_size
        return token_ids, attn_mask, mlm_labels
    def prepare_batch_clm(self,
                          batch):
        """
        Prepare the batch: from the token_ids and the lenghts, compute the attention mask and the labels for CLM.
        Input:
        ------
            batch: `Tuple`
                token_ids: `torch.tensor(bs, seq_length)` - The token ids for each of the sequence. It is padded.
                lengths: `torch.tensor(bs)` - The lengths of each of the sequences in the batch.
        Output:
        -------
            token_ids: `torch.tensor(bs, seq_length)` - The token ids after the modifications for MLM.
            attn_mask: `torch.tensor(bs, seq_length)` - The attention mask for the self-attention.
            clm_labels: `torch.tensor(bs, seq_length)` - The causal languge modeling labels. There is a -1 where there is nothing to predict.
        """
        token_ids, lengths = batch
        token_ids, lengths = self.round_batch(x=token_ids, lengths=lengths)
        assert token_ids.size(0) == lengths.size(0)
        attn_mask = (torch.arange(token_ids.size(1), dtype=torch.long, device=lengths.device) < lengths[:, None])
        clm_labels = token_ids.new(token_ids.size()).copy_(token_ids)
        clm_labels[~attn_mask] = -1 # previously `clm_labels[1-attn_mask] = -1`, cf pytorch 1.2.0 compatibility
        # sanity checks
        assert 0 <= token_ids.min() <= token_ids.max() < self.vocab_size
        return token_ids, attn_mask, clm_labels
    def round_batch(self,
                    x: torch.tensor,
                    lengths: torch.tensor):
@@ -269,7 +295,10 @@ class Distiller:
        if ml1 % 8 != 0:
            pad = 8 - (ml1 % 8)
            ml2 = ml1 + pad
-            pad_id = self.params.special_tok_ids['pad_token']
+            if self.mlm:
                pad_id = self.params.special_tok_ids['pad_token']
            else:
                pad_id = self.params.special_tok_ids['unk_token']
            padding_tensor = torch.zeros(bs2, pad, dtype=torch.long, device=x.device).fill_(pad_id)
            x = torch.cat([x, padding_tensor], 1)
            assert x.size() == (bs2, ml2)
@@ -292,14 +321,16 @@ class Distiller:
            if self.multi_gpu:
                torch.distributed.barrier()
-            iter_bar = trange(self.num_steps_epoch, desc="-Iter", disable=self.params.local_rank not in [-1, 0])
+            iter_bar = tqdm(self.dataloader, desc="-Iter", disable=self.params.local_rank not in [-1, 0])
-            for __ in range(self.num_steps_epoch):
+            for batch in iter_bar:
                batch = self.get_batch()
                if self.params.n_gpu > 0:
                    batch = tuple(t.to(f'cuda:{self.params.local_rank}') for t in batch)
                token_ids, attn_mask, mlm_labels = self.prepare_batch(batch=batch)
-                self.step(input_ids=token_ids, attention_mask=attn_mask, mlm_labels=mlm_labels)
+                if self.mlm:
                    token_ids, attn_mask, lm_labels = self.prepare_batch_mlm(batch=batch)
                else:
                    token_ids, attn_mask, lm_labels = self.prepare_batch_clm(batch=batch)
                self.step(input_ids=token_ids, attention_mask=attn_mask, lm_labels=lm_labels)
                iter_bar.update()
                iter_bar.set_postfix({'Last_loss': f'{self.last_loss:.2f}',
@@ -317,7 +348,7 @@ class Distiller:
    def step(self,
             input_ids: torch.tensor,
             attention_mask: torch.tensor,
-             mlm_labels: torch.tensor):
+             lm_labels: torch.tensor):
        """
        One optimization step: forward of student AND teacher, backward on the loss (for gradient accumulation),
        and possibly a parameter update (depending on the gradient accumulation).
@@ -326,17 +357,22 @@ class Distiller:
        ------
        input_ids: `torch.tensor(bs, seq_length)` - The token ids.
        attention_mask: `torch.tensor(bs, seq_length)` - The attention mask for self attention.
-        mlm_labels: `torch.tensor(bs, seq_length)` - The masked language modeling labels.
+        lm_labels: `torch.tensor(bs, seq_length)` - The language modeling labels (mlm labels for MLM and clm labels for CLM).
        """
-        s_logits, s_hidden_states = self.student(input_ids=input_ids, attention_mask=attention_mask)     # (bs, seq_length, voc_size)
+        if self.mlm:
-        with torch.no_grad():
+            s_logits, s_hidden_states = self.student(input_ids=input_ids, attention_mask=attention_mask)     # (bs, seq_length, voc_size)
-            t_logits, t_hidden_states = self.teacher(input_ids=input_ids, attention_mask=attention_mask) # (bs, seq_length, voc_size)
+            with torch.no_grad():
                t_logits, t_hidden_states = self.teacher(input_ids=input_ids, attention_mask=attention_mask) # (bs, seq_length, voc_size)
        else:
            s_logits, _, s_hidden_states = self.student(input_ids=input_ids, attention_mask=None)            # (bs, seq_length, voc_size)
            with torch.no_grad():
                t_logits, _, t_hidden_states = self.teacher(input_ids=input_ids, attention_mask=None)           # (bs, seq_length, voc_size)
        assert s_logits.size() == t_logits.size()
        #https://github.com/peterliht/knowledge-distillation-pytorch/blob/master/model/net.py#L100
        #https://github.com/peterliht/knowledge-distillation-pytorch/issues/2
        if self.params.restrict_ce_to_mask:
-            mask = (mlm_labels>-1).unsqueeze(-1).expand_as(s_logits)   # (bs, seq_lenth, voc_size)
+            mask = (lm_labels>-1).unsqueeze(-1).expand_as(s_logits)    # (bs, seq_lenth, voc_size)
        else:
            mask = attention_mask.unsqueeze(-1).expand_as(s_logits)    # (bs, seq_lenth, voc_size)
        s_logits_slct = torch.masked_select(s_logits, mask)            # (bs * seq_length * voc_size) modulo the 1s in mask
@@ -348,13 +384,20 @@ class Distiller:
        loss_ce = self.ce_loss_fct(F.log_softmax(s_logits_slct/self.temperature, dim=-1),
                                   F.softmax(t_logits_slct/self.temperature, dim=-1)) * (self.temperature)**2
        loss = self.alpha_ce*loss_ce
        if self.alpha_mlm > 0.:
-            loss_mlm = self.mlm_loss_fct(s_logits.view(-1, s_logits.size(-1)), mlm_labels.view(-1))
+            loss_mlm = self.lm_loss_fct(s_logits.view(-1, s_logits.size(-1)), lm_labels.view(-1))
            loss += self.alpha_mlm * loss_mlm
        if self.alpha_clm > 0.:
            shift_logits = s_logits[..., :-1, :].contiguous()
            shift_labels = lm_labels[..., 1:].contiguous()
            loss_clm = self.lm_loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
                                        shift_labels.view(-1))
            loss += self.alpha_clm * loss_clm
        if self.alpha_mse > 0.:
            loss_mse = self.mse_loss_fct(s_logits_slct, t_logits_slct)/s_logits_slct.size(0) # Reproducing batchmean reduction
            loss += self.alpha_mse * loss_mse
        if self.alpha_cos > 0.:
            s_hidden_states = s_hidden_states[-1]                              # (bs, seq_length, dim)
            t_hidden_states = t_hidden_states[-1]                              # (bs, seq_length, dim)
@@ -376,6 +419,8 @@ class Distiller:
        self.last_loss_ce = loss_ce.item()
        if self.alpha_mlm > 0.:
            self.last_loss_mlm = loss_mlm.item()
        if self.alpha_clm > 0.:
            self.last_loss_clm = loss_clm.item()
        if self.alpha_mse > 0.:
            self.last_loss_mse = loss_mse.item()
        if self.alpha_cos > 0.:
@@ -452,6 +497,8 @@ class Distiller:
        self.tensorboard.add_scalar(tag="losses/loss_ce", scalar_value=self.last_loss_ce, global_step=self.n_total_iter)
        if self.alpha_mlm > 0.:
            self.tensorboard.add_scalar(tag="losses/loss_mlm", scalar_value=self.last_loss_mlm, global_step=self.n_total_iter)
        if self.alpha_clm > 0.:
            self.tensorboard.add_scalar(tag="losses/loss_clm", scalar_value=self.last_loss_clm, global_step=self.n_total_iter)
        if self.alpha_mse > 0.:
            self.tensorboard.add_scalar(tag="losses/loss_mse", scalar_value=self.last_loss_mse, global_step=self.n_total_iter)
        if self.alpha_cos > 0.:
--- a/examples/distillation/grouped_batch_sampler.py
+++ b/examples/distillation/grouped_batch_sampler.py
@@ -0,0 +1,105 @@
 # coding=utf-8
 # Copyright 2019-present, the HuggingFace Inc. team and Facebook, Inc.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """ Adapted from PyTorch Vision (https://github.com/pytorch/vision/blob/master/references/detection/group_by_aspect_ratio.py)
 """
 import bisect
 import copy
 from collections import defaultdict
 import numpy as np
 from torch.utils.data.sampler import BatchSampler, Sampler
 from utils import logger
 def _quantize(x, bins):
    bins = copy.deepcopy(bins)
    bins = sorted(bins)
    quantized = list(map(lambda y: bisect.bisect_right(bins, y), x))
    return quantized
 def create_lengths_groups(lengths, k=0):
    bins = np.arange(start=3, stop=k, step=4).tolist() if k > 0 else [10]
    groups = _quantize(lengths, bins)
    # count number of elements per group
    counts = np.unique(groups, return_counts=True)[1]
    fbins = [0] + bins + [np.inf]
    logger.info("Using {} as bins for aspect lengths quantization".format(fbins))
    logger.info("Count of instances per bin: {}".format(counts))
    return groups
 class GroupedBatchSampler(BatchSampler):
    """
    Wraps another sampler to yield a mini-batch of indices.
    It enforces that the batch only contain elements from the same group.
    It also tries to provide mini-batches which follows an ordering which is
    as close as possible to the ordering from the original sampler.
    Arguments:
        sampler (Sampler): Base sampler.
        group_ids (list[int]): If the sampler produces indices in range [0, N),
            `group_ids` must be a list of `N` ints which contains the group id of each sample.
            The group ids must be a continuous set of integers starting from
            0, i.e. they must be in the range [0, num_groups).
        batch_size (int): Size of mini-batch.
    """
    def __init__(self, sampler, group_ids, batch_size):
        if not isinstance(sampler, Sampler):
            raise ValueError(
                "sampler should be an instance of "
                "torch.utils.data.Sampler, but got sampler={}".format(sampler)
            )
        self.sampler = sampler
        self.group_ids = group_ids
        self.batch_size = batch_size
    def __iter__(self):
        buffer_per_group = defaultdict(list)
        samples_per_group = defaultdict(list)
        num_batches = 0
        for idx in self.sampler:
            group_id = self.group_ids[idx]
            buffer_per_group[group_id].append(idx)
            samples_per_group[group_id].append(idx)
            if len(buffer_per_group[group_id]) == self.batch_size:
                yield buffer_per_group[group_id] #TODO
                num_batches += 1
                del buffer_per_group[group_id]
            assert len(buffer_per_group[group_id]) < self.batch_size
        # now we have run out of elements that satisfy
        # the group criteria, let's return the remaining
        # elements so that the size of the sampler is
        # deterministic
        expected_num_batches = len(self)
        num_remaining = expected_num_batches - num_batches
        if num_remaining > 0:
            # for the remaining batches, group the batches by similar lengths
            batch_idx = []
            for group_id, idxs in sorted(buffer_per_group.items(), key=lambda x: x[0]):
                batch_idx.extend(idxs)
                if len(batch_idx) >= self.batch_size:
                    yield batch_idx[:self.batch_size]
                    batch_idx = batch_idx[self.batch_size:]
                    num_remaining -= 1
            if len(batch_idx) > 0:
                yield batch_idx
                num_remaining -= 1
        assert num_remaining == 0
    def __len__(self):
        """
        Return the number of mini-batches rather than the number of samples.
        """
        return (len(self.sampler) + self.batch_size - 1) // self.batch_size
--- a/examples/distillation/lm_seqs_dataset.py
+++ b/examples/distillation/lm_seqs_dataset.py
@@ -12,30 +12,33 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" Dataloaders to train DistilBERT
+""" Dataset to distilled models
    adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM)
 """
 from typing import List
 import math
 from itertools import chain
 from collections import Counter
 import numpy as np
 import torch
 from torch.utils.data import Dataset
 import numpy as np
 from utils import logger
-class Dataset:
+class LmSeqsDataset(Dataset):
    """Custom Dataset wrapping language modeling sequences.
    Each sample will be retrieved by indexing the list of token_ids and their corresponding lengths.
    Input:
    ------
        params: `NameSpace` parameters
        data: `List[np.array[int]]
    """
    def __init__(self,
                 params,
                 data):
        self.params = params
        self.tokens_per_batch = params.tokens_per_batch
        self.batch_size = params.batch_size
        self.shuffle = params.shuffle
        self.group_by_size = params.group_by_size
        self.token_ids = np.array(data)
-        self.lengths = np.uint16([len(t) for t in data])
+        self.lengths = np.array([len(t) for t in data])
        self.check()
        self.remove_long_sequences()
@@ -43,6 +46,9 @@ class Dataset:
        self.check()
        self.print_statistics()
    def __getitem__(self, index):
        return (self.token_ids[index], self.lengths[index])
    def __len__(self):
        return len(self.lengths)
@@ -51,12 +57,14 @@ class Dataset:
        Some sanity checks
        """
        assert len(self.token_ids) == len(self.lengths)
        assert all(self.lengths[i] == len(self.token_ids[i]) for i in range(len(self.lengths))) 
    def remove_long_sequences(self):
        """
-        Sequences that are too long are splitted by chunk of max_position_embeddings.
+        Sequences that are too long are splitted by chunk of max_model_input_size.
        """
-        indices = self.lengths >= self.params.max_position_embeddings
+        max_len = self.params.max_model_input_size
        indices = self.lengths > max_len
        logger.info(f'Splitting {sum(indices)} too long sequences.')
        def divide_chunks(l, n):
@@ -64,10 +72,13 @@ class Dataset:
        new_tok_ids = []
        new_lengths = []
-        cls_id, sep_id = self.params.special_tok_ids['cls_token'], self.params.special_tok_ids['sep_token']
+        if self.params.mlm:
-        max_len = self.params.max_position_embeddings
+            cls_id, sep_id = self.params.special_tok_ids['cls_token'], self.params.special_tok_ids['sep_token']
        else:
            cls_id, sep_id = self.params.special_tok_ids['bos_token'], self.params.special_tok_ids['eos_token']
        for seq_, len_ in zip(self.token_ids, self.lengths):
            assert (seq_[0] == cls_id) and (seq_[-1] == sep_id), seq_
            if len_ <= max_len:
                new_tok_ids.append(seq_)
                new_lengths.append(len_)
@@ -79,6 +90,7 @@ class Dataset:
                    if sub_s[-1] != sep_id:
                        sub_s = np.insert(sub_s, len(sub_s), sep_id)
                    assert len(sub_s) <= max_len
                    assert (sub_s[0] == cls_id) and (sub_s[-1] == sep_id), sub_s
                    sub_seqs.append(sub_s)
                new_tok_ids.extend(sub_seqs)
@@ -113,89 +125,27 @@ class Dataset:
        # nb_unkown = sum([(t==unk_idx).sum() for t in self.token_ids])
        # logger.info(f'{nb_unkown} unknown tokens (covering {100*nb_unkown/data_len:.2f}% of the data)')
    def select_data(self, a: int, b: int):
        """
        Select a subportion of the data.
        """
        n_sequences = len(self)
        assert 0 <= a < b <= n_sequences, ValueError(f'`0 <= a < b <= n_sequences` is not met with a={a} and b={b}')
        logger.info(f'Selecting sequences from {a} to {b} (excluded).')
        self.token_ids = self.token_ids[a:b]
        self.lengths = self.lengths[a:b]
        self.check()
    def split(self):
        """
        Distributed training: split the data accross the processes.
        """
        assert self.params.n_gpu > 1
        logger.info('Splitting the data accross the processuses.')
        n_seq = len(self)
        n_seq_per_procesus = n_seq // self.params.world_size
        a = n_seq_per_procesus * self.params.global_rank
        b = a + n_seq_per_procesus
        self.select_data(a=a, b=b)
    def batch_sequences(self,
-                        token_ids: List[List[int]],
+                        batch):
                        lengths: List[int]):
        """
        Do the padding and transform into torch.tensor.
        """
        token_ids = [t[0] for t in batch]
        lengths = [t[1] for t in batch]
        assert len(token_ids) == len(lengths)
        # Max for paddings
        max_seq_len_ = max(lengths)
        # Pad token ids
-        pad_idx = self.params.special_tok_ids['pad_token']
+        if self.params.mlm:
            pad_idx = self.params.special_tok_ids['pad_token']
        else:
            pad_idx = self.params.special_tok_ids['unk_token']
        tk_ = [list(t.astype(int)) + [pad_idx]*(max_seq_len_-len(t)) for t in token_ids]
        assert len(tk_) == len(token_ids)
        assert all(len(t) == max_seq_len_ for t in tk_)
-        tk_t = torch.tensor(tk_)                  # (bs, max_seq_len_)
+        tk_t = torch.tensor(tk_)      # (bs, max_seq_len_)
-        lg_t = torch.tensor(lengths.astype(int))  # (bs)
+        lg_t = torch.tensor(lengths)  # (bs)
        return tk_t, lg_t
    def get_batches_iterator(self,
                             batches):
        """
        Return an iterator over batches.
        """
        for sequences_ids in batches:
            token_ids, lengths = self.batch_sequences(self.token_ids[sequences_ids],
                                                    self.lengths[sequences_ids])
            yield (token_ids, lengths)
    def get_iterator(self,
                     seed: int = None):
        """
        Return a data iterator.
        """
        rng = np.random.RandomState(seed)
        n_sequences = len(self)
        indices = np.arange(n_sequences)
        if self.group_by_size:
            indices = indices[np.argsort(self.lengths[indices], kind='mergesort')]
        if self.tokens_per_batch == -1:
            batches = np.array_split(indices, math.ceil(len(indices) * 1. / self.batch_size))
        else:
            assert self.tokens_per_batch > 0
            batch_ids = np.cumsum(self.lengths[indices]) // self.tokens_per_batch
            _, bounds = np.unique(batch_ids, return_index=True)
            batches = [indices[bounds[i]:bounds[i + 1]] for i in range(len(bounds) - 1)]
            if bounds[-1] < len(indices):
                batches.append(indices[bounds[-1]:])
        if self.shuffle:
            rng.shuffle(batches)
        assert n_sequences == sum([len(x) for x in batches])
        assert self.lengths[indices].sum() == sum([self.lengths[x].sum() for x in batches])
        return self.get_batches_iterator(batches=batches)
--- a/examples/distillation/requirements.txt
+++ b/examples/distillation/requirements.txt
@@ -3,4 +3,4 @@ tensorboard>=1.14.0
 tensorboardX==1.8
 psutil==5.6.3
 scipy==1.3.1
-pytorch_transformers==1.2.0
+transformers==2.0.0
--- a/examples/distillation/scripts/binarized_data.py
+++ b/examples/distillation/scripts/binarized_data.py
@@ -13,14 +13,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-Preprocessing script before training DistilBERT.
+Preprocessing script before distillation.
 """
 import argparse
 import pickle
 import random
 import time
 import numpy as np
-from pytorch_transformers import BertTokenizer, RobertaTokenizer
+from transformers import BertTokenizer, RobertaTokenizer, GPT2Tokenizer
 import logging
 logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
@@ -32,7 +32,7 @@ def main():
    parser = argparse.ArgumentParser(description="Preprocess the data to avoid re-doing it several times by (tokenization + token_to_ids).")
    parser.add_argument('--file_path', type=str, default='data/dump.txt',
                        help='The path to the data.')
-    parser.add_argument('--tokenizer_type', type=str, default='bert', choices=['bert', 'roberta'])
+    parser.add_argument('--tokenizer_type', type=str, default='bert', choices=['bert', 'roberta', 'gpt2'])
    parser.add_argument('--tokenizer_name', type=str, default='bert-base-uncased',
                        help="The tokenizer to use.")
    parser.add_argument('--dump_file', type=str, default='data/dump',
@@ -43,10 +43,16 @@ def main():
    logger.info(f'Loading Tokenizer ({args.tokenizer_name})')
    if args.tokenizer_type == 'bert':
        tokenizer = BertTokenizer.from_pretrained(args.tokenizer_name)
        bos = tokenizer.special_tokens_map['cls_token'] # `[CLS]`
        sep = tokenizer.special_tokens_map['sep_token'] # `[SEP]`
    elif args.tokenizer_type == 'roberta':
        tokenizer = RobertaTokenizer.from_pretrained(args.tokenizer_name)
-    bos = tokenizer.special_tokens_map['bos_token'] # `[CLS]` for bert, `<s>` for roberta
+        bos = tokenizer.special_tokens_map['cls_token'] # `<s>`
-    sep = tokenizer.special_tokens_map['sep_token'] # `[SEP]` for bert, `</s>` for roberta
+        sep = tokenizer.special_tokens_map['sep_token'] # `</s>`
    elif args.tokenizer_type == 'gpt2':
        tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer_name)
        bos = tokenizer.special_tokens_map['bos_token'] # `<|endoftext|>`
        sep = tokenizer.special_tokens_map['eos_token'] # `<|endoftext|>`    
    logger.info(f'Loading text from {args.file_path}')
    with open(args.file_path, 'r', encoding='utf8') as fp:
--- a/examples/distillation/scripts/extract.py
+++ b/examples/distillation/scripts/extract.py
@@ -0,0 +1,89 @@
 # coding=utf-8
 # Copyright 2019-present, the HuggingFace Inc. team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 Preprocessing script before training the distilled model.
 Specific to RoBERTa -> DistilRoBERTa and GPT2 -> DistilGPT2.
 """
 from transformers import BertForMaskedLM, RobertaForMaskedLM, GPT2LMHeadModel
 import torch
 import argparse
 if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Extraction some layers of the full RobertaForMaskedLM or GPT2LMHeadModel for Transfer Learned Distillation")
    parser.add_argument("--model_type", default="roberta", choices=["roberta", "gpt2"])
    parser.add_argument("--model_name", default='roberta-large', type=str)
    parser.add_argument("--dump_checkpoint", default='serialization_dir/tf_roberta_048131723.pth', type=str)
    parser.add_argument("--vocab_transform", action='store_true')
    args = parser.parse_args()
    if args.model_type == 'roberta':
        model = RobertaForMaskedLM.from_pretrained(args.model_name)
        prefix = 'roberta'
    elif args.model_type == 'gpt2':
        model = GPT2LMHeadModel.from_pretrained(args.model_name)
        prefix = 'transformer'
    state_dict = model.state_dict()
    compressed_sd = {}
    ### Embeddings ###
    if args.model_type == 'gpt2':
        for param_name in ['wte.weight', 'wpe.weight']:
            compressed_sd[f'{prefix}.{param_name}'] = state_dict[f'{prefix}.{param_name}']
    else:
        for w in ['word_embeddings', 'position_embeddings', 'token_type_embeddings']:
            param_name = f'{prefix}.embeddings.{w}.weight'
            compressed_sd[param_name] = state_dict[param_name]
        for w in ['weight', 'bias']:
            param_name = f'{prefix}.embeddings.LayerNorm.{w}'
            compressed_sd[param_name] = state_dict[param_name]
    ### Transformer Blocks ###
    std_idx = 0
    for teacher_idx in [0, 2, 4, 7, 9, 11]:
        if args.model_type == 'gpt2':
            for layer in ['ln_1', 'attn.c_attn', 'attn.c_proj', 'ln_2', 'mlp.c_fc', 'mlp.c_proj']:
                for w in ['weight', 'bias']:
                    compressed_sd[f'{prefix}.h.{std_idx}.{layer}.{w}'] = \
                        state_dict[f'{prefix}.h.{teacher_idx}.{layer}.{w}']
            compressed_sd[f'{prefix}.h.{std_idx}.attn.bias'] = state_dict[f'{prefix}.h.{teacher_idx}.attn.bias']
        else:
            for layer in ['attention.self.query', 'attention.self.key', 'attention.self.value',
                        'attention.output.dense', 'attention.output.LayerNorm',
                        'intermediate.dense', 'output.dense', 'output.LayerNorm']:
                for w in ['weight', 'bias']:
                    compressed_sd[f'{prefix}.encoder.layer.{std_idx}.{layer}.{w}'] = \
                        state_dict[f'{prefix}.encoder.layer.{teacher_idx}.{layer}.{w}']
        std_idx += 1
    ### Language Modeling Head ###s
    if args.model_type == 'roberta':
        for layer in ['lm_head.decoder.weight', 'lm_head.bias']:
            compressed_sd[f'{layer}'] = state_dict[f'{layer}']
        if args.vocab_transform:
            for w in ['weight', 'bias']:
                compressed_sd[f'lm_head.dense.{w}'] = state_dict[f'lm_head.dense.{w}']
                compressed_sd[f'lm_head.layer_norm.{w}'] = state_dict[f'lm_head.layer_norm.{w}']
    elif args.model_type == 'gpt2':
        for w in ['weight', 'bias']:
            compressed_sd[f'{prefix}.ln_f.{w}'] = state_dict[f'{prefix}.ln_f.{w}']
        compressed_sd[f'lm_head.weight'] = state_dict[f'lm_head.weight']
    print(f'N layers selected for distillation: {std_idx}')
    print(f'Number of params transfered for distillation: {len(compressed_sd.keys())}')
    print(f'Save transfered checkpoint to {args.dump_checkpoint}.')
    torch.save(compressed_sd, args.dump_checkpoint)
--- a/examples/distillation/scripts/extract_distilbert.py
+++ b/examples/distillation/scripts/extract_distilbert.py
@@ -14,14 +14,15 @@
 # limitations under the License.
 """
 Preprocessing script before training DistilBERT.
 Specific to BERT -> DistilBERT.
 """
-from pytorch_transformers import BertForMaskedLM, RobertaForMaskedLM
+from transformers import BertForMaskedLM, RobertaForMaskedLM
 import torch
 import argparse
 if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Extraction some layers of the full BertForMaskedLM or RObertaForMaskedLM for Transfer Learned Distillation")
-    parser.add_argument("--model_type", default="bert", choices=["bert", "roberta"])
+    parser.add_argument("--model_type", default="bert", choices=["bert"])
    parser.add_argument("--model_name", default='bert-base-uncased', type=str)
    parser.add_argument("--dump_checkpoint", default='serialization_dir/tf_bert-base-uncased_0247911.pth', type=str)
    parser.add_argument("--vocab_transform", action='store_true')
@@ -31,9 +32,8 @@ if __name__ == '__main__':
    if args.model_type == 'bert':
        model = BertForMaskedLM.from_pretrained(args.model_name)
        prefix = 'bert'
-    elif args.model_type == 'roberta':
+    else:
-        model = RobertaForMaskedLM.from_pretrained(args.model_name)
+        raise ValueError(f'args.model_type should be "bert".')
        prefix = 'roberta'
    state_dict = model.state_dict()
    compressed_sd = {}
@@ -68,20 +68,12 @@ if __name__ == '__main__':
                state_dict[f'{prefix}.encoder.layer.{teacher_idx}.output.LayerNorm.{w}']
        std_idx += 1
-    if args.model_type == 'bert':
+    compressed_sd[f'vocab_projector.weight'] = state_dict[f'cls.predictions.decoder.weight']
-        compressed_sd[f'vocab_projector.weight'] = state_dict[f'cls.predictions.decoder.weight']
+    compressed_sd[f'vocab_projector.bias'] = state_dict[f'cls.predictions.bias']
-        compressed_sd[f'vocab_projector.bias'] = state_dict[f'cls.predictions.bias']
+    if args.vocab_transform:
-        if args.vocab_transform:
+        for w in ['weight', 'bias']:
-            for w in ['weight', 'bias']:
+            compressed_sd[f'vocab_transform.{w}'] = state_dict[f'cls.predictions.transform.dense.{w}']
-                compressed_sd[f'vocab_transform.{w}'] = state_dict[f'cls.predictions.transform.dense.{w}']
+            compressed_sd[f'vocab_layer_norm.{w}'] = state_dict[f'cls.predictions.transform.LayerNorm.{w}']
                compressed_sd[f'vocab_layer_norm.{w}'] = state_dict[f'cls.predictions.transform.LayerNorm.{w}']
    elif args.model_type == 'roberta':
        compressed_sd[f'vocab_projector.weight'] = state_dict[f'lm_head.decoder.weight']
        compressed_sd[f'vocab_projector.bias'] = state_dict[f'lm_head.bias']
        if args.vocab_transform:
            for w in ['weight', 'bias']:
                compressed_sd[f'vocab_transform.{w}'] = state_dict[f'lm_head.dense.{w}']
                compressed_sd[f'vocab_layer_norm.{w}'] = state_dict[f'lm_head.layer_norm.{w}']
    print(f'N layers selected for distillation: {std_idx}')
    print(f'Number of params transfered for distillation: {len(compressed_sd.keys())}')
--- a/examples/distillation/scripts/token_counts.py
+++ b/examples/distillation/scripts/token_counts.py
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-Preprocessing script before training DistilBERT.
+Preprocessing script before training the distilled model.
 """
 from collections import Counter
 import argparse
--- a/examples/distillation/train.py
+++ b/examples/distillation/train.py
@@ -13,7 +13,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-Training DistilBERT.
+Training the distilled model.
 Supported architectures include: BERT -> DistilBERT, RoBERTa -> DistilRoBERTa, GPT2 -> DistilGPT2.
 """
 import os
 import argparse
@@ -23,68 +24,96 @@ import shutil
 import numpy as np
 import torch
-from pytorch_transformers import BertTokenizer, BertForMaskedLM, RobertaTokenizer, RobertaForMaskedLM
+from transformers import BertConfig, BertForMaskedLM, BertTokenizer
-from pytorch_transformers import DistilBertForMaskedLM, DistilBertConfig
+from transformers import RobertaConfig, RobertaForMaskedLM, RobertaTokenizer
 from transformers import DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer
 from transformers import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer
 from distiller import Distiller
 from utils import git_log, logger, init_gpu_params, set_seed
-from dataset import Dataset
+from lm_seqs_dataset import LmSeqsDataset
 MODEL_CLASSES = {
    'distilbert': (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer),
    'roberta': (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer),
    'bert': (BertConfig, BertForMaskedLM, BertTokenizer),
    'gpt2': (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer)
 }
 def sanity_checks(args):
    """
    A bunch of args sanity checks to perform even starting...
    """
    assert (args.mlm and args.alpha_mlm > 0.) or (not args.mlm and args.alpha_mlm == 0.)
    assert (args.alpha_mlm > 0. and args.alpha_clm == 0.) or (args.alpha_mlm == 0. and args.alpha_clm > 0.)
    if args.mlm:
        assert os.path.isfile(args.token_counts)
        assert (args.student_type in ['roberta', 'distilbert']) and (args.teacher_type in ['roberta', 'bert'])
    else:
        assert (args.student_type in ['gpt2']) and (args.teacher_type in ['gpt2'])
    assert args.teacher_type == args.student_type or (args.student_type=='distilbert' and args.teacher_type=='bert')
    assert os.path.isfile(args.student_config)
    if args.student_pretrained_weights is not None:
        assert os.path.isfile(args.student_pretrained_weights)
    if args.freeze_token_type_embds: assert args.student_type in ['roberta']
    assert args.alpha_ce >= 0.
    assert args.alpha_mlm >= 0.
    assert args.alpha_clm >= 0.
    assert args.alpha_mse >= 0.
    assert args.alpha_cos >= 0.
    assert args.alpha_ce + args.alpha_mlm + args.alpha_clm + args.alpha_mse + args.alpha_cos > 0.
 def freeze_pos_embeddings(student, args):
    if args.student_type == 'roberta':
        student.roberta.embeddings.position_embeddings.weight.requires_grad = False
    elif args.student_type == 'gpt2':
        student.transformer.wpe.weight.requires_grad = False
 def freeze_token_type_embeddings(student, args):
    if args.student_type == 'roberta':
        student.roberta.embeddings.token_type_embeddings.weight.requires_grad = False
 def main():
    parser = argparse.ArgumentParser(description="Training")
    parser.add_argument("--force", action='store_true',
                        help="Overwrite dump_path if it already exists.")
    parser.add_argument("--dump_path", type=str, required=True,
                        help="The output directory (log, checkpoints, parameters, etc.)")
    parser.add_argument("--data_file", type=str, required=True,
                        help="The binarized file (tokenized + tokens_to_ids) and grouped by sequence.")
    parser.add_argument("--token_counts", type=str, required=True,
                        help="The token counts in the data_file for MLM.")
    parser.add_argument("--force", action='store_true',
                        help="Overwrite dump_path if it already exists.")
-    parser.add_argument("--vocab_size", default=30522, type=int,
+    parser.add_argument("--student_type", type=str, choices=["distilbert", "roberta", "gpt2"], required=True,
-                        help="The vocabulary size.")
+                        help="The student type (DistilBERT, RoBERTa).")
-    parser.add_argument("--max_position_embeddings", default=512, type=int,
+    parser.add_argument("--student_config", type=str, required=True,
-                        help="Maximum sequence length we can model (including [CLS] and [SEP]).")
+                        help="Path to the student configuration.")
-    parser.add_argument("--sinusoidal_pos_embds", action='store_false',
+    parser.add_argument("--student_pretrained_weights", default=None, type=str,
                        help="If true, the position embeddings are simply fixed with sinusoidal embeddings.")
    parser.add_argument("--n_layers", default=6, type=int,
                        help="Number of Transformer blocks.")
    parser.add_argument("--n_heads", default=12, type=int,
                        help="Number of heads in the self-attention module.")
    parser.add_argument("--dim", default=768, type=int,
                        help="Dimension through the network. Must be divisible by n_heads")
    parser.add_argument("--hidden_dim", default=3072, type=int,
                        help="Intermediate dimension in the FFN.")
    parser.add_argument("--dropout", default=0.1, type=float,
                        help="Dropout.")
    parser.add_argument("--attention_dropout", default=0.1, type=float,
                        help="Dropout in self-attention.")
    parser.add_argument("--activation", default='gelu', type=str,
                        help="Activation to use in self-attention")
    parser.add_argument("--tie_weights_", action='store_false',
                        help="If true, we tie the embeddings matrix with the projection over the vocabulary matrix. Default is true.")
    parser.add_argument("--from_pretrained_weights", default=None, type=str,
                        help="Load student initialization checkpoint.")
-    parser.add_argument("--from_pretrained_config", default=None, type=str,
+
-                        help="Load student initialization architecture config.")
+    parser.add_argument("--teacher_type", choices=["bert", "roberta", "gpt2"], required=True,
    parser.add_argument("--teacher_type", default="bert", choices=["bert", "roberta"],
                        help="Teacher type (BERT, RoBERTa).")
-    parser.add_argument("--teacher_name", default="bert-base-uncased", type=str,
+    parser.add_argument("--teacher_name", type=str, required=True,
                        help="The teacher model.")
    parser.add_argument("--temperature", default=2., type=float,
                        help="Temperature for the softmax temperature.")
    parser.add_argument("--alpha_ce", default=0.5, type=float,
                        help="Linear weight for the distillation loss. Must be >=0.")
-    parser.add_argument("--alpha_mlm", default=0.5, type=float,
+    parser.add_argument("--alpha_mlm", default=0.0, type=float,
-                        help="Linear weight for the MLM loss. Must be >=0.")
+                        help="Linear weight for the MLM loss. Must be >=0. Should be used in coonjunction with `mlm` flag.")
    parser.add_argument("--alpha_clm", default=0.5, type=float,
                        help="Linear weight for the CLM loss. Must be >=0.")
    parser.add_argument("--alpha_mse", default=0.0, type=float,
                        help="Linear weight of the MSE loss. Must be >=0.")
    parser.add_argument("--alpha_cos", default=0.0, type=float,
                        help="Linear weight of the cosine embedding loss. Must be >=0.")
    parser.add_argument("--mlm", action="store_true",
                        help="The LM step: MLM or CLM. If `mlm` is True, the MLM is used over CLM.")
    parser.add_argument("--mlm_mask_prop", default=0.15, type=float,
                        help="Proportion of tokens for which we need to make a prediction.")
    parser.add_argument("--word_mask", default=0.8, type=float,
@@ -95,17 +124,20 @@ def main():
                        help="Proportion of tokens to randomly replace.")
    parser.add_argument("--mlm_smoothing", default=0.7, type=float,
                        help="Smoothing parameter to emphasize more rare tokens (see XLM, similar to word2vec).")
    parser.add_argument("--token_counts", type=str,
                        help="The token counts in the data_file for MLM.")
    parser.add_argument("--restrict_ce_to_mask", action='store_true',
                        help="If true, compute the distilation loss only the [MLM] prediction distribution.")
    parser.add_argument("--freeze_pos_embs", action="store_true",
                        help="Freeze positional embeddings during distillation. For student_type in ['roberta', 'gpt2'] only.")
    parser.add_argument("--freeze_token_type_embds", action="store_true",
                        help="Freeze token type embeddings during distillation if existent. For student_type in ['roberta'] only.")
    parser.add_argument("--n_epoch", type=int, default=3,
                        help="Number of pass on the whole dataset.")
    parser.add_argument("--batch_size", type=int, default=5,
                        help="Batch size (for each process).")
    parser.add_argument("--tokens_per_batch", type=int, default=-1,
                        help="If specified, modify the batches so that they have approximately this number of tokens.")
    parser.add_argument("--shuffle", action='store_false',
                        help="If true, shuffle the sequence order. Default is true.")
    parser.add_argument("--group_by_size", action='store_false',
                        help="If true, group sequences that have similar length into the same batch. Default is true.")
@@ -141,6 +173,7 @@ def main():
    parser.add_argument("--checkpoint_interval", type=int, default=4000,
                        help="Checkpoint interval.")
    args = parser.parse_args()
    sanity_checks(args)
    ## ARGS ##
@@ -164,21 +197,19 @@ def main():
        with open(os.path.join(args.dump_path, 'parameters.json'), 'w') as f:
            json.dump(vars(args), f, indent=4)
        git_log(args.dump_path)
    assert (args.from_pretrained_weights is None and args.from_pretrained_config is None) or \
           (args.from_pretrained_weights is not None and args.from_pretrained_config is not None)
    student_config_class, student_model_class, _ = MODEL_CLASSES[args.student_type]
    teacher_config_class, teacher_model_class, teacher_tokenizer_class = MODEL_CLASSES[args.teacher_type]
    ### TOKENIZER ###
-    if args.teacher_type == 'bert':
+    tokenizer = teacher_tokenizer_class.from_pretrained(args.teacher_name)
        tokenizer = BertTokenizer.from_pretrained(args.teacher_name)
    elif args.teacher_type == 'roberta':
        tokenizer = RobertaTokenizer.from_pretrained(args.teacher_name)
    special_tok_ids = {}
    for tok_name, tok_symbol in tokenizer.special_tokens_map.items():
        idx = tokenizer.all_special_tokens.index(tok_symbol)
        special_tok_ids[tok_name] = tokenizer.all_special_ids[idx]
    logger.info(f'Special tokens {special_tok_ids}')
    args.special_tok_ids = special_tok_ids
    args.max_model_input_size = tokenizer.max_model_input_sizes[args.teacher_name]
    ## DATA LOADER ##
@@ -187,35 +218,34 @@ def main():
        data = pickle.load(fp)
-    assert os.path.isfile(args.token_counts)
+    if args.mlm:
-    logger.info(f'Loading token counts from {args.token_counts} (already pre-computed)')
+        logger.info(f'Loading token counts from {args.token_counts} (already pre-computed)')
-    with open(args.token_counts, 'rb') as fp:
+        with open(args.token_counts, 'rb') as fp:
-        counts = pickle.load(fp)
+            counts = pickle.load(fp)
-        assert len(counts) == args.vocab_size
+        
-    token_probs = np.maximum(counts, 1) ** -args.mlm_smoothing
+        token_probs = np.maximum(counts, 1) ** -args.mlm_smoothing
-    for idx in special_tok_ids.values():
+        for idx in special_tok_ids.values():
-        token_probs[idx] = 0.  # do not predict special tokens
+            token_probs[idx] = 0.  # do not predict special tokens
-    token_probs = torch.from_numpy(token_probs)
+        token_probs = torch.from_numpy(token_probs)
    else:
        token_probs = None
-    train_dataloader = Dataset(params=args, data=data)
+    train_lm_seq_dataset = LmSeqsDataset(params=args, data=data)
    logger.info(f'Data loader created.')
    ## STUDENT ##
-    if args.from_pretrained_weights is not None:
+    logger.info(f'Loading student config from {args.student_config}')
-        assert os.path.isfile(args.from_pretrained_weights)
+    stu_architecture_config = student_config_class.from_pretrained(args.student_config)
-        assert os.path.isfile(args.from_pretrained_config)
+    stu_architecture_config.output_hidden_states = True
-        logger.info(f'Loading pretrained weights from {args.from_pretrained_weights}')
+
-        logger.info(f'Loading pretrained config from {args.from_pretrained_config}')
+    if args.student_pretrained_weights is not None:
-        stu_architecture_config = DistilBertConfig.from_json_file(args.from_pretrained_config)
+        logger.info(f'Loading pretrained weights from {args.student_pretrained_weights}')
-        stu_architecture_config.output_hidden_states = True
+        student = student_model_class.from_pretrained(args.student_pretrained_weights,
-        student = DistilBertForMaskedLM.from_pretrained(args.from_pretrained_weights,
+                                                      config=stu_architecture_config)
                                                        config=stu_architecture_config)
    else:
-        args.vocab_size_or_config_json_file = args.vocab_size
+        student = student_model_class(stu_architecture_config)
        stu_architecture_config = DistilBertConfig(**vars(args), output_hidden_states=True)
        student = DistilBertForMaskedLM(stu_architecture_config)
    if args.n_gpu > 0:
@@ -224,18 +254,31 @@ def main():
    ## TEACHER ##
-    if args.teacher_type == 'bert':
+    teacher = teacher_model_class.from_pretrained(args.teacher_name, output_hidden_states=True)
        teacher = BertForMaskedLM.from_pretrained(args.teacher_name, output_hidden_states=True)
    elif args.teacher_type == 'roberta':
        teacher = RobertaForMaskedLM.from_pretrained(args.teacher_name, output_hidden_states=True)
    if args.n_gpu > 0:
        teacher.to(f'cuda:{args.local_rank}')
    logger.info(f'Teacher loaded from {args.teacher_name}.')
    ## FREEZING ##
    if args.freeze_pos_embs:
        freeze_pos_embeddings(student, args)
    if args.freeze_token_type_embds:
        freeze_token_type_embeddings(student, args)
    ## SANITY CHECKS ##
    assert student.config.vocab_size == teacher.config.vocab_size
    assert student.config.hidden_size == teacher.config.hidden_size
    assert student.config.max_position_embeddings == teacher.config.max_position_embeddings
    if args.mlm:
        assert token_probs.size(0) == stu_architecture_config.vocab_size
    ## DISTILLER ##
    torch.cuda.empty_cache()
    distiller = Distiller(params=args,
-                          dataloader=train_dataloader,
+                          dataset=train_lm_seq_dataset,
                          token_probs=token_probs,
                          student=student,
                          teacher=teacher)
--- a/examples/distillation/training_configs/distilbert-base-uncased.json
+++ b/examples/distillation/training_configs/distilbert-base-uncased.json
@@ -0,0 +1,15 @@
 {
 	"activation": "gelu",
 	"attention_dropout": 0.1,
 	"dim": 768,
 	"dropout": 0.1,
 	"hidden_dim": 3072,
 	"initializer_range": 0.02,
 	"max_position_embeddings": 512,
 	"n_heads": 12,
 	"n_layers": 6,
 	"sinusoidal_pos_embds": true,
 	"tie_weights_": true,
 	"vocab_size": 30522
  }
--- a/examples/distillation/training_configs/distilgpt2.json
+++ b/examples/distillation/training_configs/distilgpt2.json
@@ -0,0 +1,10 @@
 {
 	"initializer_range": 0.02,
 	"layer_norm_epsilon": 0.00001,
 	"n_ctx": 1024,
 	"n_embd": 768,
 	"n_head": 12,
 	"n_layer": 6,
 	"n_positions": 1024,
 	"vocab_size": 50257
 }
--- a/examples/run_bertology.py
+++ b/examples/run_bertology.py
@@ -32,7 +32,7 @@ from torch.utils.data import DataLoader, SequentialSampler, TensorDataset, Subse
 from torch.utils.data.distributed import DistributedSampler
 from torch.nn import CrossEntropyLoss, MSELoss
-from pytorch_transformers import (WEIGHTS_NAME,
+from transformers import (WEIGHTS_NAME,
                                  BertConfig, BertForSequenceClassification, BertTokenizer,
                                  XLMConfig, XLMForSequenceClassification, XLMTokenizer,
                                  XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer)
--- a/examples/run_generation.py
+++ b/examples/run_generation.py
@@ -26,12 +26,13 @@ import torch
 import torch.nn.functional as F
 import numpy as np
-from pytorch_transformers import GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig
+from transformers import GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig, XLMConfig
-from pytorch_transformers import GPT2LMHeadModel, GPT2Tokenizer
+from transformers import GPT2LMHeadModel, GPT2Tokenizer
-from pytorch_transformers import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer
+from transformers import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer
-from pytorch_transformers import XLNetLMHeadModel, XLNetTokenizer
+from transformers import XLNetLMHeadModel, XLNetTokenizer
-from pytorch_transformers import TransfoXLLMHeadModel, TransfoXLTokenizer
+from transformers import TransfoXLLMHeadModel, TransfoXLTokenizer
 from transformers import XLMWithLMHeadModel, XLMTokenizer
 logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
@@ -41,13 +42,14 @@ logger = logging.getLogger(__name__)
 MAX_LENGTH = int(10000)  # Hardcoded max length to avoid infinite loop
-ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig)), ())
+ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig, XLMConfig)), ())
 MODEL_CLASSES = {
    'gpt2': (GPT2LMHeadModel, GPT2Tokenizer),
    'openai-gpt': (OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
    'xlnet': (XLNetLMHeadModel, XLNetTokenizer),
    'transfo-xl': (TransfoXLLMHeadModel, TransfoXLTokenizer),
    'xlm': (XLMWithLMHeadModel, XLMTokenizer),
 }
 # Padding text to help Transformer-XL and XLNet with short prompts as proposed by Aman Rusia
@@ -103,7 +105,8 @@ def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')
    return logits
-def sample_sequence(model, length, context, num_samples=1, temperature=1, top_k=0, top_p=0.0, is_xlnet=False, device='cpu'):
+def sample_sequence(model, length, context, num_samples=1, temperature=1, top_k=0, top_p=0.0, is_xlnet=False,
                    xlm_lang=None, device='cpu'):
    context = torch.tensor(context, dtype=torch.long, device=device)
    context = context.unsqueeze(0).repeat(num_samples, 1)
    generated = context
@@ -121,6 +124,9 @@ def sample_sequence(model, length, context, num_samples=1, temperature=1, top_k=
                target_mapping[0, 0, -1] = 1.0  # predict last token
                inputs = {'input_ids': input_ids, 'perm_mask': perm_mask, 'target_mapping': target_mapping}
            if xlm_lang is not None:
                inputs["langs"] = torch.tensor([xlm_lang] * inputs["input_ids"].shape[1]).view(1, -1)
            outputs = model(**inputs)  # Note: we could also use 'past' with GPT-2/Transfo-XL/XLNet (cached hidden-states)
            next_token_logits = outputs[0][0, -1, :] / temperature
            filtered_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p)
@@ -137,6 +143,7 @@ def main():
                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
    parser.add_argument("--prompt", type=str, default="")
    parser.add_argument("--padding_text", type=str, default="")
    parser.add_argument("--xlm_lang", type=str, default="", help="Optional language when used with the XLM model.")
    parser.add_argument("--length", type=int, default=20)
    parser.add_argument("--temperature", type=float, default=1.0)
    parser.add_argument("--top_k", type=int, default=0)
@@ -170,6 +177,18 @@ def main():
    print(args)
    while True:
        xlm_lang = None
        # XLM Language usage detailed in the issues #1414
        if args.model_type in ["xlm"] and hasattr(tokenizer, 'lang2id') and hasattr(model.config, 'use_lang_emb') \
                and model.config.use_lang_emb:
            if args.xlm_lang:
                language = args.xlm_lang
            else:
                language = None
                while language not in tokenizer.lang2id.keys():
                    language = input("Using XLM. Select language in " + str(list(tokenizer.lang2id.keys())) + " >>> ")
            xlm_lang = tokenizer.lang2id[language]
        raw_text = args.prompt if args.prompt else input("Model prompt >>> ")
        if args.model_type in ["transfo-xl", "xlnet"]:
            # Models with memory likes to have a long prompt for short inputs.
@@ -182,12 +201,15 @@ def main():
            temperature=args.temperature,
            top_k=args.top_k,
            top_p=args.top_p,
            device=args.device,
            is_xlnet=bool(args.model_type == "xlnet"),
            xlm_lang=xlm_lang,
            device=args.device,
        )
        out = out[0, len(context_tokens):].tolist()
-        text = tokenizer.decode(out, clean_up_tokenization_spaces=True)
+
        text = tokenizer.decode(out, clean_up_tokenization_spaces=True, skip_special_tokens=True)
        text = text[: text.find(args.stop_token) if args.stop_token else None]
        print(text)
        if args.prompt:
            break
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -31,7 +31,7 @@ from torch.utils.data.distributed import DistributedSampler
 from tensorboardX import SummaryWriter
 from tqdm import tqdm, trange
-from pytorch_transformers import (WEIGHTS_NAME, BertConfig,
+from transformers import (WEIGHTS_NAME, BertConfig,
                                  BertForSequenceClassification, BertTokenizer,
                                  RobertaConfig,
                                  RobertaForSequenceClassification,
@@ -39,22 +39,29 @@ from pytorch_transformers import (WEIGHTS_NAME, BertConfig,
                                  XLMConfig, XLMForSequenceClassification,
                                  XLMTokenizer, XLNetConfig,
                                  XLNetForSequenceClassification,
-                                  XLNetTokenizer)
+                                  XLNetTokenizer,
                                  DistilBertConfig,
                                  DistilBertForSequenceClassification,
                                  DistilBertTokenizer)
-from pytorch_transformers import AdamW, WarmupLinearSchedule
+from transformers import AdamW, WarmupLinearSchedule
-from utils_glue import (compute_metrics, convert_examples_to_features,
+from transformers import glue_compute_metrics as compute_metrics
-                        output_modes, processors)
+from transformers import glue_output_modes as output_modes
 from transformers import glue_processors as processors
 from transformers import glue_convert_examples_to_features as convert_examples_to_features
 logger = logging.getLogger(__name__)
-ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, XLMConfig, RobertaConfig)), ())
+ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, XLMConfig, 
                                                                                RobertaConfig, DistilBertConfig)), ())
 MODEL_CLASSES = {
    'bert': (BertConfig, BertForSequenceClassification, BertTokenizer),
    'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
    'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
    'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
    'distilbert': (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer)
 }
@@ -128,10 +135,11 @@ def train(args, train_dataset, model, tokenizer):
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1],
                      'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM and RoBERTa don't use segment_ids
                      'labels':         batch[3]}
            if args.model_type != 'distilbert':
                inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None  # XLM, DistilBERT and RoBERTa don't use segment_ids
            outputs = model(**inputs)
-            loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)
+            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
            if args.n_gpu > 1:
                loss = loss.mean() # mean() to average on multi-gpu parallel training
@@ -148,8 +156,8 @@ def train(args, train_dataset, model, tokenizer):
            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                scheduler.step()  # Update learning rate schedule
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1
@@ -218,8 +226,9 @@ def evaluate(args, model, tokenizer, prefix=""):
            with torch.no_grad():
                inputs = {'input_ids':      batch[0],
                          'attention_mask': batch[1],
                          'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM and RoBERTa don't use segment_ids
                          'labels':         batch[3]}
                if args.model_type != 'distilbert':
                    inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None  # XLM, DistilBERT and RoBERTa don't use segment_ids
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]
@@ -272,15 +281,14 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
            # HACK(label indices are swapped in RoBERTa pretrained model)
            label_list[1], label_list[2] = label_list[2], label_list[1] 
        examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
-        features = convert_examples_to_features(examples, label_list, args.max_seq_length, tokenizer, output_mode,
+        features = convert_examples_to_features(examples,
-            cls_token_at_end=bool(args.model_type in ['xlnet']),            # xlnet has a cls token at the end
+                                                tokenizer,
-            cls_token=tokenizer.cls_token,
+                                                label_list=label_list,
-            cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0,
+                                                max_length=args.max_seq_length,
-            sep_token=tokenizer.sep_token,
+                                                output_mode=output_mode,
-            sep_token_extra=bool(args.model_type in ['roberta']),           # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
+                                                pad_on_left=bool(args.model_type in ['xlnet']),                 # pad on the left for xlnet
-            pad_on_left=bool(args.model_type in ['xlnet']),                 # pad on the left for xlnet
+                                                pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
-            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
+                                                pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
@@ -291,14 +299,14 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
-    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
+    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
-    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
+    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
-        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
+        all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    elif output_mode == "regression":
-        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float)
+        all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
-    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
+    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
    return dataset
@@ -478,7 +486,7 @@ def main():
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
-            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
+            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
--- a/examples/run_lm_finetuning.py
+++ b/examples/run_lm_finetuning.py
@@ -35,11 +35,12 @@ from torch.utils.data.distributed import DistributedSampler
 from tensorboardX import SummaryWriter
 from tqdm import tqdm, trange
-from pytorch_transformers import (WEIGHTS_NAME, AdamW, WarmupLinearSchedule,
+from transformers import (WEIGHTS_NAME, AdamW, WarmupLinearSchedule,
                                  BertConfig, BertForMaskedLM, BertTokenizer,
                                  GPT2Config, GPT2LMHeadModel, GPT2Tokenizer,
                                  OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer,
-                                  RobertaConfig, RobertaForMaskedLM, RobertaTokenizer)
+                                  RobertaConfig, RobertaForMaskedLM, RobertaTokenizer,
                                  DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer)
 logger = logging.getLogger(__name__)
@@ -49,7 +50,8 @@ MODEL_CLASSES = {
    'gpt2': (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer),
    'openai-gpt': (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
    'bert': (BertConfig, BertForMaskedLM, BertTokenizer),
-    'roberta': (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer)
+    'roberta': (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer),
    'distilbert': (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer)
 }
@@ -57,7 +59,7 @@ class TextDataset(Dataset):
    def __init__(self, tokenizer, file_path='train', block_size=512):
        assert os.path.isfile(file_path)
        directory, filename = os.path.split(file_path)
-        cached_features_file = os.path.join(directory, f'cached_lm_{block_size}_{filename}')
+        cached_features_file = os.path.join(directory, 'cached_lm_{}_{}'.format(block_size, filename))
        if os.path.exists(cached_features_file):
            logger.info("Loading features from cached file %s", cached_features_file)
@@ -72,9 +74,8 @@ class TextDataset(Dataset):
            tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
-            while len(tokenized_text) >= block_size:  # Truncate in block of block_size
+            for i in range(0, len(tokenized_text)-block_size+1, block_size): # Truncate in block of block_size
-                self.examples.append(tokenizer.add_special_tokens_single_sentence(tokenized_text[:block_size]))
+                self.examples.append(tokenizer.add_special_tokens_single_sequence(tokenized_text[i:i+block_size]))
                tokenized_text = tokenized_text[block_size:]
            # Note that we are loosing the last truncated example here for the sake of simplicity (no padding)
            # If your dataset is small, first you should loook for a bigger one :-) and second you
            # can change this behavior by adding (model specific) padding.
@@ -186,7 +187,7 @@ def train(args, train_dataset, model, tokenizer):
            labels = labels.to(args.device)
            model.train()
            outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels)
-            loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)
+            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
@@ -380,7 +381,7 @@ def main():
    parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
    args = parser.parse_args()
-    if args.model_type in ["bert", "roberta"] and not args.mlm:
+    if args.model_type in ["bert", "roberta", "distilbert"] and not args.mlm:
        raise ValueError("BERT and RoBERTa do not have LM heads but masked LM heads. They must be run using the --mlm "
                         "flag (masked language modeling).")
    if args.eval_data_file is None and args.do_eval:
@@ -479,7 +480,7 @@ def main():
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
-            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
+            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
--- a/examples/run_multiple_choice.py
+++ b/examples/run_multiple_choice.py
@@ -32,13 +32,13 @@ from torch.utils.data.distributed import DistributedSampler
 from tensorboardX import SummaryWriter
 from tqdm import tqdm, trange
-from pytorch_transformers import (WEIGHTS_NAME, BertConfig,
+from transformers import (WEIGHTS_NAME, BertConfig,
                                  BertForMultipleChoice, BertTokenizer,
                                  XLNetConfig, XLNetForMultipleChoice,
                                  XLNetTokenizer, RobertaConfig,
                                  RobertaForMultipleChoice, RobertaTokenizer)
-from pytorch_transformers import AdamW, WarmupLinearSchedule
+from transformers import AdamW, WarmupLinearSchedule
 from utils_multiple_choice import (convert_examples_to_features, processors)
@@ -141,7 +141,7 @@ def train(args, train_dataset, model, tokenizer):
                      'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM don't use segment_ids
                      'labels':         batch[3]}
            outputs = model(**inputs)
-            loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)
+            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
            if args.n_gpu > 1:
                loss = loss.mean() # mean() to average on multi-gpu parallel training
@@ -508,7 +508,7 @@ def main():
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
-            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
+            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
@@ -524,7 +524,7 @@ def main():
        checkpoints = [args.output_dir]
        # if args.eval_all_checkpoints: # can not use this to do test!!
        #     checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
-        #     logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
+        #     logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -32,14 +32,15 @@ from tqdm import tqdm, trange
 from tensorboardX import SummaryWriter
-from pytorch_transformers import (WEIGHTS_NAME, BertConfig,
+from transformers import (WEIGHTS_NAME, BertConfig,
                                  BertForQuestionAnswering, BertTokenizer,
                                  XLMConfig, XLMForQuestionAnswering,
                                  XLMTokenizer, XLNetConfig,
                                  XLNetForQuestionAnswering,
-                                  XLNetTokenizer)
+                                  XLNetTokenizer,
                                  DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)
-from pytorch_transformers import AdamW, WarmupLinearSchedule
+from transformers import AdamW, WarmupLinearSchedule
 from utils_squad import (read_squad_examples, convert_examples_to_features,
                         RawResult, write_predictions,
@@ -59,6 +60,7 @@ MODEL_CLASSES = {
    'bert': (BertConfig, BertForQuestionAnswering, BertTokenizer),
    'xlnet': (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer),
    'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer),
    'distilbert': (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)
 }
 def set_seed(args):
@@ -140,7 +142,7 @@ def train(args, train_dataset, model, tokenizer):
                inputs.update({'cls_index': batch[5],
                               'p_mask':       batch[6]})
            outputs = model(**inputs)
-            loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)
+            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
            if args.n_gpu > 1:
                loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training
@@ -508,7 +510,7 @@ def main():
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
-            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce model loading logs
+            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce model loading logs
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
--- a/examples/run_tf_glue.py
+++ b/examples/run_tf_glue.py
@@ -0,0 +1,40 @@
 import tensorflow as tf
 import tensorflow_datasets
 from transformers import BertTokenizer, TFBertForSequenceClassification, glue_convert_examples_to_features, BertForSequenceClassification
 # Load dataset, tokenizer, model from pretrained model/vocabulary
 tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
 model = TFBertForSequenceClassification.from_pretrained('bert-base-cased')
 data = tensorflow_datasets.load('glue/mrpc')
 # Prepare dataset for GLUE as a tf.data.Dataset instance
 train_dataset = glue_convert_examples_to_features(data['train'], tokenizer, 128, 'mrpc')
 valid_dataset = glue_convert_examples_to_features(data['validation'], tokenizer, 128, 'mrpc')
 train_dataset = train_dataset.shuffle(100).batch(32).repeat(2)
 valid_dataset = valid_dataset.batch(64)
 # Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule 
 optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
 loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
 metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
 model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
 # Train and evaluate using tf.keras.Model.fit()
 history = model.fit(train_dataset, epochs=2, steps_per_epoch=115,
                    validation_data=valid_dataset, validation_steps=7)
 # Load the TensorFlow model in PyTorch for inspection
 model.save_pretrained('./save/')
 pytorch_model = BertForSequenceClassification.from_pretrained('./save/', from_tf=True)
 # Quickly test a few predictions - MRPC is a paraphrasing task, let's see if our model learned the task
 sentence_0 = "This research was consistent with his findings."
 sentence_1 = "His findings were compatible with this research."
 sentence_2 = "His findings were not compatible with this research."
 inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors='pt')
 inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors='pt')
 pred_1 = pytorch_model(**inputs_1)[0].argmax().item()
 pred_2 = pytorch_model(**inputs_2)[0].argmax().item()
 print("sentence_1 is", "a paraphrase" if pred_1 else "not a paraphrase", "of sentence_0")
 print("sentence_2 is", "a paraphrase" if pred_2 else "not a paraphrase", "of sentence_0")
--- a/examples/utils_squad.py
+++ b/examples/utils_squad.py
@@ -24,7 +24,7 @@ import math
 import collections
 from io import open
-from pytorch_transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize
+from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize
 # Required by XLNet evaluation method to compute optimal threshold (see write_predictions_extended() method)
 from utils_squad_evaluate import find_all_best_thresh_v2, make_qid_to_has_ans, get_raw_scores
--- a/hubconf.py
+++ b/hubconf.py
@@ -1,7 +1,7 @@
-from pytorch_transformers import (
+from transformers import (
    AutoTokenizer, AutoConfig, AutoModel, AutoModelWithLMHead, AutoModelForSequenceClassification, AutoModelForQuestionAnswering
 )
-from pytorch_transformers.file_utils import add_start_docstrings
+from transformers.file_utils import add_start_docstrings
 dependencies = ['torch', 'tqdm', 'boto3', 'requests', 'regex', 'sentencepiece', 'sacremoses']
@@ -11,12 +11,12 @@ def config(*args, **kwargs):
                # Using torch.hub !
                import torch
-                config = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased')  # Download configuration from S3 and cache.
+                config = torch.hub.load('huggingface/transformers', 'config', 'bert-base-uncased')  # Download configuration from S3 and cache.
-                config = torch.hub.load('huggingface/pytorch-transformers', 'config', './test/bert_saved_model/')  # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
+                config = torch.hub.load('huggingface/transformers', 'config', './test/bert_saved_model/')  # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
-                config = torch.hub.load('huggingface/pytorch-transformers', 'config', './test/bert_saved_model/my_configuration.json')
+                config = torch.hub.load('huggingface/transformers', 'config', './test/bert_saved_model/my_configuration.json')
-                config = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased', output_attention=True, foo=False)
+                config = torch.hub.load('huggingface/transformers', 'config', 'bert-base-uncased', output_attention=True, foo=False)
                assert config.output_attention == True
-                config, unused_kwargs = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased', output_attention=True, foo=False, return_unused_kwargs=True)
+                config, unused_kwargs = torch.hub.load('huggingface/transformers', 'config', 'bert-base-uncased', output_attention=True, foo=False, return_unused_kwargs=True)
                assert config.output_attention == True
                assert unused_kwargs == {'foo': False}
@@ -31,8 +31,8 @@ def tokenizer(*args, **kwargs):
        # Using torch.hub !
        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'bert-base-uncased')    # Download vocabulary from S3 and cache.
+        tokenizer = torch.hub.load('huggingface/transformers', 'tokenizer', 'bert-base-uncased')    # Download vocabulary from S3 and cache.
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', './test/bert_saved_model/')  # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`
+        tokenizer = torch.hub.load('huggingface/transformers', 'tokenizer', './test/bert_saved_model/')  # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`
    """
@@ -45,13 +45,13 @@ def model(*args, **kwargs):
            # Using torch.hub !
            import torch
-            model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
+            model = torch.hub.load('huggingface/transformers', 'model', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
-            model = torch.hub.load('huggingface/pytorch-transformers', 'model', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+            model = torch.hub.load('huggingface/transformers', 'model', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-            model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-uncased', output_attention=True)  # Update configuration during loading
+            model = torch.hub.load('huggingface/transformers', 'model', 'bert-base-uncased', output_attention=True)  # Update configuration during loading
            assert model.config.output_attention == True
            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
-            model = torch.hub.load('huggingface/pytorch-transformers', 'model', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+            model = torch.hub.load('huggingface/transformers', 'model', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
        """
@@ -63,13 +63,13 @@ def modelWithLMHead(*args, **kwargs):
        # Using torch.hub !
        import torch
-        model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
+        model = torch.hub.load('huggingface/transformers', 'modelWithLMHead', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
-        model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+        model = torch.hub.load('huggingface/transformers', 'modelWithLMHead', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-        model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', 'bert-base-uncased', output_attention=True)  # Update configuration during loading
+        model = torch.hub.load('huggingface/transformers', 'modelWithLMHead', 'bert-base-uncased', output_attention=True)  # Update configuration during loading
        assert model.config.output_attention == True
        # Loading from a TF checkpoint file instead of a PyTorch model (slower)
        config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
-        model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+        model = torch.hub.load('huggingface/transformers', 'modelWithLMHead', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
    """
    return AutoModelWithLMHead.from_pretrained(*args, **kwargs)
@@ -81,13 +81,13 @@ def modelForSequenceClassification(*args, **kwargs):
            # Using torch.hub !
            import torch
-            model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
+            model = torch.hub.load('huggingface/transformers', 'modelForSequenceClassification', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
-            model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+            model = torch.hub.load('huggingface/transformers', 'modelForSequenceClassification', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-            model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', 'bert-base-uncased', output_attention=True)  # Update configuration during loading
+            model = torch.hub.load('huggingface/transformers', 'modelForSequenceClassification', 'bert-base-uncased', output_attention=True)  # Update configuration during loading
            assert model.config.output_attention == True
            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
-            model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+            model = torch.hub.load('huggingface/transformers', 'modelForSequenceClassification', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
        """
@@ -100,13 +100,13 @@ def modelForQuestionAnswering(*args, **kwargs):
        # Using torch.hub !
        import torch
-        model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
+        model = torch.hub.load('huggingface/transformers', 'modelForQuestionAnswering', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
-        model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+        model = torch.hub.load('huggingface/transformers', 'modelForQuestionAnswering', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-        model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', 'bert-base-uncased', output_attention=True)  # Update configuration during loading
+        model = torch.hub.load('huggingface/transformers', 'modelForQuestionAnswering', 'bert-base-uncased', output_attention=True)  # Update configuration during loading
        assert model.config.output_attention == True
        # Loading from a TF checkpoint file instead of a PyTorch model (slower)
        config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
-        model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+        model = torch.hub.load('huggingface/transformers', 'modelForQuestionAnswering', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
    """
    return AutoModelForQuestionAnswering.from_pretrained(*args, **kwargs)
--- a/pytorch_transformers/init.py
+++ b/pytorch_transformers/init.py
@@ -1,75 +0,0 @@
 __version__ = "1.2.0"
 # Work around to update TensorFlow's absl.logging threshold which alters the
 # default Python logging output behavior when present.
 # see: https://github.com/abseil/abseil-py/issues/99
 # and: https://github.com/tensorflow/tensorflow/issues/26691#issuecomment-500369493
 try:
    import absl.logging
    absl.logging.set_verbosity('info')
    absl.logging.set_stderrthreshold('info')
    absl.logging._warn_preinit_stderr = False
 except:
    pass
 # Tokenizer
 from .tokenization_utils import (PreTrainedTokenizer)
 from .tokenization_auto import AutoTokenizer
 from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer
 from .tokenization_openai import OpenAIGPTTokenizer
 from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus)
 from .tokenization_gpt2 import GPT2Tokenizer
 from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE
 from .tokenization_xlm import XLMTokenizer
 from .tokenization_roberta import RobertaTokenizer
 from .tokenization_distilbert import DistilBertTokenizer
 # Configurations
 from .configuration_utils import PretrainedConfig
 from .configuration_auto import AutoConfig
 from .configuration_bert import BertConfig, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_openai import OpenAIGPTConfig, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_transfo_xl import TransfoXLConfig, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_gpt2 import GPT2Config, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_xlnet import XLNetConfig, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_xlm import XLMConfig, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_roberta import RobertaConfig, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
 # Modeling
 from .modeling_utils import (PreTrainedModel, prune_layer, Conv1D)
 from .modeling_auto import (AutoModel, AutoModelForSequenceClassification, AutoModelForQuestionAnswering,
                            AutoModelWithLMHead)
 from .modeling_bert import (BertPreTrainedModel, BertModel, BertForPreTraining,
                            BertForMaskedLM, BertForNextSentencePrediction,
                            BertForSequenceClassification, BertForMultipleChoice,
                            BertForTokenClassification, BertForQuestionAnswering,
                            load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP)
 from .modeling_openai import (OpenAIGPTPreTrainedModel, OpenAIGPTModel,
                              OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel,
                              load_tf_weights_in_openai_gpt, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP)
 from .modeling_transfo_xl import (TransfoXLPreTrainedModel, TransfoXLModel, TransfoXLLMHeadModel,
                                  load_tf_weights_in_transfo_xl, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP)
 from .modeling_gpt2 import (GPT2PreTrainedModel, GPT2Model,
                            GPT2LMHeadModel, GPT2DoubleHeadsModel,
                            load_tf_weights_in_gpt2, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
 from .modeling_xlnet import (XLNetPreTrainedModel, XLNetModel, XLNetLMHeadModel,
                             XLNetForSequenceClassification, XLNetForQuestionAnswering, XLNetForMultipleChoice,
                             load_tf_weights_in_xlnet, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
 from .modeling_xlm import (XLMPreTrainedModel , XLMModel,
                           XLMWithLMHeadModel, XLMForSequenceClassification,
                           XLMForQuestionAnswering, XLM_PRETRAINED_MODEL_ARCHIVE_MAP)
 from .modeling_roberta import (RobertaForMaskedLM, RobertaModel, RobertaForSequenceClassification,
                               RobertaForMultipleChoice, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
 from .modeling_distilbert import (DistilBertForMaskedLM, DistilBertModel,
                               DistilBertForSequenceClassification, DistilBertForQuestionAnswering,
                               DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
 # Optimization
 from .optimization import (AdamW, ConstantLRSchedule, WarmupConstantSchedule, WarmupCosineSchedule,
                           WarmupCosineWithHardRestartsSchedule, WarmupLinearSchedule)
 # Files and general utilities
 from .file_utils import (PYTORCH_TRANSFORMERS_CACHE, PYTORCH_PRETRAINED_BERT_CACHE,
                         cached_path, add_start_docstrings, add_end_docstrings,
                         WEIGHTS_NAME, TF_WEIGHTS_NAME, CONFIG_NAME)
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,3 @@
 # PyTorch
 torch>=1.0.0
 # progress bars in model download and training scripts
 tqdm
 # Accessing files from S3 directly.
--- a/setup.py
+++ b/setup.py
@@ -13,11 +13,11 @@ To create the package for pypi.
 4. Build both the sources and the wheel. Do not change anything in setup.py between
   creating the wheel and the source distribution (obviously).
-   For the wheel, run: "python setup.py bdist_wheel" in the top level allennlp directory.
+   For the wheel, run: "python setup.py bdist_wheel" in the top level directory.
   (this will build a wheel for the python version you use to build it - make sure you use python 3.x).
   For the sources, run: "python setup.py sdist"
-   You should now have a /dist directory with both .whl and .tar.gz source versions of allennlp.
+   You should now have a /dist directory with both .whl and .tar.gz source versions.
 5. Check that everything looks correct by uploading the package to the pypi test server:
@@ -25,7 +25,7 @@ To create the package for pypi.
   (pypi suggest using twine as other methods upload files via plaintext.)
   Check that you can install it in a virtualenv by running:
-   pip install -i https://testpypi.python.org/pypi pytorch-transformers
+   pip install -i https://testpypi.python.org/pypi transformers
 6. Upload the final version to actual pypi:
   twine upload dist/* -r pypi
@@ -37,20 +37,19 @@ from io import open
 from setuptools import find_packages, setup
 setup(
-    name="pytorch_transformers",
+    name="transformers",
-    version="1.2.0",
+    version="2.0.0",
-    author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Google AI Language Team Authors, Open AI team Authors",
+    author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Google AI Language Team Authors, Open AI team Authors, Facebook AI Authors, Carnegie Mellon University Authors",
    author_email="thomas@huggingface.co",
-    description="Repository of pre-trained NLP Transformer models: BERT & RoBERTa, GPT & GPT-2, Transformer-XL, XLNet and XLM",
+    description="State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch",
    long_description=open("README.md", "r", encoding='utf-8').read(),
    long_description_content_type="text/markdown",
-    keywords='NLP deep learning transformer pytorch BERT GPT GPT-2 google openai CMU',
+    keywords='NLP deep learning transformer pytorch tensorflow BERT GPT GPT-2 google openai CMU',
    license='Apache',
-    url="https://github.com/huggingface/pytorch-transformers",
+    url="https://github.com/huggingface/transformers",
    packages=find_packages(exclude=["*.tests", "*.tests.*",
                                    "tests.*", "tests"]),
-    install_requires=['torch>=1.0.0',
+    install_requires=['numpy',
                      'numpy',
                      'boto3',
                      'requests',
                      'tqdm',
@@ -59,7 +58,7 @@ setup(
                      'sacremoses'],
    entry_points={
      'console_scripts': [
-        "pytorch_transformers=pytorch_transformers.__main__:main",
+        "transformers=transformers.__main__:main",
      ]
    },
    # python_requires='>=3.5.0',
--- a/transformers/init.py
+++ b/transformers/init.py
@@ -0,0 +1,165 @@
 __version__ = "2.0.0"
 # Work around to update TensorFlow's absl.logging threshold which alters the
 # default Python logging output behavior when present.
 # see: https://github.com/abseil/abseil-py/issues/99
 # and: https://github.com/tensorflow/tensorflow/issues/26691#issuecomment-500369493
 try:
    import absl.logging
    absl.logging.set_verbosity('info')
    absl.logging.set_stderrthreshold('info')
    absl.logging._warn_preinit_stderr = False
 except:
    pass
 import logging
 logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
 # Files and general utilities
 from .file_utils import (TRANSFORMERS_CACHE, PYTORCH_TRANSFORMERS_CACHE, PYTORCH_PRETRAINED_BERT_CACHE,
                         cached_path, add_start_docstrings, add_end_docstrings,
                         WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, CONFIG_NAME,
                         is_tf_available, is_torch_available)
 from .data import (is_sklearn_available,
                   InputExample, InputFeatures, DataProcessor,
                   glue_output_modes, glue_convert_examples_to_features,
                   glue_processors, glue_tasks_num_labels)
 if is_sklearn_available():
    from .data import glue_compute_metrics
 # Tokenizers
 from .tokenization_utils import (PreTrainedTokenizer)
 from .tokenization_auto import AutoTokenizer
 from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer
 from .tokenization_openai import OpenAIGPTTokenizer
 from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus)
 from .tokenization_gpt2 import GPT2Tokenizer
 from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE
 from .tokenization_xlm import XLMTokenizer
 from .tokenization_roberta import RobertaTokenizer
 from .tokenization_distilbert import DistilBertTokenizer
 # Configurations
 from .configuration_utils import PretrainedConfig
 from .configuration_auto import AutoConfig
 from .configuration_bert import BertConfig, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_openai import OpenAIGPTConfig, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_transfo_xl import TransfoXLConfig, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_gpt2 import GPT2Config, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_xlnet import XLNetConfig, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_xlm import XLMConfig, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_roberta import RobertaConfig, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
 # Modeling
 if is_torch_available():
    from .modeling_utils import (PreTrainedModel, prune_layer, Conv1D)
    from .modeling_auto import (AutoModel, AutoModelForSequenceClassification, AutoModelForQuestionAnswering,
                                AutoModelWithLMHead)
    from .modeling_bert import (BertPreTrainedModel, BertModel, BertForPreTraining,
                                BertForMaskedLM, BertForNextSentencePrediction,
                                BertForSequenceClassification, BertForMultipleChoice,
                                BertForTokenClassification, BertForQuestionAnswering,
                                load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP)
    from .modeling_openai import (OpenAIGPTPreTrainedModel, OpenAIGPTModel,
                                OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel,
                                load_tf_weights_in_openai_gpt, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP)
    from .modeling_transfo_xl import (TransfoXLPreTrainedModel, TransfoXLModel, TransfoXLLMHeadModel,
                                    load_tf_weights_in_transfo_xl, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP)
    from .modeling_gpt2 import (GPT2PreTrainedModel, GPT2Model,
                                GPT2LMHeadModel, GPT2DoubleHeadsModel,
                                load_tf_weights_in_gpt2, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
    from .modeling_xlnet import (XLNetPreTrainedModel, XLNetModel, XLNetLMHeadModel,
                                XLNetForSequenceClassification, XLNetForMultipleChoice,
                                XLNetForQuestionAnsweringSimple, XLNetForQuestionAnswering,
                                load_tf_weights_in_xlnet, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
    from .modeling_xlm import (XLMPreTrainedModel , XLMModel,
                            XLMWithLMHeadModel, XLMForSequenceClassification,
                            XLMForQuestionAnswering, XLMForQuestionAnsweringSimple,
                            XLM_PRETRAINED_MODEL_ARCHIVE_MAP)
    from .modeling_roberta import (RobertaForMaskedLM, RobertaModel,
                                RobertaForSequenceClassification, RobertaForMultipleChoice,
                                ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
    from .modeling_distilbert import (DistilBertForMaskedLM, DistilBertModel,
                                DistilBertForSequenceClassification, DistilBertForQuestionAnswering,
                                DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
    # Optimization
    from .optimization import (AdamW, ConstantLRSchedule, WarmupConstantSchedule, WarmupCosineSchedule,
                               WarmupCosineWithHardRestartsSchedule, WarmupLinearSchedule)
 # TensorFlow
 if is_tf_available():
    from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary
    from .modeling_tf_auto import (TFAutoModel, TFAutoModelForSequenceClassification, TFAutoModelForQuestionAnswering,
                                   TFAutoModelWithLMHead)
    from .modeling_tf_bert import (TFBertPreTrainedModel, TFBertMainLayer, TFBertEmbeddings,
                                   TFBertModel, TFBertForPreTraining,
                                   TFBertForMaskedLM, TFBertForNextSentencePrediction,
                                   TFBertForSequenceClassification, TFBertForMultipleChoice,
                                   TFBertForTokenClassification, TFBertForQuestionAnswering,
                                   load_bert_pt_weights_in_tf2,
                                   TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP)
    from .modeling_tf_gpt2 import (TFGPT2PreTrainedModel, TFGPT2MainLayer,
                                   TFGPT2Model, TFGPT2LMHeadModel, TFGPT2DoubleHeadsModel,
                                   load_gpt2_pt_weights_in_tf2,
                                   TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
    from .modeling_tf_openai import (TFOpenAIGPTPreTrainedModel, TFOpenAIGPTMainLayer,
                                     TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel, TFOpenAIGPTDoubleHeadsModel,
                                     load_openai_gpt_pt_weights_in_tf2,
                                     TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP)
    from .modeling_tf_transfo_xl import (TFTransfoXLPreTrainedModel, TFTransfoXLMainLayer,
                                         TFTransfoXLModel, TFTransfoXLLMHeadModel,
                                         load_transfo_xl_pt_weights_in_tf2,
                                         TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP)
    from .modeling_tf_xlnet import (TFXLNetPreTrainedModel, TFXLNetMainLayer,
                                    TFXLNetModel, TFXLNetLMHeadModel,
                                    TFXLNetForSequenceClassification,
                                    TFXLNetForQuestionAnsweringSimple,
                                    load_xlnet_pt_weights_in_tf2,
                                    TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
    from .modeling_tf_xlm import (TFXLMPreTrainedModel, TFXLMMainLayer,
                                  TFXLMModel, TFXLMWithLMHeadModel,
                                  TFXLMForSequenceClassification,
                                  TFXLMForQuestionAnsweringSimple,
                                  load_xlm_pt_weights_in_tf2,
                                  TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP)
    from .modeling_tf_roberta import (TFRobertaPreTrainedModel, TFRobertaMainLayer,
                                      TFRobertaModel, TFRobertaForMaskedLM,
                                      TFRobertaForSequenceClassification,
                                      load_roberta_pt_weights_in_tf2,
                                      TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
    from .modeling_tf_distilbert import (TFDistilBertPreTrainedModel, TFDistilBertMainLayer,
                                         TFDistilBertModel, TFDistilBertForMaskedLM,
                                         TFDistilBertForSequenceClassification,
                                         TFDistilBertForQuestionAnswering,
                                         load_distilbert_pt_weights_in_tf2,
                                         TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
 # TF 2.0 <=> PyTorch conversion utilities
 if is_tf_available() and is_torch_available():
    from .modeling_tf_pytorch_utils import (convert_tf_weight_name_to_pt_weight_name,
                                            load_pytorch_checkpoint_in_tf2_model,
                                            load_pytorch_weights_in_tf2_model,
                                            load_pytorch_model_in_tf2_model,
                                            load_tf2_checkpoint_in_pytorch_model,
                                            load_tf2_weights_in_pytorch_model,
                                            load_tf2_model_in_pytorch_model)
 if not is_tf_available() and not is_torch_available():
    logger.warning("Neither PyTorch nor TensorFlow >= 2.0 have been found."
                   "Models won't be available and only tokenizers, configuration"
                   "and file/data utilities can be used.")
--- a/pytorch_transformers/main.py
+++ b/pytorch_transformers/main.py
@@ -3,36 +3,37 @@ def main():
    import sys
    if (len(sys.argv) < 4 or len(sys.argv) > 6) or sys.argv[1] not in ["bert", "gpt", "transfo_xl", "gpt2", "xlnet", "xlm"]:
        print(
-        "Should be used as one of: \n"
+        "This command line utility let you convert original (author released) model checkpoint to pytorch.\n"
-        ">> pytorch_transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT, \n"
+        "It should be used as one of: \n"
-        ">> pytorch_transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG], \n"
+        ">> transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT, \n"
-        ">> pytorch_transformers transfo_xl TF_CHECKPOINT_OR_DATASET PYTORCH_DUMP_OUTPUT [TF_CONFIG] or \n"
+        ">> transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG], \n"
-        ">> pytorch_transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [GPT2_CONFIG] or \n"
+        ">> transformers transfo_xl TF_CHECKPOINT_OR_DATASET PYTORCH_DUMP_OUTPUT [TF_CONFIG] or \n"
-        ">> pytorch_transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME] or \n"
+        ">> transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [GPT2_CONFIG] or \n"
-        ">> pytorch_transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT")
+        ">> transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME] or \n"
        ">> transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT")
    else:
        if sys.argv[1] == "bert":
            try:
-                from .convert_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch
+                from .convert_bert_original_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch
            except ImportError:
-                print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
+                print("transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
                    "In that case, it requires TensorFlow to be installed. Please see "
                    "https://www.tensorflow.org/install/ for installation instructions.")
                raise
            if len(sys.argv) != 5:
                # pylint: disable=line-too-long
-                print("Should be used as `pytorch_transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`")
+                print("Should be used as `transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`")
            else:
                PYTORCH_DUMP_OUTPUT = sys.argv.pop()
                TF_CONFIG = sys.argv.pop()
                TF_CHECKPOINT = sys.argv.pop()
                convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT)
        elif sys.argv[1] == "gpt":
-            from .convert_openai_checkpoint_to_pytorch import convert_openai_checkpoint_to_pytorch
+            from .convert_openai_original_tf_checkpoint_to_pytorch import convert_openai_checkpoint_to_pytorch
            if len(sys.argv) < 4 or len(sys.argv) > 5:
                # pylint: disable=line-too-long
-                print("Should be used as `pytorch_transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`")
+                print("Should be used as `transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`")
            else:
                OPENAI_GPT_CHECKPOINT_FOLDER_PATH = sys.argv[2]
                PYTORCH_DUMP_OUTPUT = sys.argv[3]
@@ -45,15 +46,15 @@ def main():
                                                    PYTORCH_DUMP_OUTPUT)
        elif sys.argv[1] == "transfo_xl":
            try:
-                from .convert_transfo_xl_checkpoint_to_pytorch import convert_transfo_xl_checkpoint_to_pytorch
+                from .convert_transfo_xl_original_tf_checkpoint_to_pytorch import convert_transfo_xl_checkpoint_to_pytorch
            except ImportError:
-                print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
+                print("transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
                    "In that case, it requires TensorFlow to be installed. Please see "
                    "https://www.tensorflow.org/install/ for installation instructions.")
                raise
            if len(sys.argv) < 4 or len(sys.argv) > 5:
                # pylint: disable=line-too-long
-                print("Should be used as `pytorch_transformers transfo_xl TF_CHECKPOINT/TF_DATASET_FILE PYTORCH_DUMP_OUTPUT [TF_CONFIG]`")
+                print("Should be used as `transformers transfo_xl TF_CHECKPOINT/TF_DATASET_FILE PYTORCH_DUMP_OUTPUT [TF_CONFIG]`")
            else:
                if 'ckpt' in sys.argv[2].lower():
                    TF_CHECKPOINT = sys.argv[2]
@@ -69,16 +70,16 @@ def main():
                convert_transfo_xl_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT, TF_DATASET_FILE)
        elif sys.argv[1] == "gpt2":
            try:
-                from .convert_gpt2_checkpoint_to_pytorch import convert_gpt2_checkpoint_to_pytorch
+                from .convert_gpt2_original_tf_checkpoint_to_pytorch import convert_gpt2_checkpoint_to_pytorch
            except ImportError:
-                print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
+                print("transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
                    "In that case, it requires TensorFlow to be installed. Please see "
                    "https://www.tensorflow.org/install/ for installation instructions.")
                raise
            if len(sys.argv) < 4 or len(sys.argv) > 5:
                # pylint: disable=line-too-long
-                print("Should be used as `pytorch_transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [TF_CONFIG]`")
+                print("Should be used as `transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [TF_CONFIG]`")
            else:
                TF_CHECKPOINT = sys.argv[2]
                PYTORCH_DUMP_OUTPUT = sys.argv[3]
@@ -89,16 +90,16 @@ def main():
                convert_gpt2_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT)
        elif sys.argv[1] == "xlnet":
            try:
-                from .convert_xlnet_checkpoint_to_pytorch import convert_xlnet_checkpoint_to_pytorch
+                from .convert_xlnet_original_tf_checkpoint_to_pytorch import convert_xlnet_checkpoint_to_pytorch
            except ImportError:
-                print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
+                print("transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
                    "In that case, it requires TensorFlow to be installed. Please see "
                    "https://www.tensorflow.org/install/ for installation instructions.")
                raise
            if len(sys.argv) < 5 or len(sys.argv) > 6:
                # pylint: disable=line-too-long
-                print("Should be used as `pytorch_transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME]`")
+                print("Should be used as `transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME]`")
            else:
                TF_CHECKPOINT = sys.argv[2]
                TF_CONFIG = sys.argv[3]
@@ -113,11 +114,11 @@ def main():
                                                    PYTORCH_DUMP_OUTPUT,
                                                    FINETUNING_TASK)
        elif sys.argv[1] == "xlm":
-            from .convert_xlm_checkpoint_to_pytorch import convert_xlm_checkpoint_to_pytorch
+            from .convert_xlm_original_pytorch_checkpoint_to_pytorch import convert_xlm_checkpoint_to_pytorch
            if len(sys.argv) != 4:
                # pylint: disable=line-too-long
-                print("Should be used as `pytorch_transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT`")
+                print("Should be used as `transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT`")
            else:
                XLM_CHECKPOINT_PATH = sys.argv[2]
                PYTORCH_DUMP_OUTPUT = sys.argv[3]
--- a/pytorch_transformers/configuration_auto.py
+++ b/pytorch_transformers/configuration_auto.py
@@ -31,7 +31,7 @@ logger = logging.getLogger(__name__)
 class AutoConfig(object):
-    r""":class:`~pytorch_transformers.AutoConfig` is a generic configuration class
+    r""":class:`~transformers.AutoConfig` is a generic configuration class
        that will be instantiated as one of the configuration classes of the library
        when created with the `AutoConfig.from_pretrained(pretrained_model_name_or_path)`
        class method.
@@ -76,7 +76,7 @@ class AutoConfig(object):
            pretrained_model_name_or_path: either:
                - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a path to a `directory` containing a configuration file saved using the :func:`~pytorch_transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
+                - a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
                - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
            cache_dir: (`optional`) string:
--- a/pytorch_transformers/configuration_bert.py
+++ b/pytorch_transformers/configuration_bert.py
@@ -45,7 +45,7 @@ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 class BertConfig(PretrainedConfig):
    r"""
-        :class:`~pytorch_transformers.BertConfig` is the configuration class to store the configuration of a
+        :class:`~transformers.BertConfig` is the configuration class to store the configuration of a
        `BertModel`.
@@ -58,7 +58,7 @@ class BertConfig(PretrainedConfig):
            intermediate_size: The size of the "intermediate" (i.e., feed-forward)
                layer in the Transformer encoder.
            hidden_act: The non-linear activation function (function or string) in the
-                encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
+                encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported.
            hidden_dropout_prob: The dropout probabilitiy for all fully connected
                layers in the embeddings, encoder, and pooler.
            attention_probs_dropout_prob: The dropout ratio for the attention
--- a/pytorch_transformers/configuration_distilbert.py
+++ b/pytorch_transformers/configuration_distilbert.py
@@ -37,7 +37,7 @@ class DistilBertConfig(PretrainedConfig):
    def __init__(self,
                 vocab_size_or_config_json_file=30522,
                 max_position_embeddings=512,
-                 sinusoidal_pos_embds=True,
+                 sinusoidal_pos_embds=False,
                 n_layers=6,
                 n_heads=12,
                 dim=768,
--- a/pytorch_transformers/configuration_gpt2.py
+++ b/pytorch_transformers/configuration_gpt2.py
@@ -28,7 +28,8 @@ logger = logging.getLogger(__name__)
 GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json",
                                      "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json",
-                                      "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-config.json"}
+                                      "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-config.json",
                                      "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-config.json",}
 class GPT2Config(PretrainedConfig):
    """Configuration class to store the configuration of a `GPT2Model`.
--- a/pytorch_transformers/configuration_openai.py
+++ b/pytorch_transformers/configuration_openai.py
@@ -36,7 +36,6 @@ class OpenAIGPTConfig(PretrainedConfig):
    Args:
        vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file.
        n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLF]', ...)
        n_positions: Number of positional embeddings.
        n_ctx: Size of the causal mask (usually same as n_positions).
        n_embd: Dimensionality of the embeddings and hidden states.
--- a/pytorch_transformers/configuration_roberta.py
+++ b/pytorch_transformers/configuration_roberta.py
--- a/pytorch_transformers/configuration_transfo_xl.py
+++ b/pytorch_transformers/configuration_transfo_xl.py
@@ -95,10 +95,43 @@ class TransfoXLConfig(PretrainedConfig):
                 init_range=0.01,
                 proj_init_std=0.01,
                 init_std=0.02,
                 layer_norm_epsilon=1e-5,
                 **kwargs):
        """Constructs TransfoXLConfig.
        """
        super(TransfoXLConfig, self).__init__(**kwargs)
        self.n_token = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, int) else -1
        self.cutoffs = []
        self.cutoffs.extend(cutoffs)
        self.tie_weight = tie_weight
        if proj_share_all_but_first:
            self.tie_projs = [False] + [True] * len(self.cutoffs)
        else:
            self.tie_projs = [False] + [False] * len(self.cutoffs)
        self.d_model = d_model
        self.d_embed = d_embed
        self.d_head = d_head
        self.d_inner = d_inner
        self.div_val = div_val
        self.pre_lnorm = pre_lnorm
        self.n_layer = n_layer
        self.n_head = n_head
        self.tgt_len = tgt_len
        self.ext_len = ext_len
        self.mem_len = mem_len
        self.same_length = same_length
        self.attn_type = attn_type
        self.clamp_len = clamp_len
        self.sample_softmax = sample_softmax
        self.adaptive = adaptive
        self.dropout = dropout
        self.dropatt = dropatt
        self.untie_r = untie_r
        self.init = init
        self.init_range = init_range
        self.proj_init_std = proj_init_std
        self.init_std = init_std
        self.layer_norm_epsilon = layer_norm_epsilon
        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
                        and isinstance(vocab_size_or_config_json_file, unicode)):
@@ -106,39 +139,7 @@ class TransfoXLConfig(PretrainedConfig):
                json_config = json.loads(reader.read())
            for key, value in json_config.items():
                self.__dict__[key] = value
-        elif isinstance(vocab_size_or_config_json_file, int):
+        elif not isinstance(vocab_size_or_config_json_file, int):
            self.n_token = vocab_size_or_config_json_file
            self.cutoffs = []
            self.cutoffs.extend(cutoffs)
            self.tie_weight = tie_weight
            if proj_share_all_but_first:
                self.tie_projs = [False] + [True] * len(self.cutoffs)
            else:
                self.tie_projs = [False] + [False] * len(self.cutoffs)
            self.d_model = d_model
            self.d_embed = d_embed
            self.d_head = d_head
            self.d_inner = d_inner
            self.div_val = div_val
            self.pre_lnorm = pre_lnorm
            self.n_layer = n_layer
            self.n_head = n_head
            self.tgt_len = tgt_len
            self.ext_len = ext_len
            self.mem_len = mem_len
            self.same_length = same_length
            self.attn_type = attn_type
            self.clamp_len = clamp_len
            self.sample_softmax = sample_softmax
            self.adaptive = adaptive
            self.dropout = dropout
            self.dropatt = dropatt
            self.untie_r = untie_r
            self.init = init
            self.init_range = init_range
            self.proj_init_std = proj_init_std
            self.init_std = init_std
        else:
            raise ValueError("First argument must be either a vocabulary size (int)"
                             " or the path to a pretrained model config file (str)")
--- a/pytorch_transformers/configuration_utils.py
+++ b/pytorch_transformers/configuration_utils.py
@@ -54,11 +54,12 @@ class PretrainedConfig(object):
        self.output_attentions = kwargs.pop('output_attentions', False)
        self.output_hidden_states = kwargs.pop('output_hidden_states', False)
        self.torchscript = kwargs.pop('torchscript', False)
        self.use_bfloat16 = kwargs.pop('use_bfloat16', False)
        self.pruned_heads = kwargs.pop('pruned_heads', {})
    def save_pretrained(self, save_directory):
        """ Save a configuration object to the directory `save_directory`, so that it
-            can be re-loaded using the :func:`~pytorch_transformers.PretrainedConfig.from_pretrained` class method.
+            can be re-loaded using the :func:`~transformers.PretrainedConfig.from_pretrained` class method.
        """
        assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved"
@@ -66,16 +67,17 @@ class PretrainedConfig(object):
        output_config_file = os.path.join(save_directory, CONFIG_NAME)
        self.to_json_file(output_config_file)
        logger.info("Configuration saved in {}".format(output_config_file))
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
-        r""" Instantiate a :class:`~pytorch_transformers.PretrainedConfig` (or a derived class) from a pre-trained model configuration.
+        r""" Instantiate a :class:`~transformers.PretrainedConfig` (or a derived class) from a pre-trained model configuration.
        Parameters:
            pretrained_model_name_or_path: either:
                - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a path to a `directory` containing a configuration file saved using the :func:`~pytorch_transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
+                - a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
                - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
            cache_dir: (`optional`) string:
@@ -174,7 +176,7 @@ class PretrainedConfig(object):
        """Constructs a `Config` from a Python dictionary of parameters."""
        config = cls(vocab_size_or_config_json_file=-1)
        for key, value in json_object.items():
-            config.__dict__[key] = value
+            setattr(config, key, value)
        return config
    @classmethod
--- a/pytorch_transformers/configuration_xlm.py
+++ b/pytorch_transformers/configuration_xlm.py
@@ -56,8 +56,6 @@ class XLMConfig(PretrainedConfig):
        dropout: The dropout probabilitiy for all fully connected
            layers in the embeddings, encoder, and pooler.
        dropatt: The dropout ratio for the attention
            probabilities.
        max_position_embeddings: The maximum sequence length that this model might
            ever be used with. Typically set this to something large just in case
            (e.g., 512 or 1024 or 2048).
@@ -66,7 +64,6 @@ class XLMConfig(PretrainedConfig):
        layer_norm_eps: The epsilon used by LayerNorm.
        dropout: float, dropout rate.
        dropatt: float, dropout rate on attention probabilities.
        init: str, the initialization scheme, either "normal" or "uniform".
        init_range: float, initialize the parameters with a uniform distribution
            in [-init_range, init_range]. Only effective when init="uniform".
--- a/pytorch_transformers/configuration_xlnet.py
+++ b/pytorch_transformers/configuration_xlnet.py
@@ -49,14 +49,11 @@ class XLNetConfig(PretrainedConfig):
        dropout: The dropout probabilitiy for all fully connected
            layers in the embeddings, encoder, and pooler.
        dropatt: The dropout ratio for the attention
            probabilities.
        initializer_range: The sttdev of the truncated_normal_initializer for
            initializing all weight matrices.
        layer_norm_eps: The epsilon used by LayerNorm.
        dropout: float, dropout rate.
        dropatt: float, dropout rate on attention probabilities.
        init: str, the initialization scheme, either "normal" or "uniform".
        init_range: float, initialize the parameters with a uniform distribution
            in [-init_range, init_range]. Only effective when init="uniform".
@@ -80,6 +77,7 @@ class XLNetConfig(PretrainedConfig):
                 n_layer=24,
                 n_head=16,
                 d_inner=4096,
                 max_position_embeddings=512,
                 ff_activation="gelu",
                 untie_r=True,
                 attn_type="bi",
@@ -112,7 +110,7 @@ class XLNetConfig(PretrainedConfig):
            with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
                json_config = json.loads(reader.read())
            for key, value in json_config.items():
-                self.__dict__[key] = value
+                setattr(config, key, value)
        elif isinstance(vocab_size_or_config_json_file, int):
            self.n_token = vocab_size_or_config_json_file
            self.d_model = d_model
--- a/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py
+++ b/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py
@@ -21,7 +21,7 @@ from __future__ import print_function
 import argparse
 import torch
-from pytorch_transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert
+from transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert
 import logging
 logging.basicConfig(level=logging.INFO)
--- a/transformers/convert_bert_pytorch_checkpoint_to_original_tf.py
+++ b/transformers/convert_bert_pytorch_checkpoint_to_original_tf.py
@@ -20,7 +20,7 @@ import argparse
 import torch
 import numpy as np
 import tensorflow as tf
-from pytorch_transformers import BertModel
+from transformers import BertModel
 def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str, model_name:str):
--- a/transformers/convert_gpt2_original_tf_checkpoint_to_pytorch.py
+++ b/transformers/convert_gpt2_original_tf_checkpoint_to_pytorch.py
@@ -21,7 +21,7 @@ from io import open
 import torch
-from pytorch_transformers import (CONFIG_NAME, WEIGHTS_NAME,
+from transformers import (CONFIG_NAME, WEIGHTS_NAME,
                                                     GPT2Config,
                                                     GPT2Model,
                                                     load_tf_weights_in_gpt2)
--- a/transformers/convert_openai_original_tf_checkpoint_to_pytorch.py
+++ b/transformers/convert_openai_original_tf_checkpoint_to_pytorch.py
@@ -21,7 +21,7 @@ from io import open
 import torch
-from pytorch_transformers import (CONFIG_NAME, WEIGHTS_NAME,
+from transformers import (CONFIG_NAME, WEIGHTS_NAME,
                                                     OpenAIGPTConfig,
                                                     OpenAIGPTModel,
                                                     load_tf_weights_in_openai_gpt)
--- a/transformers/convert_pytorch_checkpoint_to_tf2.py
+++ b/transformers/convert_pytorch_checkpoint_to_tf2.py
@@ -0,0 +1,234 @@
 # coding=utf-8
 # Copyright 2018 The HuggingFace Inc. team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """ Convert pytorch checkpoints to TensorFlow """
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 import os
 import argparse
 import tensorflow as tf
 from transformers import is_torch_available, cached_path
 from transformers import (BertConfig, TFBertForPreTraining, TFBertForQuestionAnswering, TFBertForSequenceClassification, load_bert_pt_weights_in_tf2, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
                                  GPT2Config, TFGPT2LMHeadModel, load_gpt2_pt_weights_in_tf2, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,
                                  XLNetConfig, TFXLNetLMHeadModel, load_xlnet_pt_weights_in_tf2, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
                                  XLMConfig, TFXLMWithLMHeadModel, load_xlm_pt_weights_in_tf2, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP,
                                  TransfoXLConfig, TFTransfoXLLMHeadModel, load_transfo_xl_pt_weights_in_tf2, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP,
                                  OpenAIGPTConfig, TFOpenAIGPTLMHeadModel, load_openai_gpt_pt_weights_in_tf2, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP,
                                  RobertaConfig, TFRobertaForMaskedLM, TFRobertaForSequenceClassification, load_roberta_pt_weights_in_tf2, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
                                  DistilBertConfig, TFDistilBertForMaskedLM, TFDistilBertForQuestionAnswering, load_distilbert_pt_weights_in_tf2, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP)
 if is_torch_available():
    import torch
    import numpy as np
    from transformers import (BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
                                      GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
                                      XLNetLMHeadModel, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
                                      XLMWithLMHeadModel, XLM_PRETRAINED_MODEL_ARCHIVE_MAP,
                                      TransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP,
                                      OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
                                      RobertaForMaskedLM, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
                                      DistilBertForMaskedLM, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
 else:
    (BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
    GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
    XLNetLMHeadModel, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
    XLMWithLMHeadModel, XLM_PRETRAINED_MODEL_ARCHIVE_MAP,
    TransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP,
    OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
    RobertaForMaskedLM, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
    DistilBertForMaskedLM, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,) = (
        None, None, None, None,
        None, None,
        None, None,
        None, None,
        None, None,
        None, None,
        None, None, None,
        None, None, None,)
 import logging
 logging.basicConfig(level=logging.INFO)
 MODEL_CLASSES = {
    'bert': (BertConfig, TFBertForPreTraining, load_bert_pt_weights_in_tf2, BertForPreTraining, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
    'bert-large-uncased-whole-word-masking-finetuned-squad': (BertConfig, TFBertForQuestionAnswering, load_bert_pt_weights_in_tf2, BertForQuestionAnswering, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
    'bert-large-cased-whole-word-masking-finetuned-squad': (BertConfig, TFBertForQuestionAnswering, load_bert_pt_weights_in_tf2, BertForQuestionAnswering, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
    'bert-base-cased-finetuned-mrpc': (BertConfig, TFBertForSequenceClassification, load_bert_pt_weights_in_tf2, BertForSequenceClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
    'gpt2': (GPT2Config, TFGPT2LMHeadModel, load_gpt2_pt_weights_in_tf2, GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP),
    'xlnet': (XLNetConfig, TFXLNetLMHeadModel, load_xlnet_pt_weights_in_tf2, XLNetLMHeadModel, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP),
    'xlm': (XLMConfig, TFXLMWithLMHeadModel, load_xlm_pt_weights_in_tf2, XLMWithLMHeadModel, XLM_PRETRAINED_MODEL_ARCHIVE_MAP, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP),
    'transfo-xl': (TransfoXLConfig, TFTransfoXLLMHeadModel, load_transfo_xl_pt_weights_in_tf2, TransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP),
    'openai-gpt': (OpenAIGPTConfig, TFOpenAIGPTLMHeadModel, load_openai_gpt_pt_weights_in_tf2, OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP),
    'roberta': (RobertaConfig, TFRobertaForMaskedLM, load_roberta_pt_weights_in_tf2, RobertaForMaskedLM, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP),
    'roberta-large-mnli': (RobertaConfig, TFRobertaForSequenceClassification, load_roberta_pt_weights_in_tf2, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP),
    'distilbert': (DistilBertConfig, TFDistilBertForMaskedLM, load_distilbert_pt_weights_in_tf2, DistilBertForMaskedLM, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
    'distilbert-base-uncased-distilled-squad': (DistilBertConfig, TFDistilBertForQuestionAnswering, load_distilbert_pt_weights_in_tf2, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
 }
 def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file, tf_dump_path, compare_with_pt_model=False, use_cached_models=True):
    if model_type not in MODEL_CLASSES:
        raise ValueError("Unrecognized model type, should be one of {}.".format(list(MODEL_CLASSES.keys())))
    config_class, model_class, loading_fct, pt_model_class, aws_model_maps, aws_config_map = MODEL_CLASSES[model_type]
    # Initialise TF model
    if config_file in aws_config_map:
        config_file = cached_path(aws_config_map[config_file], force_download=not use_cached_models)
    config = config_class.from_json_file(config_file)
    config.output_hidden_states = True
    config.output_attentions = True
    print("Building TensorFlow model from configuration: {}".format(str(config)))
    tf_model = model_class(config)
    # Load weights from tf checkpoint
    if pytorch_checkpoint_path in aws_model_maps:
        pytorch_checkpoint_path = cached_path(aws_model_maps[pytorch_checkpoint_path], force_download=not use_cached_models)
    tf_model = loading_fct(tf_model, pytorch_checkpoint_path)
    if compare_with_pt_model:
        inputs_list = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
        tf_inputs = tf.constant(inputs_list)
        tfo = tf_model(tf_inputs, training=False)  # build the network
        pt_model = pt_model_class.from_pretrained(None,
                                                  config=config,
                                                  state_dict=torch.load(pytorch_checkpoint_path,
                                                                        map_location='cpu'))
        pt_inputs = torch.tensor(inputs_list)
        with torch.no_grad():
            pto = pt_model(pt_inputs)
        np_pt = pto[0].detach().numpy()
        np_tf = tfo[0].numpy()
        diff = np.amax(np.abs(np_pt - np_tf))
        print("Max absolute difference between models outputs {}".format(diff))
        assert diff <= 2e-2, "Error, model absolute difference is >2e-2"
    # Save pytorch-model
    print("Save TensorFlow model to {}".format(tf_dump_path))
    tf_model.save_weights(tf_dump_path, save_format='h5')
 def convert_all_pt_checkpoints_to_tf(args_model_type, tf_dump_path, model_shortcut_names_or_path=None, config_shortcut_names_or_path=None,
                                     compare_with_pt_model=False, use_cached_models=False, only_convert_finetuned_models=False):
    assert os.path.isdir(args.tf_dump_path), "--tf_dump_path should be a directory"
    if args_model_type is None:
        model_types = list(MODEL_CLASSES.keys())
    else:
        model_types = [args_model_type]
    for j, model_type in enumerate(model_types, start=1):
        print("=" * 100)
        print(" Converting model type {}/{}: {}".format(j, len(model_types), model_type))
        print("=" * 100)
        if model_type not in MODEL_CLASSES:
            raise ValueError("Unrecognized model type {}, should be one of {}.".format(model_type, list(MODEL_CLASSES.keys())))
        config_class, model_class, loading_fct, pt_model_class, aws_model_maps, aws_config_map = MODEL_CLASSES[model_type]
        if model_shortcut_names_or_path is None:
            model_shortcut_names_or_path = list(aws_model_maps.keys())
        if config_shortcut_names_or_path is None:
            config_shortcut_names_or_path = model_shortcut_names_or_path
        for i, (model_shortcut_name, config_shortcut_name) in enumerate(
                zip(model_shortcut_names_or_path, config_shortcut_names_or_path), start=1):
            print("-" * 100)
            if '-squad' in model_shortcut_name or '-mrpc' in model_shortcut_name or '-mnli' in model_shortcut_name:
                if not only_convert_finetuned_models:
                    print("    Skipping finetuned checkpoint {}".format(model_shortcut_name))
                    continue
                model_type = model_shortcut_name
            elif only_convert_finetuned_models:
                print("    Skipping not finetuned checkpoint {}".format(model_shortcut_name))
                continue
            print("    Converting checkpoint {}/{}: {} - model_type {}".format(i, len(aws_config_map), model_shortcut_name, model_type))
            print("-" * 100)
            if config_shortcut_name in aws_config_map:
                config_file = cached_path(aws_config_map[config_shortcut_name], force_download=not use_cached_models)
            else:
                config_file = cached_path(config_shortcut_name, force_download=not use_cached_models)
            if model_shortcut_name in aws_model_maps:
                model_file = cached_path(aws_model_maps[model_shortcut_name], force_download=not use_cached_models)
            else:
                model_file = cached_path(model_shortcut_name, force_download=not use_cached_models)
            convert_pt_checkpoint_to_tf(model_type,
                                        model_file,
                                        config_file,
                                        os.path.join(tf_dump_path, model_shortcut_name + '-tf_model.h5'),
                                        compare_with_pt_model=compare_with_pt_model)
            os.remove(config_file)
            os.remove(model_file)
 if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    ## Required parameters
    parser.add_argument("--tf_dump_path",
                        default = None,
                        type = str,
                        required = True,
                        help = "Path to the output Tensorflow dump file.")
    parser.add_argument("--model_type",
                        default = None,
                        type = str,
                        help = "Model type selected in the list of {}. If not given, will download and convert all the models from AWS.".format(list(MODEL_CLASSES.keys())))
    parser.add_argument("--pytorch_checkpoint_path",
                        default = None,
                        type = str,
                        help = "Path to the PyTorch checkpoint path or shortcut name to download from AWS. "
                               "If not given, will download and convert all the checkpoints from AWS.")
    parser.add_argument("--config_file",
                        default = None,
                        type = str,
                        help = "The config json file corresponding to the pre-trained model. \n"
                               "This specifies the model architecture. If not given and "
                               "--pytorch_checkpoint_path is not given or is a shortcut name"
                               "use the configuration associated to the shortcut name on the AWS")
    parser.add_argument("--compare_with_pt_model",
                        action='store_true',
                        help = "Compare Tensorflow and PyTorch model predictions.")
    parser.add_argument("--use_cached_models",
                        action='store_true',
                        help = "Use cached models if possible instead of updating to latest checkpoint versions.")
    parser.add_argument("--only_convert_finetuned_models",
                        action='store_true',
                        help = "Only convert finetuned models.")
    args = parser.parse_args()
    # if args.pytorch_checkpoint_path is not None:
    #     convert_pt_checkpoint_to_tf(args.model_type.lower(),
    #                                 args.pytorch_checkpoint_path,
    #                                 args.config_file if args.config_file is not None else args.pytorch_checkpoint_path,
    #                                 args.tf_dump_path,
    #                                 compare_with_pt_model=args.compare_with_pt_model,
    #                                 use_cached_models=args.use_cached_models)
    # else:
    convert_all_pt_checkpoints_to_tf(args.model_type.lower() if args.model_type is not None else None,
                                        args.tf_dump_path,
                                        model_shortcut_names_or_path=[args.pytorch_checkpoint_path] if args.pytorch_checkpoint_path is not None else None,
                                        config_shortcut_names_or_path=[args.config_file] if args.config_file is not None else None,
                                        compare_with_pt_model=args.compare_with_pt_model,
                                        use_cached_models=args.use_cached_models,
                                        only_convert_finetuned_models=args.only_convert_finetuned_models)
--- a/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py
+++ b/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py
@@ -23,12 +23,12 @@ import torch
 from fairseq.models.roberta import RobertaModel as FairseqRobertaModel
 from fairseq.modules import TransformerSentenceEncoderLayer
-from pytorch_transformers import (BertConfig, BertEncoder,
+from transformers import (BertConfig, BertEncoder,
                                                BertIntermediate, BertLayer,
                                                BertModel, BertOutput,
                                                BertSelfAttention,
                                                BertSelfOutput)
-from pytorch_transformers import (RobertaEmbeddings,
+from transformers import (RobertaEmbeddings,
                                                   RobertaForMaskedLM,
                                                   RobertaForSequenceClassification,
                                                   RobertaModel)
--- a/transformers/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py
+++ b/transformers/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py
@@ -23,12 +23,12 @@ from io import open
 import torch
-import pytorch_transformers.tokenization_transfo_xl as data_utils
+import transformers.tokenization_transfo_xl as data_utils
-from pytorch_transformers import CONFIG_NAME, WEIGHTS_NAME
+from transformers import CONFIG_NAME, WEIGHTS_NAME
-from pytorch_transformers import (TransfoXLConfig, TransfoXLLMHeadModel,
+from transformers import (TransfoXLConfig, TransfoXLLMHeadModel,
                                                      load_tf_weights_in_transfo_xl)
-from pytorch_transformers.tokenization_transfo_xl import (CORPUS_NAME, VOCAB_FILES_NAMES)
+from transformers.tokenization_transfo_xl import (CORPUS_NAME, VOCAB_FILES_NAMES)
 if sys.version_info[0] == 2:
    import cPickle as pickle
--- a/transformers/convert_xlm_original_pytorch_checkpoint_to_pytorch.py
+++ b/transformers/convert_xlm_original_pytorch_checkpoint_to_pytorch.py
@@ -23,8 +23,8 @@ from io import open
 import torch
 import numpy
-from pytorch_transformers import CONFIG_NAME, WEIGHTS_NAME
+from transformers import CONFIG_NAME, WEIGHTS_NAME
-from pytorch_transformers.tokenization_xlm import VOCAB_FILES_NAMES
+from transformers.tokenization_xlm import VOCAB_FILES_NAMES
 import logging
 logging.basicConfig(level=logging.INFO)
@@ -33,7 +33,15 @@ def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_p
    # Load checkpoint
    chkpt = torch.load(xlm_checkpoint_path, map_location='cpu')
-    model = chkpt['model']
+    state_dict = chkpt['model']
    # We have the base model one level deeper than the original XLM repository
    two_levels_state_dict = {}
    for k, v in state_dict.items():
        if 'pred_layer' in k:
            two_levels_state_dict[k] = v
        else:
            two_levels_state_dict['transformer.' + k] = v
    config = chkpt['params']
    config = dict((n, v) for n, v in config.items() if not isinstance(v, (torch.FloatTensor, numpy.ndarray)))
@@ -47,7 +55,7 @@ def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_p
    pytorch_vocab_dump_path = pytorch_dump_folder_path + '/' +  VOCAB_FILES_NAMES['vocab_file']
    print("Save PyTorch model to {}".format(pytorch_weights_dump_path))
-    torch.save(model, pytorch_weights_dump_path)
+    torch.save(two_levels_state_dict, pytorch_weights_dump_path)
    print("Save configuration file to {}".format(pytorch_config_dump_path))
    with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
--- a/transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py
+++ b/transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py
@@ -22,7 +22,7 @@ import os
 import argparse
 import torch
-from pytorch_transformers import (CONFIG_NAME, WEIGHTS_NAME,
+from transformers import (CONFIG_NAME, WEIGHTS_NAME,
                                                    XLNetConfig,
                                                    XLNetLMHeadModel, XLNetForQuestionAnswering,
                                                    XLNetForSequenceClassification,
--- a/transformers/data/init.py
+++ b/transformers/data/init.py
@@ -0,0 +1,6 @@
 from .processors import InputExample, InputFeatures, DataProcessor
 from .processors import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
 from .metrics import is_sklearn_available
 if is_sklearn_available():
    from .metrics import glue_compute_metrics
--- a/transformers/data/metrics/init.py
+++ b/transformers/data/metrics/init.py
@@ -0,0 +1,83 @@
 # coding=utf-8
 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
 # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import csv
 import sys
 import logging
 logger = logging.getLogger(__name__)
 try:
    from scipy.stats import pearsonr, spearmanr
    from sklearn.metrics import matthews_corrcoef, f1_score
    _has_sklearn = True
 except (AttributeError, ImportError) as e:
    logger.warning("To use data.metrics please install scikit-learn. See https://scikit-learn.org/stable/index.html")
    _has_sklearn = False
 def is_sklearn_available():
    return _has_sklearn
 if _has_sklearn:
    def simple_accuracy(preds, labels):
        return (preds == labels).mean()
    def acc_and_f1(preds, labels):
        acc = simple_accuracy(preds, labels)
        f1 = f1_score(y_true=labels, y_pred=preds)
        return {
            "acc": acc,
            "f1": f1,
            "acc_and_f1": (acc + f1) / 2,
        }
    def pearson_and_spearman(preds, labels):
        pearson_corr = pearsonr(preds, labels)[0]
        spearman_corr = spearmanr(preds, labels)[0]
        return {
            "pearson": pearson_corr,
            "spearmanr": spearman_corr,
            "corr": (pearson_corr + spearman_corr) / 2,
        }
    def glue_compute_metrics(task_name, preds, labels):
        assert len(preds) == len(labels)
        if task_name == "cola":
            return {"mcc": matthews_corrcoef(labels, preds)}
        elif task_name == "sst-2":
            return {"acc": simple_accuracy(preds, labels)}
        elif task_name == "mrpc":
            return acc_and_f1(preds, labels)
        elif task_name == "sts-b":
            return pearson_and_spearman(preds, labels)
        elif task_name == "qqp":
            return acc_and_f1(preds, labels)
        elif task_name == "mnli":
            return {"acc": simple_accuracy(preds, labels)}
        elif task_name == "mnli-mm":
            return {"acc": simple_accuracy(preds, labels)}
        elif task_name == "qnli":
            return {"acc": simple_accuracy(preds, labels)}
        elif task_name == "rte":
            return {"acc": simple_accuracy(preds, labels)}
        elif task_name == "wnli":
            return {"acc": simple_accuracy(preds, labels)}
        else:
            raise KeyError(task_name)
--- a/transformers/data/processors/init.py
+++ b/transformers/data/processors/init.py
@@ -0,0 +1,3 @@
 from .utils import InputExample, InputFeatures, DataProcessor
 from .glue import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
--- a/transformers/data/processors/glue.py
+++ b/transformers/data/processors/glue.py
@@ -13,84 +13,154 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" BERT classification fine-tuning: utilities to work with GLUE tasks """
+""" GLUE processors and helpers """
 from __future__ import absolute_import, division, print_function
 import csv
 import logging
 import os
 import sys
 from io import open
-from scipy.stats import pearsonr, spearmanr
+from .utils import DataProcessor, InputExample, InputFeatures
-from sklearn.metrics import matthews_corrcoef, f1_score
+from ...file_utils import is_tf_available
 if is_tf_available():
    import tensorflow as tf
 logger = logging.getLogger(__name__)
-class InputExample(object):
+def glue_convert_examples_to_features(examples, tokenizer,
-    """A single training/test example for simple sequence classification."""
+                                      max_length=512,
                                      task=None,
                                      label_list=None,
                                      output_mode=None,
                                      pad_on_left=False,
                                      pad_token=0,
                                      pad_token_segment_id=0,
                                      mask_padding_with_zero=True):
    """
    Loads a data file into a list of ``InputFeatures``
-    def __init__(self, guid, text_a, text_b=None, label=None):
+    Args:
-        """Constructs a InputExample.
+        examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples.
        tokenizer: Instance of a tokenizer that will tokenize the examples
        max_length: Maximum example length
        task: GLUE task
        label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method
        output_mode: String indicating the output mode. Either ``regression`` or ``classification``
        pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default)
        pad_token: Padding token
        pad_token_segment_id: The segment ID for the padding token (It is usually 0, but can vary such as for XLNet where it is 4)
        mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values
            and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for
            actual values)
-        Args:
+    Returns:
-            guid: Unique id for the example.
+        If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset``
-            text_a: string. The untokenized text of the first sequence. For single
+        containing the task-specific features. If the input is a list of ``InputExamples``, will return
-            sequence tasks, only this sequence must be specified.
+        a list of task-specific ``InputFeatures`` which can be fed to the model.
            text_b: (Optional) string. The untokenized text of the second sequence.
            Only must be specified for sequence pair tasks.
            label: (Optional) string. The label of the example. This should be
            specified for train and dev examples, but not for test examples.
        """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label
    """
    is_tf_dataset = False
    if is_tf_available() and isinstance(examples, tf.data.Dataset):
        is_tf_dataset = True
-class InputFeatures(object):
+    if task is not None:
-    """A single set of features of data."""
+        processor = glue_processors[task]()
        if label_list is None:
            label_list = processor.get_labels()
            logger.info("Using label list %s for task %s" % (label_list, task))
        if output_mode is None:
            output_mode = glue_output_modes[task]
            logger.info("Using output mode %s for task %s" % (output_mode, task))
-    def __init__(self, input_ids, input_mask, segment_ids, label_id):
+    label_map = {label: i for i, label in enumerate(label_list)}
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id
    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d" % (ex_index))
        if is_tf_dataset:
            example = processor.get_example_from_tensor_dict(example)
-class DataProcessor(object):
+        inputs = tokenizer.encode_plus(
-    """Base class for data converters for sequence classification data sets."""
+            example.text_a,
            example.text_b,
            add_special_tokens=True,
            max_length=max_length,
            truncate_first_sequence=True  # We're truncating the first sequence in priority
        )
        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
-    def get_train_examples(self, data_dir):
+        # The mask has 1 for real tokens and 0 for padding tokens. Only real
-        """Gets a collection of `InputExample`s for the train set."""
+        # tokens are attended to.
-        raise NotImplementedError()
+        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
-    def get_dev_examples(self, data_dir):
+        # Zero-pad up to the sequence length.
-        """Gets a collection of `InputExample`s for the dev set."""
+        padding_length = max_length - len(input_ids)
-        raise NotImplementedError()
+        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
            token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
        else:
            input_ids = input_ids + ([pad_token] * padding_length)
            attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
            token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)
-    def get_labels(self):
+        assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
-        """Gets the list of labels for this data set."""
+        assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length)
-        raise NotImplementedError()
+        assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length)
-    @classmethod
+        if output_mode == "classification":
-    def _read_tsv(cls, input_file, quotechar=None):
+            label = label_map[example.label]
-        """Reads a tab separated value file."""
+        elif output_mode == "regression":
-        with open(input_file, "r", encoding="utf-8-sig") as f:
+            label = float(example.label)
-            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
+        else:
-            lines = []
+            raise KeyError(output_mode)
-            for line in reader:
+
-                if sys.version_info[0] == 2:
+        if ex_index < 5:
-                    line = list(unicode(cell, 'utf-8') for cell in line)
+            logger.info("*** Example ***")
-                lines.append(line)
+            logger.info("guid: %s" % (example.guid))
-            return lines
+            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
            logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))
            logger.info("label: %s (id = %d)" % (example.label, label))
        features.append(
                InputFeatures(input_ids=input_ids,
                              attention_mask=attention_mask,
                              token_type_ids=token_type_ids,
                              label=label))
    if is_tf_available() and is_tf_dataset:
        def gen():
            for ex in features:
                yield  ({'input_ids': ex.input_ids,
                         'attention_mask': ex.attention_mask,
                         'token_type_ids': ex.token_type_ids},
                        ex.label)
        return tf.data.Dataset.from_generator(gen,
            ({'input_ids': tf.int32,
              'attention_mask': tf.int32,
              'token_type_ids': tf.int32},
             tf.int64),
            ({'input_ids': tf.TensorShape([None]),
              'attention_mask': tf.TensorShape([None]),
              'token_type_ids': tf.TensorShape([None])},
             tf.TensorShape([])))
    return features
 class MrpcProcessor(DataProcessor):
    """Processor for the MRPC data set (GLUE version)."""
    def get_example_from_tensor_dict(self, tensor_dict):
        """See base class."""
        return InputExample(tensor_dict['idx'].numpy(),
                            tensor_dict['sentence1'].numpy().decode('utf-8'),
                            tensor_dict['sentence2'].numpy().decode('utf-8'),
                            str(tensor_dict['label'].numpy()))
    def get_train_examples(self, data_dir):
        """See base class."""
        logger.info("LOOKING AT {}".format(os.path.join(data_dir, "train.tsv")))
@@ -124,6 +194,13 @@ class MrpcProcessor(DataProcessor):
 class MnliProcessor(DataProcessor):
    """Processor for the MultiNLI data set (GLUE version)."""
    def get_example_from_tensor_dict(self, tensor_dict):
        """See base class."""
        return InputExample(tensor_dict['idx'].numpy(),
                            tensor_dict['premise'].numpy().decode('utf-8'),
                            tensor_dict['hypothesis'].numpy().decode('utf-8'),
                            str(tensor_dict['label'].numpy()))
    def get_train_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
@@ -167,6 +244,13 @@ class MnliMismatchedProcessor(MnliProcessor):
 class ColaProcessor(DataProcessor):
    """Processor for the CoLA data set (GLUE version)."""
    def get_example_from_tensor_dict(self, tensor_dict):
        """See base class."""
        return InputExample(tensor_dict['idx'].numpy(),
                            tensor_dict['sentence'].numpy().decode('utf-8'),
                            None,
                            str(tensor_dict['label'].numpy()))
    def get_train_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
@@ -196,6 +280,13 @@ class ColaProcessor(DataProcessor):
 class Sst2Processor(DataProcessor):
    """Processor for the SST-2 data set (GLUE version)."""
    def get_example_from_tensor_dict(self, tensor_dict):
        """See base class."""
        return InputExample(tensor_dict['idx'].numpy(),
                            tensor_dict['sentence'].numpy().decode('utf-8'),
                            None,
                            str(tensor_dict['label'].numpy()))
    def get_train_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
@@ -227,6 +318,13 @@ class Sst2Processor(DataProcessor):
 class StsbProcessor(DataProcessor):
    """Processor for the STS-B data set (GLUE version)."""
    def get_example_from_tensor_dict(self, tensor_dict):
        """See base class."""
        return InputExample(tensor_dict['idx'].numpy(),
                            tensor_dict['sentence1'].numpy().decode('utf-8'),
                            tensor_dict['sentence2'].numpy().decode('utf-8'),
                            str(tensor_dict['label'].numpy()))
    def get_train_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
@@ -259,6 +357,13 @@ class StsbProcessor(DataProcessor):
 class QqpProcessor(DataProcessor):
    """Processor for the QQP data set (GLUE version)."""
    def get_example_from_tensor_dict(self, tensor_dict):
        """See base class."""
        return InputExample(tensor_dict['idx'].numpy(),
                            tensor_dict['question1'].numpy().decode('utf-8'),
                            tensor_dict['question2'].numpy().decode('utf-8'),
                            str(tensor_dict['label'].numpy()))
    def get_train_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
@@ -294,6 +399,13 @@ class QqpProcessor(DataProcessor):
 class QnliProcessor(DataProcessor):
    """Processor for the QNLI data set (GLUE version)."""
    def get_example_from_tensor_dict(self, tensor_dict):
        """See base class."""
        return InputExample(tensor_dict['idx'].numpy(),
                            tensor_dict['question'].numpy().decode('utf-8'),
                            tensor_dict['sentence'].numpy().decode('utf-8'),
                            str(tensor_dict['label'].numpy()))
    def get_train_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
@@ -302,7 +414,7 @@ class QnliProcessor(DataProcessor):
    def get_dev_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "dev.tsv")), 
+            self._read_tsv(os.path.join(data_dir, "dev.tsv")),
            "dev_matched")
    def get_labels(self):
@@ -327,6 +439,13 @@ class QnliProcessor(DataProcessor):
 class RteProcessor(DataProcessor):
    """Processor for the RTE data set (GLUE version)."""
    def get_example_from_tensor_dict(self, tensor_dict):
        """See base class."""
        return InputExample(tensor_dict['idx'].numpy(),
                            tensor_dict['sentence1'].numpy().decode('utf-8'),
                            tensor_dict['sentence2'].numpy().decode('utf-8'),
                            str(tensor_dict['label'].numpy()))
    def get_train_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
@@ -359,6 +478,13 @@ class RteProcessor(DataProcessor):
 class WnliProcessor(DataProcessor):
    """Processor for the WNLI data set (GLUE version)."""
    def get_example_from_tensor_dict(self, tensor_dict):
        """See base class."""
        return InputExample(tensor_dict['idx'].numpy(),
                            tensor_dict['sentence1'].numpy().decode('utf-8'),
                            tensor_dict['sentence2'].numpy().decode('utf-8'),
                            str(tensor_dict['label'].numpy()))
    def get_train_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
@@ -387,198 +513,19 @@ class WnliProcessor(DataProcessor):
                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
        return examples
 glue_tasks_num_labels = {
    "cola": 2,
    "mnli": 3,
    "mrpc": 2,
    "sst-2": 2,
    "sts-b": 1,
    "qqp": 2,
    "qnli": 2,
    "rte": 2,
    "wnli": 2,
 }
-def convert_examples_to_features(examples, label_list, max_seq_length,
+glue_processors = {
                                 tokenizer, output_mode,
                                 cls_token_at_end=False,
                                 cls_token='[CLS]',
                                 cls_token_segment_id=1,
                                 sep_token='[SEP]',
                                 sep_token_extra=False,
                                 pad_on_left=False,
                                 pad_token=0,
                                 pad_token_segment_id=0,
                                 sequence_a_segment_id=0, 
                                 sequence_b_segment_id=1,
                                 mask_padding_with_zero=True):
    """ Loads a data file into a list of `InputBatch`s
        `cls_token_at_end` define the location of the CLS token:
            - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
            - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
        `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)
    """
    label_map = {label : i for i, label in enumerate(label_list)}
    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d of %d" % (ex_index, len(examples)))
        tokens_a = tokenizer.tokenize(example.text_a)
        tokens_b = None
        if example.text_b:
            tokens_b = tokenizer.tokenize(example.text_b)
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3". " -4" for RoBERTa.
            special_tokens_count = 4 if sep_token_extra else 3
            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - special_tokens_count)
        else:
            # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
            special_tokens_count = 3 if sep_token_extra else 2
            if len(tokens_a) > max_seq_length - special_tokens_count:
                tokens_a = tokens_a[:(max_seq_length - special_tokens_count)]
        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids:   0   0   0   0  0     0   0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens = tokens_a + [sep_token]
        if sep_token_extra:
            # roberta uses an extra separator b/w pairs of sentences
            tokens += [sep_token]
        segment_ids = [sequence_a_segment_id] * len(tokens)
        if tokens_b:
            tokens += tokens_b + [sep_token]
            segment_ids += [sequence_b_segment_id] * (len(tokens_b) + 1)
        if cls_token_at_end:
            tokens = tokens + [cls_token]
            segment_ids = segment_ids + [cls_token_segment_id]
        else:
            tokens = [cls_token] + tokens
            segment_ids = [cls_token_segment_id] + segment_ids
        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
        # Zero-pad up to the sequence length.
        padding_length = max_seq_length - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask
            segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
        else:
            input_ids = input_ids + ([pad_token] * padding_length)
            input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
            segment_ids = segment_ids + ([pad_token_segment_id] * padding_length)
        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length
        if output_mode == "classification":
            label_id = label_map[example.label]
        elif output_mode == "regression":
            label_id = float(example.label)
        else:
            raise KeyError(output_mode)
        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("tokens: %s" % " ".join(
                    [str(x) for x in tokens]))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
            logger.info("label: %s (id = %d)" % (example.label, label_id))
        features.append(
                InputFeatures(input_ids=input_ids,
                              input_mask=input_mask,
                              segment_ids=segment_ids,
                              label_id=label_id))
    return features
 def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""
    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()
 def simple_accuracy(preds, labels):
    return (preds == labels).mean()
 def acc_and_f1(preds, labels):
    acc = simple_accuracy(preds, labels)
    f1 = f1_score(y_true=labels, y_pred=preds)
    return {
        "acc": acc,
        "f1": f1,
        "acc_and_f1": (acc + f1) / 2,
    }
 def pearson_and_spearman(preds, labels):
    pearson_corr = pearsonr(preds, labels)[0]
    spearman_corr = spearmanr(preds, labels)[0]
    return {
        "pearson": pearson_corr,
        "spearmanr": spearman_corr,
        "corr": (pearson_corr + spearman_corr) / 2,
    }
 def compute_metrics(task_name, preds, labels):
    assert len(preds) == len(labels)
    if task_name == "cola":
        return {"mcc": matthews_corrcoef(labels, preds)}
    elif task_name == "sst-2":
        return {"acc": simple_accuracy(preds, labels)}
    elif task_name == "mrpc":
        return acc_and_f1(preds, labels)
    elif task_name == "sts-b":
        return pearson_and_spearman(preds, labels)
    elif task_name == "qqp":
        return acc_and_f1(preds, labels)
    elif task_name == "mnli":
        return {"acc": simple_accuracy(preds, labels)}
    elif task_name == "mnli-mm":
        return {"acc": simple_accuracy(preds, labels)}
    elif task_name == "qnli":
        return {"acc": simple_accuracy(preds, labels)}
    elif task_name == "rte":
        return {"acc": simple_accuracy(preds, labels)}
    elif task_name == "wnli":
        return {"acc": simple_accuracy(preds, labels)}
    else:
        raise KeyError(task_name)
 processors = {
    "cola": ColaProcessor,
    "mnli": MnliProcessor,
    "mnli-mm": MnliMismatchedProcessor,
@@ -591,7 +538,7 @@ processors = {
    "wnli": WnliProcessor,
 }
-output_modes = {
+glue_output_modes = {
    "cola": "classification",
    "mnli": "classification",
    "mnli-mm": "classification",
@@ -603,15 +550,3 @@ output_modes = {
    "rte": "classification",
    "wnli": "classification",
 }
 GLUE_TASKS_NUM_LABELS = {
    "cola": 2,
    "mnli": 3,
    "mrpc": 2,
    "sst-2": 2,
    "sts-b": 1,
    "qqp": 2,
    "qnli": 2,
    "rte": 2,
    "wnli": 2,
 }
--- a/transformers/data/processors/utils.py
+++ b/transformers/data/processors/utils.py
@@ -0,0 +1,120 @@
 # coding=utf-8
 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
 # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import csv
 import sys
 import copy
 import json
 class InputExample(object):
    """
    A single training/test example for simple sequence classification.
    Args:
        guid: Unique id for the example.
        text_a: string. The untokenized text of the first sequence. For single
        sequence tasks, only this sequence must be specified.
        text_b: (Optional) string. The untokenized text of the second sequence.
        Only must be specified for sequence pair tasks.
        label: (Optional) string. The label of the example. This should be
        specified for train and dev examples, but not for test examples.
    """
    def __init__(self, guid, text_a, text_b=None, label=None):
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label
    def __repr__(self):
        return str(self.to_json_string())
    def to_dict(self):
        """Serializes this instance to a Python dictionary."""
        output = copy.deepcopy(self.__dict__)
        return output
    def to_json_string(self):
        """Serializes this instance to a JSON string."""
        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
 class InputFeatures(object):
    """
    A single set of features of data.
    Args:
        input_ids: Indices of input sequence tokens in the vocabulary.
        attention_mask: Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            Usually  ``1`` for tokens that are NOT MASKED, ``0`` for MASKED (padded) tokens.
        token_type_ids: Segment token indices to indicate first and second portions of the inputs.
        label: Label corresponding to the input
    """
    def __init__(self, input_ids, attention_mask, token_type_ids, label):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.token_type_ids = token_type_ids
        self.label = label
    def __repr__(self):
        return str(self.to_json_string())
    def to_dict(self):
        """Serializes this instance to a Python dictionary."""
        output = copy.deepcopy(self.__dict__)
        return output
    def to_json_string(self):
        """Serializes this instance to a JSON string."""
        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
 class DataProcessor(object):
    """Base class for data converters for sequence classification data sets."""
    def get_example_from_tensor_dict(self, tensor_dict):
        """Gets an example from a dict with tensorflow tensors
        Args:
            tensor_dict: Keys and values should match the corresponding Glue
                tensorflow_dataset examples.
        """
        raise NotImplementedError()
    def get_train_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the train set."""
        raise NotImplementedError()
    def get_dev_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the dev set."""
        raise NotImplementedError()
    def get_labels(self):
        """Gets the list of labels for this data set."""
        raise NotImplementedError()
    @classmethod
    def _read_tsv(cls, input_file, quotechar=None):
        """Reads a tab separated value file."""
        with open(input_file, "r", encoding="utf-8-sig") as f:
            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
            lines = []
            for line in reader:
                if sys.version_info[0] == 2:
                    line = list(unicode(cell, 'utf-8') for cell in line)
                lines.append(line)
            return lines
--- a/pytorch_transformers/file_utils.py
+++ b/pytorch_transformers/file_utils.py
@@ -23,6 +23,24 @@ from botocore.exceptions import ClientError
 import requests
 from tqdm import tqdm
 logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
 try:
    import tensorflow as tf
    assert int(tf.__version__[0]) >= 2
    _tf_available = True  # pylint: disable=invalid-name
    logger.info("TensorFlow version {} available.".format(tf.__version__))
 except (ImportError, AssertionError):
    _tf_available = False  # pylint: disable=invalid-name
 try:
    import torch
    _torch_available = True  # pylint: disable=invalid-name
    logger.info("PyTorch version {} available.".format(torch.__version__))
 except ImportError:
    _torch_available = False  # pylint: disable=invalid-name
 try:
    from torch.hub import _get_torch_home
    torch_cache_home = _get_torch_home()
@@ -30,7 +48,7 @@ except ImportError:
    torch_cache_home = os.path.expanduser(
        os.getenv('TORCH_HOME', os.path.join(
            os.getenv('XDG_CACHE_HOME', '~/.cache'), 'torch')))
-default_cache_path = os.path.join(torch_cache_home, 'pytorch_transformers')
+default_cache_path = os.path.join(torch_cache_home, 'transformers')
 try:
    from urllib.parse import urlparse
@@ -47,12 +65,18 @@ except (AttributeError, ImportError):
                                                        default_cache_path))
 PYTORCH_TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE  # Kept for backward compatibility
 TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE  # Kept for backward compatibility
 WEIGHTS_NAME = "pytorch_model.bin"
 TF2_WEIGHTS_NAME = 'tf_model.h5'
 TF_WEIGHTS_NAME = 'model.ckpt'
 CONFIG_NAME = "config.json"
-logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
+def is_torch_available():
    return _torch_available
 def is_tf_available():
    return _tf_available
 if not six.PY2:
    def add_start_docstrings(*docstr):
@@ -83,6 +107,9 @@ def url_to_filename(url, etag=None):
    Convert `url` into a hashed filename in a repeatable way.
    If `etag` is specified, append its hash to the url's, delimited
    by a period.
    If the url ends with .h5 (Keras HDF5 weights) ands '.h5' to the name
    so that TF 2.0 can identify it as a HDF5 file
    (see https://github.com/tensorflow/tensorflow/blob/00fad90125b18b80fe054de1055770cfb8fe4ba3/tensorflow/python/keras/engine/network.py#L1380)
    """
    url_bytes = url.encode('utf-8')
    url_hash = sha256(url_bytes)
@@ -93,6 +120,9 @@ def url_to_filename(url, etag=None):
        etag_hash = sha256(etag_bytes)
        filename += '.' + etag_hash.hexdigest()
    if url.endswith('.h5'):
        filename += '.h5'
    return filename
@@ -102,7 +132,7 @@ def filename_to_url(filename, cache_dir=None):
    Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist.
    """
    if cache_dir is None:
-        cache_dir = PYTORCH_TRANSFORMERS_CACHE
+        cache_dir = TRANSFORMERS_CACHE
    if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
        cache_dir = str(cache_dir)
@@ -133,7 +163,7 @@ def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=N
        force_download: if True, re-dowload the file even if it's already cached in the cache dir.
    """
    if cache_dir is None:
-        cache_dir = PYTORCH_TRANSFORMERS_CACHE
+        cache_dir = TRANSFORMERS_CACHE
    if sys.version_info[0] == 3 and isinstance(url_or_filename, Path):
        url_or_filename = str(url_or_filename)
    if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
@@ -222,7 +252,7 @@ def get_from_cache(url, cache_dir=None, force_download=False, proxies=None):
    If it's not there, download it. Then return the path to the cached file.
    """
    if cache_dir is None:
-        cache_dir = PYTORCH_TRANSFORMERS_CACHE
+        cache_dir = TRANSFORMERS_CACHE
    if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
        cache_dir = str(cache_dir)
    if sys.version_info[0] == 2 and not isinstance(cache_dir, str):
--- a/pytorch_transformers/modeling_auto.py
+++ b/pytorch_transformers/modeling_auto.py
@@ -36,7 +36,7 @@ logger = logging.getLogger(__name__)
 class AutoModel(object):
    r"""
-        :class:`~pytorch_transformers.AutoModel` is a generic model class
+        :class:`~transformers.AutoModel` is a generic model class
        that will be instantiated as one of the base model classes of the library
        when created with the `AutoModel.from_pretrained(pretrained_model_name_or_path)`
        class method.
@@ -84,23 +84,23 @@ class AutoModel(object):
            pretrained_model_name_or_path: either:
                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
+                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
            model_args: (`optional`) Sequence of positional arguments:
                All remaning positional arguments will be passed to the underlying model's ``__init__`` method
-            config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`:
+            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
                Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
-                - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
+                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
                - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
            state_dict: (`optional`) dict:
                an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
-                In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option.
+                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
            cache_dir: (`optional`) string:
                Path to a directory in which a downloaded pre-trained model
@@ -120,7 +120,7 @@ class AutoModel(object):
                Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
-                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
+                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
        Examples::
@@ -157,7 +157,7 @@ class AutoModel(object):
 class AutoModelWithLMHead(object):
    r"""
-        :class:`~pytorch_transformers.AutoModelWithLMHead` is a generic model class
+        :class:`~transformers.AutoModelWithLMHead` is a generic model class
        that will be instantiated as one of the language modeling model classes of the library
        when created with the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)`
        class method.
@@ -208,23 +208,23 @@ class AutoModelWithLMHead(object):
            pretrained_model_name_or_path: either:
                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
+                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
            model_args: (`optional`) Sequence of positional arguments:
                All remaning positional arguments will be passed to the underlying model's ``__init__`` method
-            config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`:
+            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
                Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
-                - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
+                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
                - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
            state_dict: (`optional`) dict:
                an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
-                In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option.
+                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
            cache_dir: (`optional`) string:
                Path to a directory in which a downloaded pre-trained model
@@ -244,7 +244,7 @@ class AutoModelWithLMHead(object):
                Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
-                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
+                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
        Examples::
@@ -281,7 +281,7 @@ class AutoModelWithLMHead(object):
 class AutoModelForSequenceClassification(object):
    r"""
-        :class:`~pytorch_transformers.AutoModelForSequenceClassification` is a generic model class
+        :class:`~transformers.AutoModelForSequenceClassification` is a generic model class
        that will be instantiated as one of the sequence classification model classes of the library
        when created with the `AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)`
        class method.
@@ -326,23 +326,23 @@ class AutoModelForSequenceClassification(object):
            pretrained_model_name_or_path: either:
                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
+                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
            model_args: (`optional`) Sequence of positional arguments:
                All remaning positional arguments will be passed to the underlying model's ``__init__`` method
-            config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`:
+            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
                Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
-                - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
+                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
                - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
            state_dict: (`optional`) dict:
                an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
-                In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option.
+                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
            cache_dir: (`optional`) string:
                Path to a directory in which a downloaded pre-trained model
@@ -362,7 +362,7 @@ class AutoModelForSequenceClassification(object):
                Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
-                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
+                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
        Examples::
@@ -392,7 +392,7 @@ class AutoModelForSequenceClassification(object):
 class AutoModelForQuestionAnswering(object):
    r"""
-        :class:`~pytorch_transformers.AutoModelForQuestionAnswering` is a generic model class
+        :class:`~transformers.AutoModelForQuestionAnswering` is a generic model class
        that will be instantiated as one of the question answering model classes of the library
        when created with the `AutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)`
        class method.
@@ -435,23 +435,23 @@ class AutoModelForQuestionAnswering(object):
            pretrained_model_name_or_path: either:
                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
+                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
            model_args: (`optional`) Sequence of positional arguments:
                All remaning positional arguments will be passed to the underlying model's ``__init__`` method
-            config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`:
+            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
                Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
-                - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
+                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
                - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
            state_dict: (`optional`) dict:
                an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
-                In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option.
+                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
            cache_dir: (`optional`) string:
                Path to a directory in which a downloaded pre-trained model
@@ -471,7 +471,7 @@ class AutoModelForQuestionAnswering(object):
                Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
-                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
+                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
        Examples::
--- a/pytorch_transformers/modeling_bert.py
+++ b/pytorch_transformers/modeling_bert.py
@@ -118,26 +118,27 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
 def gelu(x):
-    """Implementation of the gelu activation function.
+    """ Original Implementation of the gelu activation function in Google Bert repo when initially created.
        For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
        0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
        Also see https://arxiv.org/abs/1606.08415
    """
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
 def gelu_new(x):
    """ Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT).
        Also see https://arxiv.org/abs/1606.08415
    """
    return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
 def swish(x):
    return x * torch.sigmoid(x)
-ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish}
+ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish, "gelu_new": gelu_new}
-try:
+BertLayerNorm = torch.nn.LayerNorm
    from apex.normalization.fused_layer_norm import FusedLayerNorm as BertLayerNorm
 except (ImportError, AttributeError) as e:
    logger.info("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .")
    BertLayerNorm = torch.nn.LayerNorm
 class BertEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings.
@@ -195,7 +196,7 @@ class BertSelfAttention(nn.Module):
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)
-    def forward(self, hidden_states, attention_mask, head_mask=None):
+    def forward(self, hidden_states, attention_mask=None, head_mask=None):
        mixed_query_layer = self.query(hidden_states)
        mixed_key_layer = self.key(hidden_states)
        mixed_value_layer = self.value(hidden_states)
@@ -207,8 +208,9 @@ class BertSelfAttention(nn.Module):
        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
-        # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
+        if attention_mask is not None:
-        attention_scores = attention_scores + attention_mask
+            # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
            attention_scores = attention_scores + attention_mask
        # Normalize the attention scores to probabilities.
        attention_probs = nn.Softmax(dim=-1)(attention_scores)
@@ -275,7 +277,7 @@ class BertAttention(nn.Module):
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)
-    def forward(self, input_tensor, attention_mask, head_mask=None):
+    def forward(self, input_tensor, attention_mask=None, head_mask=None):
        self_outputs = self.self(input_tensor, attention_mask, head_mask)
        attention_output = self.output(self_outputs[0], input_tensor)
        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
@@ -318,7 +320,7 @@ class BertLayer(nn.Module):
        self.intermediate = BertIntermediate(config)
        self.output = BertOutput(config)
-    def forward(self, hidden_states, attention_mask, head_mask=None):
+    def forward(self, hidden_states, attention_mask=None, head_mask=None):
        attention_outputs = self.attention(hidden_states, attention_mask, head_mask)
        attention_output = attention_outputs[0]
        intermediate_output = self.intermediate(attention_output)
@@ -334,7 +336,7 @@ class BertEncoder(nn.Module):
        self.output_hidden_states = config.output_hidden_states
        self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])
-    def forward(self, hidden_states, attention_mask, head_mask=None):
+    def forward(self, hidden_states, attention_mask=None, head_mask=None):
        all_hidden_states = ()
        all_attentions = ()
        for i, layer_module in enumerate(self.layer):
@@ -480,9 +482,9 @@ BERT_START_DOCSTRING = r"""    The BERT model was proposed in
        https://pytorch.org/docs/stable/nn.html#module
    Parameters:
-        config (:class:`~pytorch_transformers.BertConfig`): Model configuration class with all the parameters of the model. 
+        config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model. 
            Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
 BERT_INPUTS_DOCSTRING = r"""
@@ -506,9 +508,9 @@ BERT_INPUTS_DOCSTRING = r"""
            Bert is a model with absolute position embeddings so it's usually advised to pad the inputs on
            the right rather than the left.
-            Indices can be obtained using :class:`pytorch_transformers.BertTokenizer`.
+            Indices can be obtained using :class:`transformers.BertTokenizer`.
-            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
+            See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
--- a/pytorch_transformers/modeling_distilbert.py
+++ b/pytorch_transformers/modeling_distilbert.py
@@ -372,9 +372,9 @@ DISTILBERT_START_DOCSTRING = r"""
        https://medium.com/huggingface/distilbert-8cf3380435b5
    Parameters:
-        config (:class:`~pytorch_transformers.DistilBertConfig`): Model configuration class with all the parameters of the model. 
+        config (:class:`~transformers.DistilBertConfig`): Model configuration class with all the parameters of the model. 
            Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
 DISTILBERT_INPUTS_DOCSTRING = r"""
@@ -649,7 +649,7 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
        start_positions = torch.tensor([1])
        end_positions = torch.tensor([3])
        outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
-        loss, start_scores, end_scores = outputs[:2]
+        loss, start_scores, end_scores = outputs[:3]
    """
    def __init__(self, config):
--- a/pytorch_transformers/modeling_gpt2.py
+++ b/pytorch_transformers/modeling_gpt2.py
@@ -38,7 +38,8 @@ logger = logging.getLogger(__name__)
 GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin",
                                     "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-pytorch_model.bin",
-                                     "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-pytorch_model.bin"}
+                                     "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-pytorch_model.bin",
                                     "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-pytorch_model.bin",}
 def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
    """ Load tf checkpoints in a pytorch model
@@ -280,9 +281,9 @@ GPT2_START_DOCSTRING = r"""    OpenAI GPT-2 model was proposed in
        https://pytorch.org/docs/stable/nn.html#module
    Parameters:
-        config (:class:`~pytorch_transformers.GPT2Config`): Model configuration class with all the parameters of the model.
+        config (:class:`~transformers.GPT2Config`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
 GPT2_INPUTS_DOCSTRING = r"""    Inputs:
@@ -290,9 +291,9 @@ GPT2_INPUTS_DOCSTRING = r"""    Inputs:
            Indices of input sequence tokens in the vocabulary.
            GPT-2 is a model with absolute position embeddings so it's usually advised to pad the inputs on
            the right rather than the left.
-            Indices can be obtained using :class:`pytorch_transformers.GPT2Tokenizer`.
+            Indices can be obtained using :class:`transformers.GPT2Tokenizer`.
-            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
+            See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
        **past**:
            list of ``torch.FloatTensor`` (one for each layer):
            that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
@@ -367,6 +368,13 @@ class GPT2Model(GPT2PreTrainedModel):
            self.h[layer].attn.prune_heads(heads)
    def forward(self, input_ids, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
        input_shape = input_ids.size()
        input_ids = input_ids.view(-1, input_shape[-1])
        if token_type_ids is not None:
            token_type_ids = token_type_ids.view(-1, input_shape[-1])
        if position_ids is not None:
            position_ids = position_ids.view(-1, input_shape[-1])
        if past is None:
            past_length = 0
            past = [None] * len(self.h)
@@ -378,6 +386,7 @@ class GPT2Model(GPT2PreTrainedModel):
        # Attention mask.
        if attention_mask is not None:
            attention_mask = attention_mask.view(-1, input_shape[-1])
            # We create a 3D attention mask from a 2D tensor mask.
            # Sizes are [batch_size, 1, 1, to_seq_length]
            # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
@@ -407,14 +416,9 @@ class GPT2Model(GPT2PreTrainedModel):
        else:
            head_mask = [None] * self.config.n_layer
        input_shape = input_ids.size()
        input_ids = input_ids.view(-1, input_ids.size(-1))
        position_ids = position_ids.view(-1, position_ids.size(-1))
        inputs_embeds = self.wte(input_ids)
        position_embeds = self.wpe(position_ids)
        if token_type_ids is not None:
            token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1))
            token_type_embeds = self.wte(token_type_ids)
        else:
            token_type_embeds = 0
@@ -490,7 +494,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
    Examples::
        import torch
-        from pytorch_transformers import GPT2Tokenizer, GPT2LMHeadModel
+        from transformers import GPT2Tokenizer, GPT2LMHeadModel
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        model = GPT2LMHeadModel.from_pretrained('gpt2')
@@ -586,7 +590,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
    Examples::
        import torch
-        from pytorch_transformers import GPT2Tokenizer, GPT2DoubleHeadsModel
+        from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        model = GPT2DoubleHeadsModel.from_pretrained('gpt2')
--- a/pytorch_transformers/modeling_openai.py
+++ b/pytorch_transformers/modeling_openai.py
@@ -294,9 +294,9 @@ OPENAI_GPT_START_DOCSTRING = r"""    OpenAI GPT model was proposed in
        https://pytorch.org/docs/stable/nn.html#module
    Parameters:
-        config (:class:`~pytorch_transformers.OpenAIGPTConfig`): Model configuration class with all the parameters of the model.
+        config (:class:`~transformers.OpenAIGPTConfig`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
 OPENAI_GPT_INPUTS_DOCSTRING = r"""    Inputs:
@@ -304,9 +304,9 @@ OPENAI_GPT_INPUTS_DOCSTRING = r"""    Inputs:
            Indices of input sequence tokens in the vocabulary.
            GPT is a model with absolute position embeddings so it's usually advised to pad the inputs on
            the right rather than the left.
-            Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`.
+            Indices can be obtained using :class:`transformers.BPT2Tokenizer`.
-            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
+            See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
--- a/pytorch_transformers/modeling_roberta.py
+++ b/pytorch_transformers/modeling_roberta.py
@@ -43,6 +43,9 @@ class RobertaEmbeddings(BertEmbeddings):
    def __init__(self, config):
        super(RobertaEmbeddings, self).__init__(config)
        self.padding_idx = 1
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=self.padding_idx)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size,
                                                padding_idx=self.padding_idx)
    def forward(self, input_ids, token_type_ids=None, position_ids=None):
        seq_length = input_ids.size(1)
@@ -77,9 +80,9 @@ ROBERTA_START_DOCSTRING = r"""    The RoBERTa model was proposed in
        https://pytorch.org/docs/stable/nn.html#module
    Parameters:
-        config (:class:`~pytorch_transformers.RobertaConfig`): Model configuration class with all the parameters of the 
+        config (:class:`~transformers.RobertaConfig`): Model configuration class with all the parameters of the 
            model. Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
 ROBERTA_INPUTS_DOCSTRING = r"""
@@ -102,8 +105,8 @@ ROBERTA_INPUTS_DOCSTRING = r"""
            RoBERTa is a model with absolute position embeddings so it's usually advised to pad the inputs on
            the right rather than the left.
-            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
+            See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
@@ -361,9 +364,9 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
                ``token_type_ids:   0   0   0   0  0     0   0``
-            Indices can be obtained using :class:`pytorch_transformers.BertTokenizer`.
+            Indices can be obtained using :class:`transformers.BertTokenizer`.
-            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
+            See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
            Segment token indices to indicate first and second portions of the inputs.
            The second dimension of the input (`num_choices`) indicates the number of choices to score.
--- a/transformers/modeling_tf_auto.py
+++ b/transformers/modeling_tf_auto.py
@@ -0,0 +1,501 @@
 # coding=utf-8
 # Copyright 2018 The HuggingFace Inc. team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """ Auto Model class. """
 from __future__ import absolute_import, division, print_function, unicode_literals
 import logging
 from .modeling_tf_bert import TFBertModel, TFBertForMaskedLM, TFBertForSequenceClassification, TFBertForQuestionAnswering
 from .modeling_tf_openai import TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel
 from .modeling_tf_gpt2 import TFGPT2Model, TFGPT2LMHeadModel
 from .modeling_tf_transfo_xl import TFTransfoXLModel, TFTransfoXLLMHeadModel
 from .modeling_tf_xlnet import TFXLNetModel, TFXLNetLMHeadModel, TFXLNetForSequenceClassification, TFXLNetForQuestionAnsweringSimple
 from .modeling_tf_xlm import TFXLMModel, TFXLMWithLMHeadModel, TFXLMForSequenceClassification, TFXLMForQuestionAnsweringSimple
 from .modeling_tf_roberta import TFRobertaModel, TFRobertaForMaskedLM, TFRobertaForSequenceClassification
 from .modeling_tf_distilbert import TFDistilBertModel, TFDistilBertForQuestionAnswering, TFDistilBertForMaskedLM, TFDistilBertForSequenceClassification
 from .file_utils import add_start_docstrings
 logger = logging.getLogger(__name__)
 class TFAutoModel(object):
    r"""
        :class:`~transformers.TFAutoModel` is a generic model class
        that will be instantiated as one of the base model classes of the library
        when created with the `TFAutoModel.from_pretrained(pretrained_model_name_or_path)`
        class method.
        The `from_pretrained()` method takes care of returning the correct model class instance
        using pattern matching on the `pretrained_model_name_or_path` string.
        The base model class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
            - contains `distilbert`: TFDistilBertModel (DistilBERT model)
            - contains `roberta`: TFRobertaModel (RoBERTa model)
            - contains `bert`: TFBertModel (Bert model)
            - contains `openai-gpt`: TFOpenAIGPTModel (OpenAI GPT model)
            - contains `gpt2`: TFGPT2Model (OpenAI GPT-2 model)
            - contains `transfo-xl`: TFTransfoXLModel (Transformer-XL model)
            - contains `xlnet`: TFXLNetModel (XLNet model)
            - contains `xlm`: TFXLMModel (XLM model)
        This class cannot be instantiated using `__init__()` (throws an error).
    """
    def __init__(self):
        raise EnvironmentError("TFAutoModel is designed to be instantiated "
            "using the `TFAutoModel.from_pretrained(pretrained_model_name_or_path)` method.")
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        r""" Instantiates one of the base model classes of the library
        from a pre-trained model configuration.
        The model class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
            - contains `distilbert`: TFDistilBertModel (DistilBERT model)
            - contains `roberta`: TFRobertaModel (RoBERTa model)
            - contains `bert`: TFTFBertModel (Bert model)
            - contains `openai-gpt`: TFOpenAIGPTModel (OpenAI GPT model)
            - contains `gpt2`: TFGPT2Model (OpenAI GPT-2 model)
            - contains `transfo-xl`: TFTransfoXLModel (Transformer-XL model)
            - contains `xlnet`: TFXLNetModel (XLNet model)
            - contains `xlm`: TFXLMModel (XLM model)
        Params:
            pretrained_model_name_or_path: either:
                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument.
            from_pt: (`Optional`) Boolean
                Set to True if the Checkpoint is a PyTorch checkpoint.
            model_args: (`optional`) Sequence of positional arguments:
                All remaning positional arguments will be passed to the underlying model's ``__init__`` method
            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
                Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
                - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
            state_dict: (`optional`) dict:
                an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
            cache_dir: (`optional`) string:
                Path to a directory in which a downloaded pre-trained model
                configuration should be cached if the standard cache should not be used.
            force_download: (`optional`) boolean, default False:
                Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
            proxies: (`optional`) dict, default None:
                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
                The proxies are used on each request.
            output_loading_info: (`optional`) boolean:
                Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages.
            kwargs: (`optional`) Remaining dictionary of keyword arguments:
                Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
        Examples::
            model = TFAutoModel.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
            model = TFAutoModel.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
            model = TFAutoModel.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
            assert model.config.output_attention == True
            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
            model = TFAutoModel.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config)
        """
        if 'distilbert' in pretrained_model_name_or_path:
            return TFDistilBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'roberta' in pretrained_model_name_or_path:
            return TFRobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'bert' in pretrained_model_name_or_path:
            return TFBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'openai-gpt' in pretrained_model_name_or_path:
            return TFOpenAIGPTModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'gpt2' in pretrained_model_name_or_path:
            return TFGPT2Model.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'transfo-xl' in pretrained_model_name_or_path:
            return TFTransfoXLModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'xlnet' in pretrained_model_name_or_path:
            return TFXLNetModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'xlm' in pretrained_model_name_or_path:
            return TFXLMModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        raise ValueError("Unrecognized model identifier in {}. Should contains one of "
                         "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
                         "'xlm', 'roberta'".format(pretrained_model_name_or_path))
 class TFAutoModelWithLMHead(object):
    r"""
        :class:`~transformers.TFAutoModelWithLMHead` is a generic model class
        that will be instantiated as one of the language modeling model classes of the library
        when created with the `TFAutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)`
        class method.
        The `from_pretrained()` method takes care of returning the correct model class instance
        using pattern matching on the `pretrained_model_name_or_path` string.
        The model class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
            - contains `distilbert`: TFDistilBertForMaskedLM (DistilBERT model)
            - contains `roberta`: TFRobertaForMaskedLM (RoBERTa model)
            - contains `bert`: TFBertForMaskedLM (Bert model)
            - contains `openai-gpt`: TFOpenAIGPTLMHeadModel (OpenAI GPT model)
            - contains `gpt2`: TFGPT2LMHeadModel (OpenAI GPT-2 model)
            - contains `transfo-xl`: TFTransfoXLLMHeadModel (Transformer-XL model)
            - contains `xlnet`: TFXLNetLMHeadModel (XLNet model)
            - contains `xlm`: TFXLMWithLMHeadModel (XLM model)
        This class cannot be instantiated using `__init__()` (throws an error).
    """
    def __init__(self):
        raise EnvironmentError("TFAutoModelWithLMHead is designed to be instantiated "
            "using the `TFAutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` method.")
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        r""" Instantiates one of the language modeling model classes of the library
        from a pre-trained model configuration.
        The `from_pretrained()` method takes care of returning the correct model class instance
        using pattern matching on the `pretrained_model_name_or_path` string.
        The model class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
            - contains `distilbert`: TFDistilBertForMaskedLM (DistilBERT model)
            - contains `roberta`: TFRobertaForMaskedLM (RoBERTa model)
            - contains `bert`: TFBertForMaskedLM (Bert model)
            - contains `openai-gpt`: TFOpenAIGPTLMHeadModel (OpenAI GPT model)
            - contains `gpt2`: TFGPT2LMHeadModel (OpenAI GPT-2 model)
            - contains `transfo-xl`: TFTransfoXLLMHeadModel (Transformer-XL model)
            - contains `xlnet`: TFXLNetLMHeadModel (XLNet model)
            - contains `xlm`: TFXLMWithLMHeadModel (XLM model)
        Params:
            pretrained_model_name_or_path: either:
                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument.
            from_pt: (`Optional`) Boolean
                Set to True if the Checkpoint is a PyTorch checkpoint.
            model_args: (`optional`) Sequence of positional arguments:
                All remaning positional arguments will be passed to the underlying model's ``__init__`` method
            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
                Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
                - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
            state_dict: (`optional`) dict:
                an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
            cache_dir: (`optional`) string:
                Path to a directory in which a downloaded pre-trained model
                configuration should be cached if the standard cache should not be used.
            force_download: (`optional`) boolean, default False:
                Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
            proxies: (`optional`) dict, default None:
                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
                The proxies are used on each request.
            output_loading_info: (`optional`) boolean:
                Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages.
            kwargs: (`optional`) Remaining dictionary of keyword arguments:
                Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
        Examples::
            model = TFAutoModelWithLMHead.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
            model = TFAutoModelWithLMHead.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
            model = TFAutoModelWithLMHead.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
            assert model.config.output_attention == True
            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
            model = TFAutoModelWithLMHead.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config)
        """
        if 'distilbert' in pretrained_model_name_or_path:
            return TFDistilBertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'roberta' in pretrained_model_name_or_path:
            return TFRobertaForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'bert' in pretrained_model_name_or_path:
            return TFBertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'openai-gpt' in pretrained_model_name_or_path:
            return TFOpenAIGPTLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'gpt2' in pretrained_model_name_or_path:
            return TFGPT2LMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'transfo-xl' in pretrained_model_name_or_path:
            return TFTransfoXLLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'xlnet' in pretrained_model_name_or_path:
            return TFXLNetLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'xlm' in pretrained_model_name_or_path:
            return TFXLMWithLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        raise ValueError("Unrecognized model identifier in {}. Should contains one of "
                         "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
                         "'xlm', 'roberta'".format(pretrained_model_name_or_path))
 class TFAutoModelForSequenceClassification(object):
    r"""
        :class:`~transformers.TFAutoModelForSequenceClassification` is a generic model class
        that will be instantiated as one of the sequence classification model classes of the library
        when created with the `TFAutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)`
        class method.
        The `from_pretrained()` method takes care of returning the correct model class instance
        using pattern matching on the `pretrained_model_name_or_path` string.
        The model class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
            - contains `distilbert`: TFDistilBertForSequenceClassification (DistilBERT model)
            - contains `roberta`: TFRobertaForSequenceClassification (RoBERTa model)
            - contains `bert`: TFBertForSequenceClassification (Bert model)
            - contains `xlnet`: TFXLNetForSequenceClassification (XLNet model)
            - contains `xlm`: TFXLMForSequenceClassification (XLM model)
        This class cannot be instantiated using `__init__()` (throws an error).
    """
    def __init__(self):
        raise EnvironmentError("TFAutoModelWithLMHead is designed to be instantiated "
            "using the `TFAutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` method.")
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        r""" Instantiates one of the sequence classification model classes of the library
        from a pre-trained model configuration.
        The `from_pretrained()` method takes care of returning the correct model class instance
        using pattern matching on the `pretrained_model_name_or_path` string.
        The model class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
            - contains `distilbert`: TFDistilBertForSequenceClassification (DistilBERT model)
            - contains `roberta`: TFRobertaForSequenceClassification (RoBERTa model)
            - contains `bert`: TFBertForSequenceClassification (Bert model)
            - contains `xlnet`: TFXLNetForSequenceClassification (XLNet model)
            - contains `xlm`: TFXLMForSequenceClassification (XLM model)
        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
        To train the model, you should first set it back in training mode with `model.train()`
        Params:
            pretrained_model_name_or_path: either:
                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument.
            from_pt: (`Optional`) Boolean
                Set to True if the Checkpoint is a PyTorch checkpoint.
            model_args: (`optional`) Sequence of positional arguments:
                All remaning positional arguments will be passed to the underlying model's ``__init__`` method
            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
                Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
                - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
            state_dict: (`optional`) dict:
                an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
            cache_dir: (`optional`) string:
                Path to a directory in which a downloaded pre-trained model
                configuration should be cached if the standard cache should not be used.
            force_download: (`optional`) boolean, default False:
                Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
            proxies: (`optional`) dict, default None:
                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
                The proxies are used on each request.
            output_loading_info: (`optional`) boolean:
                Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages.
            kwargs: (`optional`) Remaining dictionary of keyword arguments:
                Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
        Examples::
            model = TFAutoModelForSequenceClassification.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
            model = TFAutoModelForSequenceClassification.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
            model = TFAutoModelForSequenceClassification.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
            assert model.config.output_attention == True
            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
            model = TFAutoModelForSequenceClassification.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config)
        """
        if 'distilbert' in pretrained_model_name_or_path:
            return TFDistilBertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'roberta' in pretrained_model_name_or_path:
            return TFRobertaForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'bert' in pretrained_model_name_or_path:
            return TFBertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'xlnet' in pretrained_model_name_or_path:
            return TFXLNetForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'xlm' in pretrained_model_name_or_path:
            return TFXLMForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        raise ValueError("Unrecognized model identifier in {}. Should contains one of "
                         "'bert', 'xlnet', 'xlm', 'roberta'".format(pretrained_model_name_or_path))
 class TFAutoModelForQuestionAnswering(object):
    r"""
        :class:`~transformers.TFAutoModelForQuestionAnswering` is a generic model class
        that will be instantiated as one of the question answering model classes of the library
        when created with the `TFAutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)`
        class method.
        The `from_pretrained()` method takes care of returning the correct model class instance
        using pattern matching on the `pretrained_model_name_or_path` string.
        The model class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
            - contains `distilbert`: TFDistilBertForQuestionAnswering (DistilBERT model)
            - contains `bert`: TFBertForQuestionAnswering (Bert model)
            - contains `xlnet`: TFXLNetForQuestionAnswering (XLNet model)
            - contains `xlm`: TFXLMForQuestionAnswering (XLM model)
        This class cannot be instantiated using `__init__()` (throws an error).
    """
    def __init__(self):
        raise EnvironmentError("TFAutoModelWithLMHead is designed to be instantiated "
            "using the `TFAutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` method.")
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        r""" Instantiates one of the question answering model classes of the library
        from a pre-trained model configuration.
        The `from_pretrained()` method takes care of returning the correct model class instance
        using pattern matching on the `pretrained_model_name_or_path` string.
        The model class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
            - contains `distilbert`: TFDistilBertForQuestionAnswering (DistilBERT model)
            - contains `bert`: TFBertForQuestionAnswering (Bert model)
            - contains `xlnet`: TFXLNetForQuestionAnswering (XLNet model)
            - contains `xlm`: TFXLMForQuestionAnswering (XLM model)
        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
        To train the model, you should first set it back in training mode with `model.train()`
        Params:
            pretrained_model_name_or_path: either:
                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument.
            from_pt: (`Optional`) Boolean
                Set to True if the Checkpoint is a PyTorch checkpoint.
            model_args: (`optional`) Sequence of positional arguments:
                All remaning positional arguments will be passed to the underlying model's ``__init__`` method
            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
                Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
                - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
            state_dict: (`optional`) dict:
                an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
            cache_dir: (`optional`) string:
                Path to a directory in which a downloaded pre-trained model
                configuration should be cached if the standard cache should not be used.
            force_download: (`optional`) boolean, default False:
                Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
            proxies: (`optional`) dict, default None:
                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
                The proxies are used on each request.
            output_loading_info: (`optional`) boolean:
                Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages.
            kwargs: (`optional`) Remaining dictionary of keyword arguments:
                Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
        Examples::
            model = TFAutoModelForQuestionAnswering.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
            model = TFAutoModelForQuestionAnswering.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
            model = TFAutoModelForQuestionAnswering.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
            assert model.config.output_attention == True
            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
            model = TFAutoModelForQuestionAnswering.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config)
        """
        if 'distilbert' in pretrained_model_name_or_path:
            return TFDistilBertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'bert' in pretrained_model_name_or_path:
            return TFBertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'xlnet' in pretrained_model_name_or_path:
            return TFXLNetForQuestionAnsweringSimple.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'xlm' in pretrained_model_name_or_path:
            return TFXLMForQuestionAnsweringSimple.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        raise ValueError("Unrecognized model identifier in {}. Should contains one of "
                         "'bert', 'xlnet', 'xlm'".format(pretrained_model_name_or_path))
--- a/transformers/modeling_tf_bert.py
+++ b/transformers/modeling_tf_bert.py
--- a/transformers/modeling_tf_distilbert.py
+++ b/transformers/modeling_tf_distilbert.py
@@ -0,0 +1,743 @@
 # coding=utf-8
 # Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """ TF 2.0 DistilBERT model
 """
 from __future__ import absolute_import, division, print_function, unicode_literals
 import json
 import logging
 import math
 import copy
 import sys
 from io import open
 import itertools
 import numpy as np
 import tensorflow as tf
 from .configuration_distilbert import DistilBertConfig
 from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, shape_list, get_initializer
 from .file_utils import add_start_docstrings
 from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model
 logger = logging.getLogger(__name__)
 TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
    'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-tf_model.h5",
    'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-tf_model.h5"
 }
 ### UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE ###
 def gelu(x):
    """ Gaussian Error Linear Unit.
    Original Implementation of the gelu activation function in Google Bert repo when initially created.
        For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
        0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
        Also see https://arxiv.org/abs/1606.08415
    """
    cdf = 0.5 * (1.0 + tf.math.erf(x / tf.math.sqrt(2.0)))
    return x * cdf
 def gelu_new(x):
    """Gaussian Error Linear Unit.
    This is a smoother version of the RELU.
    Original paper: https://arxiv.org/abs/1606.08415
    Args:
        x: float Tensor to perform activation.
    Returns:
        `x` with the GELU activation applied.
    """
    cdf = 0.5 * (1.0 + tf.tanh(
        (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
    return x * cdf
 def load_distilbert_pt_weights_in_tf2(tf_model, pytorch_checkpoint_path):
    # build the network
    inputs_list = tf.constant([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]])
    attns_list = tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
    tf_inputs = [inputs_list, attns_list]
    tfo = tf_model(tf_inputs, training=False)
    return load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=tf_inputs)
 class TFEmbeddings(tf.keras.layers.Layer):
    def __init__(self, config, **kwargs):
        super(TFEmbeddings, self).__init__(**kwargs)
        self.vocab_size = config.vocab_size
        self.dim = config.dim
        self.initializer_range = config.initializer_range
        self.word_embeddings = TFSharedEmbeddings(config.vocab_size,
                                                  config.dim,
                                                  initializer_range=config.initializer_range,
                                                  name='word_embeddings')  # padding_idx=0)
        self.position_embeddings = tf.keras.layers.Embedding(config.max_position_embeddings,
                                                             config.dim,
                                                             embeddings_initializer=get_initializer(config.initializer_range),
                                                             name='position_embeddings')
        if config.sinusoidal_pos_embds:
            raise NotImplementedError
        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="LayerNorm")
        self.dropout = tf.keras.layers.Dropout(config.dropout)
    def build(self, input_shape):
        """Build shared word embedding layer """
        with tf.name_scope("word_embeddings"):
            # Create and initialize weights. The random normal initializer was chosen
            # arbitrarily, and works well.
            self.word_embeddings = self.add_weight(
                "weight",
                shape=[self.vocab_size, self.dim],
                initializer=get_initializer(self.initializer_range))
        super(TFEmbeddings, self).build(input_shape)
    def call(self, inputs, mode="embedding", training=False):
        """Get token embeddings of inputs.
        Args:
            inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids)
            mode: string, a valid value is one of "embedding" and "linear".
        Returns:
            outputs: (1) If mode == "embedding", output embedding tensor, float32 with
                shape [batch_size, length, embedding_size]; (2) mode == "linear", output
                linear tensor, float32 with shape [batch_size, length, vocab_size].
        Raises:
            ValueError: if mode is not valid.
        Shared weights logic adapted from
            https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
        """
        if mode == "embedding":
            return self._embedding(inputs, training=training)
        elif mode == "linear":
            return self._linear(inputs)
        else:
            raise ValueError("mode {} is not valid.".format(mode))
    def _embedding(self, inputs, training=False):
        """
        Parameters
        ----------
        input_ids: tf.Tensor(bs, max_seq_length)
            The token ids to embed.
        Outputs
        -------
        embeddings: tf.Tensor(bs, max_seq_length, dim)
            The embedded tokens (plus position embeddings, no token_type embeddings)
        """
        if not isinstance(inputs, (tuple, list)):
            input_ids = inputs
            position_ids = None
        else:
            input_ids, position_ids = inputs
        seq_length = tf.shape(input_ids)[1]
        if position_ids is None:
            position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :]
        word_embeddings = tf.gather(self.word_embeddings, input_ids)
        position_embeddings = self.position_embeddings(position_ids)  # (bs, max_seq_length, dim)
        embeddings = word_embeddings + position_embeddings            # (bs, max_seq_length, dim)
        embeddings = self.LayerNorm(embeddings)                       # (bs, max_seq_length, dim)
        embeddings = self.dropout(embeddings, training=training)      # (bs, max_seq_length, dim)
        return embeddings
    def _linear(self, inputs):
        """Computes logits by running inputs through a linear layer.
            Args:
                inputs: A float32 tensor with shape [batch_size, length, hidden_size]
            Returns:
                float32 tensor with shape [batch_size, length, vocab_size].
        """
        batch_size = tf.shape(inputs)[0]
        length = tf.shape(inputs)[1]
        x = tf.reshape(inputs, [-1, self.dim])
        logits = tf.matmul(x, self.word_embeddings, transpose_b=True)
        return tf.reshape(logits, [batch_size, length, self.vocab_size])
 class TFMultiHeadSelfAttention(tf.keras.layers.Layer):
    def __init__(self, config, **kwargs):
        super(TFMultiHeadSelfAttention, self).__init__(**kwargs)
        self.n_heads = config.n_heads
        self.dim = config.dim
        self.dropout = tf.keras.layers.Dropout(config.attention_dropout)
        self.output_attentions = config.output_attentions
        assert self.dim % self.n_heads == 0
        self.q_lin = tf.keras.layers.Dense(config.dim,
                                           kernel_initializer=get_initializer(config.initializer_range),
                                           name="q_lin")
        self.k_lin = tf.keras.layers.Dense(config.dim,
                                           kernel_initializer=get_initializer(config.initializer_range),
                                           name="k_lin")
        self.v_lin = tf.keras.layers.Dense(config.dim,
                                           kernel_initializer=get_initializer(config.initializer_range),
                                           name="v_lin")
        self.out_lin = tf.keras.layers.Dense(config.dim,
                                           kernel_initializer=get_initializer(config.initializer_range),
                                           name="out_lin")
        self.pruned_heads = set()
    def prune_heads(self, heads):
        raise NotImplementedError
    def call(self, inputs, training=False):
        """
        Parameters
        ----------
        query: tf.Tensor(bs, seq_length, dim)
        key: tf.Tensor(bs, seq_length, dim)
        value: tf.Tensor(bs, seq_length, dim)
        mask: tf.Tensor(bs, seq_length)
        Outputs
        -------
        weights: tf.Tensor(bs, n_heads, seq_length, seq_length)
            Attention weights
        context: tf.Tensor(bs, seq_length, dim)
            Contextualized layer. Optional: only if `output_attentions=True`
        """
        query, key, value, mask, head_mask = inputs
        bs, q_length, dim = shape_list(query)
        k_length = shape_list(key)[1]
        # assert dim == self.dim, 'Dimensions do not match: %s input vs %s configured' % (dim, self.dim)
        # assert key.size() == value.size()
        dim_per_head = self.dim // self.n_heads
        assert 2 <= len(tf.shape(mask)) <= 3
        causal = (len(tf.shape(mask)) == 3)
        mask_reshape = [bs, 1, 1, k_length]
        def shape(x):
            """ separate heads """
            return tf.transpose(tf.reshape(x, (bs, -1, self.n_heads, dim_per_head)), perm=(0, 2, 1, 3))
        def unshape(x):
            """ group heads """
            return tf.reshape(tf.transpose(x, perm=(0, 2, 1, 3)), (bs, -1, self.n_heads * dim_per_head))
        q = shape(self.q_lin(query))           # (bs, n_heads, q_length, dim_per_head)
        k = shape(self.k_lin(key))             # (bs, n_heads, k_length, dim_per_head)
        v = shape(self.v_lin(value))           # (bs, n_heads, k_length, dim_per_head)
        q = q / math.sqrt(dim_per_head)                     # (bs, n_heads, q_length, dim_per_head)
        scores = tf.matmul(q, k, transpose_b=True)          # (bs, n_heads, q_length, k_length)
        mask = tf.reshape(mask, mask_reshape)                           # (bs, n_heads, qlen, klen)
        # scores.masked_fill_(mask, -float('inf'))            # (bs, n_heads, q_length, k_length)
        scores = scores - 1e30 * (1.0 - mask)
        weights = tf.nn.softmax(scores, axis=-1)                              # (bs, n_heads, qlen, klen)
        weights = self.dropout(weights, training=training)                    # (bs, n_heads, qlen, klen)
        # Mask heads if we want to
        if head_mask is not None:
            weights = weights * head_mask
        context = tf.matmul(weights, v)                                    # (bs, n_heads, qlen, dim_per_head)
        context = unshape(context)             # (bs, q_length, dim)
        context = self.out_lin(context)        # (bs, q_length, dim)
        if self.output_attentions:
            return (context, weights)
        else:
            return (context,)
 class TFFFN(tf.keras.layers.Layer):
    def __init__(self, config, **kwargs):
        super(TFFFN, self).__init__(**kwargs)
        self.dropout = tf.keras.layers.Dropout(config.dropout)
        self.lin1 = tf.keras.layers.Dense(config.hidden_dim,
                                          kernel_initializer=get_initializer(config.initializer_range),
                                          name="lin1")
        self.lin2 = tf.keras.layers.Dense(config.dim,
                                          kernel_initializer=get_initializer(config.initializer_range),
                                          name="lin2")
        assert config.activation in ['relu', 'gelu'], "activation ({}) must be in ['relu', 'gelu']".format(config.activation)
        self.activation = tf.keras.layers.Activation(gelu) if config.activation=='gelu' else tf.keras.activations.relu
    def call(self, input, training=False):
        x = self.lin1(input)
        x = self.activation(x)
        x = self.lin2(x)
        x = self.dropout(x, training=training)
        return x
 class TFTransformerBlock(tf.keras.layers.Layer):
    def __init__(self, config, **kwargs):
        super(TFTransformerBlock, self).__init__(**kwargs)
        self.n_heads = config.n_heads
        self.dim = config.dim
        self.hidden_dim = config.hidden_dim
        self.dropout = tf.keras.layers.Dropout(config.dropout)
        self.activation = config.activation
        self.output_attentions = config.output_attentions
        assert config.dim % config.n_heads == 0
        self.attention = TFMultiHeadSelfAttention(config, name="attention")
        self.sa_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="sa_layer_norm")
        self.ffn = TFFFN(config, name="ffn")
        self.output_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="output_layer_norm")
    def call(self, inputs, training=False):  # removed: src_enc=None, src_len=None
        """
        Parameters
        ----------
        x: tf.Tensor(bs, seq_length, dim)
        attn_mask: tf.Tensor(bs, seq_length)
        Outputs
        -------
        sa_weights: tf.Tensor(bs, n_heads, seq_length, seq_length)
            The attention weights
        ffn_output: tf.Tensor(bs, seq_length, dim)
            The output of the transformer block contextualization.
        """
        x, attn_mask, head_mask = inputs
        # Self-Attention
        sa_output = self.attention([x, x, x, attn_mask, head_mask], training=training)
        if self.output_attentions:
            sa_output, sa_weights = sa_output                  # (bs, seq_length, dim), (bs, n_heads, seq_length, seq_length)
        else: # To handle these `output_attention` or `output_hidden_states` cases returning tuples
            # assert type(sa_output) == tuple
            sa_output = sa_output[0]
        sa_output = self.sa_layer_norm(sa_output + x)          # (bs, seq_length, dim)
        # Feed Forward Network
        ffn_output = self.ffn(sa_output, training=training)                             # (bs, seq_length, dim)
        ffn_output = self.output_layer_norm(ffn_output + sa_output)  # (bs, seq_length, dim)
        output = (ffn_output,)
        if self.output_attentions:
            output = (sa_weights,) + output
        return output
 class TFTransformer(tf.keras.layers.Layer):
    def __init__(self, config, **kwargs):
        super(TFTransformer, self).__init__(**kwargs)
        self.n_layers = config.n_layers
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states
        self.layer = [TFTransformerBlock(config, name='layer_._{}'.format(i))
                      for i in range(config.n_layers)]
    def call(self, inputs, training=False):
        """
        Parameters
        ----------
        x: tf.Tensor(bs, seq_length, dim)
            Input sequence embedded.
        attn_mask: tf.Tensor(bs, seq_length)
            Attention mask on the sequence.
        Outputs
        -------
        hidden_state: tf.Tensor(bs, seq_length, dim)
            Sequence of hiddens states in the last (top) layer
        all_hidden_states: Tuple[tf.Tensor(bs, seq_length, dim)]
            Tuple of length n_layers with the hidden states from each layer.
            Optional: only if output_hidden_states=True
        all_attentions: Tuple[tf.Tensor(bs, n_heads, seq_length, seq_length)]
            Tuple of length n_layers with the attention weights from each layer
            Optional: only if output_attentions=True
        """
        x, attn_mask, head_mask = inputs
        all_hidden_states = ()
        all_attentions = ()
        hidden_state = x
        for i, layer_module in enumerate(self.layer):
            if self.output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_state,)
            layer_outputs = layer_module([hidden_state, attn_mask, head_mask[i]], training=training)
            hidden_state = layer_outputs[-1]
            if self.output_attentions:
                assert len(layer_outputs) == 2
                attentions = layer_outputs[0]
                all_attentions = all_attentions + (attentions,)
            else:
                assert len(layer_outputs) == 1
        # Add last layer
        if self.output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_state,)
        outputs = (hidden_state,)
        if self.output_hidden_states:
            outputs = outputs + (all_hidden_states,)
        if self.output_attentions:
            outputs = outputs + (all_attentions,)
        return outputs  # last-layer hidden state, (all hidden states), (all attentions)
 class TFDistilBertMainLayer(tf.keras.layers.Layer):
    def __init__(self, config, **kwargs):
        super(TFDistilBertMainLayer, self).__init__(**kwargs)
        self.num_hidden_layers = config.num_hidden_layers
        self.embeddings = TFEmbeddings(config, name="embeddings")   # Embeddings
        self.transformer = TFTransformer(config, name="transformer") # Encoder
    def _resize_token_embeddings(self, new_num_tokens):
        raise NotImplementedError
    def _prune_heads(self, heads_to_prune):
        raise NotImplementedError
    def call(self, inputs, attention_mask=None, head_mask=None, training=False):
        if isinstance(inputs, (tuple, list)):
            input_ids = inputs[0]
            attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
            head_mask = inputs[2] if len(inputs) > 2 else head_mask
            assert len(inputs) <= 3, "Too many inputs."
        elif isinstance(inputs, dict):
            input_ids = inputs.get('input_ids')
            attention_mask = inputs.get('attention_mask', attention_mask)
            head_mask = inputs.get('head_mask', head_mask)
            assert len(inputs) <= 3, "Too many inputs."
        else:
            input_ids = inputs
        if attention_mask is None:
            attention_mask = tf.ones(shape_list(input_ids)) # (bs, seq_length)
        attention_mask = tf.cast(attention_mask, dtype=tf.float32)
        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        # attention_probs has shape bsz x n_heads x N x N
        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        if head_mask is not None:
            raise NotImplementedError
        else:
            head_mask = [None] * self.num_hidden_layers
        embedding_output = self.embeddings(input_ids)   # (bs, seq_length, dim)
        tfmr_output = self.transformer([embedding_output, attention_mask, head_mask], training=training)
        return tfmr_output # last-layer hidden-state, (all hidden_states), (all attentions)
 ### INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL ###
 class TFDistilBertPreTrainedModel(TFPreTrainedModel):
    """ An abstract class to handle weights initialization and
        a simple interface for downloading and loading pretrained models.
    """
    config_class = DistilBertConfig
    pretrained_model_archive_map = TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP
    load_pt_weights = load_distilbert_pt_weights_in_tf2
    base_model_prefix = "distilbert"
 DISTILBERT_START_DOCSTRING = r"""
    DistilBERT is a small, fast, cheap and light Transformer model
    trained by distilling Bert base. It has 40% less parameters than
    `bert-base-uncased`, runs 60% faster while preserving over 95% of
    Bert's performances as measured on the GLUE language understanding benchmark.
    Here are the differences between the interface of Bert and DistilBert:
    - DistilBert doesn't have `token_type_ids`, you don't need to indicate which token belongs to which segment. Just separate your segments with the separation token `tokenizer.sep_token` (or `[SEP]`)
    - DistilBert doesn't have options to select the input positions (`position_ids` input). This could be added if necessary though, just let's us know if you need this option.
    For more information on DistilBERT, please refer to our
    `detailed blog post`_
    This model is a tf.keras.Model `tf.keras.Model`_ sub-class. Use it as a regular TF 2.0 Keras Model and
    refer to the TF 2.0 documentation for all matter related to general usage and behavior.
    .. _`detailed blog post`:
        https://medium.com/huggingface/distilbert-8cf3380435b5
    .. _`tf.keras.Model`:
        https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model
    Note on the model inputs:
        TF 2.0 models accepts two formats as inputs:
            - having all inputs as keyword arguments (like PyTorch models), or
            - having all inputs as a list, tuple or dict in the first positional arguments.
        This second option is usefull when using `tf.keras.Model.fit()` method which currently requires having all the tensors in the first argument of the model call function: `model(inputs)`.
        If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument :
        - a single Tensor with input_ids only and nothing else: `model(inputs_ids)
        - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
            `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
        - a dictionary with one or several input Tensors associaed to the input names given in the docstring:
            `model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
    Parameters:
        config (:class:`~transformers.DistilBertConfig`): Model configuration class with all the parameters of the model. 
            Initializing with a config file does not load the weights associated with the model, only the configuration.
            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
 DISTILBERT_INPUTS_DOCSTRING = r"""
    Inputs:
        **input_ids** ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
            Indices of input sequence tokens in the vocabulary.
            The input sequences should start with `[CLS]` and end with `[SEP]` tokens.
            For now, ONLY BertTokenizer(`bert-base-uncased`) is supported and you should use this tokenizer when using DistilBERT.
        **attention_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
        **head_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
 """
@add_start_docstrings("The bare DistilBERT encoder/transformer outputing raw hidden-states without any specific head on top.",
                      DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
 class TFDistilBertModel(TFDistilBertPreTrainedModel):
    r"""
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)``
            Sequence of hidden-states at the output of the last layer of the model.
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
    Examples::
        import tensorflow as tf
        from transformers import DistilBertTokenizer, TFDistilBertModel
        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
        outputs = model(input_ids)
        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFDistilBertModel, self).__init__(config, *inputs, **kwargs)
        self.distilbert = TFDistilBertMainLayer(config, name="distilbert")   # Embeddings
    def call(self, inputs, **kwargs):
        outputs = self.distilbert(inputs, **kwargs)
        return outputs
 class TFDistilBertLMHead(tf.keras.layers.Layer):
    def __init__(self, config, input_embeddings, **kwargs):
        super(TFDistilBertLMHead, self).__init__(**kwargs)
        self.vocab_size = config.vocab_size
        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        self.input_embeddings = input_embeddings
    def build(self, input_shape):
        self.bias = self.add_weight(shape=(self.vocab_size,),
                                    initializer='zeros',
                                    trainable=True,
                                    name='bias')
        super(TFDistilBertLMHead, self).build(input_shape)
    def call(self, hidden_states):
        hidden_states = self.input_embeddings(hidden_states, mode="linear")
        hidden_states = hidden_states + self.bias
        return hidden_states
@add_start_docstrings("""DistilBert Model with a `masked language modeling` head on top. """,
                      DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
 class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
    r"""
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **prediction_scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
    Examples::
        import tensorflow as tf
        from transformers import DistilBertTokenizer, TFDistilBertForMaskedLM
        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        model = TFDistilBertForMaskedLM.from_pretrained('distilbert-base-uncased')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
        outputs = model(input_ids)
        prediction_scores = outputs[0]
    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFDistilBertForMaskedLM, self).__init__(config, *inputs, **kwargs)
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states
        self.vocab_size = config.vocab_size
        self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
        self.vocab_transform = tf.keras.layers.Dense(config.dim,
                                                     kernel_initializer=get_initializer(config.initializer_range),
                                                     name="vocab_transform")
        self.act = tf.keras.layers.Activation(gelu)
        self.vocab_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="vocab_layer_norm")
        self.vocab_projector = TFDistilBertLMHead(config, self.distilbert.embeddings, name="vocab_projector")
    def call(self, inputs, **kwargs):
        distilbert_output = self.distilbert(inputs, **kwargs)
        hidden_states = distilbert_output[0]                               # (bs, seq_length, dim)
        prediction_logits = self.vocab_transform(hidden_states)       # (bs, seq_length, dim)
        prediction_logits = self.act(prediction_logits)               # (bs, seq_length, dim)
        prediction_logits = self.vocab_layer_norm(prediction_logits)  # (bs, seq_length, dim)
        prediction_logits = self.vocab_projector(prediction_logits)
        outputs = (prediction_logits,) + distilbert_output[1:]
        return outputs  # logits, (hidden_states), (attentions)
@add_start_docstrings("""DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of
                         the pooled output) e.g. for GLUE tasks. """,
                      DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
 class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel):
    r"""
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **logits**: ``tf.Tensor`` of shape ``(batch_size, config.num_labels)``
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
    Examples::
        import tensorflow as tf
        from transformers import BertTokenizer, TFDistilBertForSequenceClassification
        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
        outputs = model(input_ids)
        logits = outputs[0]
    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFDistilBertForSequenceClassification, self).__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels
        self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
        self.pre_classifier = tf.keras.layers.Dense(config.dim,
                                                    kernel_initializer=get_initializer(config.initializer_range),
                                                    activation='relu',
                                                    name="pre_classifier")
        self.classifier = tf.keras.layers.Dense(config.num_labels,
                                                kernel_initializer=get_initializer(config.initializer_range),
                                                name="classifier")
        self.dropout = tf.keras.layers.Dropout(config.seq_classif_dropout)
    def call(self, inputs, **kwargs):
        distilbert_output = self.distilbert(inputs, **kwargs)
        hidden_state = distilbert_output[0]                    # (bs, seq_len, dim)
        pooled_output = hidden_state[:, 0]                    # (bs, dim)
        pooled_output = self.pre_classifier(pooled_output)   # (bs, dim)
        pooled_output = self.dropout(pooled_output, training=kwargs.get('training', False))         # (bs, dim)
        logits = self.classifier(pooled_output)              # (bs, dim)
        outputs = (logits,) + distilbert_output[1:]
        return outputs  # logits, (hidden_states), (attentions)
@add_start_docstrings("""DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
                         the hidden-states output to compute `span start logits` and `span end logits`). """,
                      DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
 class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel):
    r"""
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **start_scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length,)``
            Span-start scores (before SoftMax).
        **end_scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length,)``
            Span-end scores (before SoftMax).
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
    Examples::
        import tensorflow as tf
        from transformers import BertTokenizer, TFDistilBertForQuestionAnswering
        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        model = TFDistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
        outputs = model(input_ids)
        start_scores, end_scores = outputs[:2]
    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFDistilBertForQuestionAnswering, self).__init__(config, *inputs, **kwargs)
        self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
        self.qa_outputs = tf.keras.layers.Dense(config.num_labels,
                                                kernel_initializer=get_initializer(config.initializer_range),
                                                name='qa_outputs')
        assert config.num_labels == 2
        self.dropout = tf.keras.layers.Dropout(config.qa_dropout)
    def call(self, inputs, **kwargs):
        distilbert_output = self.distilbert(inputs, **kwargs)
        hidden_states = distilbert_output[0]                                 # (bs, max_query_len, dim)
        hidden_states = self.dropout(hidden_states, training=kwargs.get('training', False))                       # (bs, max_query_len, dim)
        logits = self.qa_outputs(hidden_states)                           # (bs, max_query_len, 2)
        start_logits, end_logits = tf.split(logits, 2, axis=-1)
        start_logits = tf.squeeze(start_logits, axis=-1)
        end_logits = tf.squeeze(end_logits, axis=-1)
        outputs = (start_logits, end_logits,) + distilbert_output[1:]
        return outputs  # start_logits, end_logits, (hidden_states), (attentions)
--- a/Show More
+++ b/Show More
		`@@ -0,0 +1,3 @@`
							`from .utils import InputExample, InputFeatures, DataProcessor`
							`from .glue import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features`