Merge branch 'master' of https://github.com/danai-antoniou/pytorch-transformers into add-duplicate-tokens-error

2019-10-02 07:38:50 +01:00
parent 2e6797cc7d 391db836ab
commit d73957899a
158 changed files with 12920 additions and 1552 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -1,33 +1,77 @@
 version: 2
 jobs:
-    build_py3:
+    build_py3_torch_and_tf:
-        working_directory: ~/pytorch-transformers
+        working_directory: ~/transformers
        docker:
            - image: circleci/python:3.5
        resource_class: xlarge
        parallelism: 1
        steps:
            - checkout
            - run: sudo pip install torch
            - run: sudo pip install tensorflow==2.0.0-rc0
            - run: sudo pip install --progress-bar off .
            - run: sudo pip install pytest codecov pytest-cov
            - run: sudo pip install tensorboardX scikit-learn
-            - run: python -m pytest -sv ./pytorch_transformers/tests/ --cov
+            - run: python -m pytest -sv ./transformers/tests/ --cov
            - run: codecov
    build_py3_torch:
        working_directory: ~/transformers
        docker:
            - image: circleci/python:3.5
        resource_class: xlarge
        parallelism: 1
        steps:
            - checkout
            - run: sudo pip install torch
            - run: sudo pip install --progress-bar off .
            - run: sudo pip install pytest codecov pytest-cov
            - run: sudo pip install tensorboardX scikit-learn
            - run: python -m pytest -sv ./transformers/tests/ --cov
            - run: python -m pytest -sv ./examples/
            - run: codecov
-    build_py2:
+    build_py3_tf:
-        working_directory: ~/pytorch-transformers
+        working_directory: ~/transformers
        docker:
            - image: circleci/python:3.5
        resource_class: xlarge
        parallelism: 1
        steps:
            - checkout
            - run: sudo pip install tensorflow==2.0.0-rc0
            - run: sudo pip install --progress-bar off .
            - run: sudo pip install pytest codecov pytest-cov
            - run: sudo pip install tensorboardX scikit-learn
            - run: python -m pytest -sv ./transformers/tests/ --cov
            - run: codecov
    build_py2_torch:
        working_directory: ~/transformers
        resource_class: large
        parallelism: 1
        docker:
            - image: circleci/python:2.7
        steps:
            - checkout
            - run: sudo pip install torch
            - run: sudo pip install --progress-bar off .
            - run: sudo pip install pytest codecov pytest-cov
-            - run: python -m pytest -sv ./pytorch_transformers/tests/ --cov
+            - run: python -m pytest -sv ./transformers/tests/ --cov
            - run: codecov
    build_py2_tf:
        working_directory: ~/transformers
        resource_class: large
        parallelism: 1
        docker:
            - image: circleci/python:2.7
        steps:
            - checkout
            - run: sudo pip install tensorflow==2.0.0-rc0
            - run: sudo pip install --progress-bar off .
            - run: sudo pip install pytest codecov pytest-cov
            - run: python -m pytest -sv ./transformers/tests/ --cov
            - run: codecov
    deploy_doc:
-        working_directory: ~/pytorch-transformers
+        working_directory: ~/transformers
        docker:
            - image: circleci/python:3.5
        steps:
@@ -37,7 +81,6 @@ jobs:
            - checkout
            - run: sudo pip install --progress-bar off -r docs/requirements.txt
            - run: sudo pip install --progress-bar off -r requirements.txt
            - run: cd docs/source && ln -s ../../examples/README.md examples.md && cd -
            - run: cd docs && make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html/* $doc:$dir
 workflow_filters: &workflow_filters
    filters:
@@ -48,6 +91,9 @@ workflows:
    version: 2
    build_and_test:
        jobs:
-            - build_py3
+            - build_py3_torch_and_tf
-            - build_py2
+            - build_py3_torch
            - build_py3_tf
            - build_py2_torch
            - build_py2_tf
            - deploy_doc: *workflow_filters
--- a/.coveragerc
+++ b/.coveragerc
@@ -1,5 +1,5 @@
 [run]
-source=pytorch_transformers
+source=transformers
 omit =
    # skip conversion scripts from testing for now
    */convert_*
--- a/.github/ISSUE_TEMPLATE/migration.md
+++ b/.github/ISSUE_TEMPLATE/migration.md
@@ -1,6 +1,6 @@
 ---
 name: "\U0001F4DA Migration from PyTorch-pretrained-Bert"
-about: Report a problem when migrating from PyTorch-pretrained-Bert to PyTorch-Transformers
+about: Report a problem when migrating from PyTorch-pretrained-Bert to Transformers
 ---
 ## 📚 Migration
--- a/.gitignore
+++ b/.gitignore
@@ -130,5 +130,5 @@ runs
 examples/runs
 # data
-data
+/data
 serialization_dir
--- a/README.md
+++ b/README.md
@@ -1,47 +1,86 @@
-# 👾 PyTorch-Transformers
+<p align="center">
    <br>
    <img src="https://raw.githubusercontent.com/huggingface/transformers/master/docs/source/imgs/transformers_logo_name.png" width="400"/>
    <br>
 </p>
 <p align="center">
    <a href="https://circleci.com/gh/huggingface/transformers">
        <img alt="Build" src="https://img.shields.io/circleci/build/github/huggingface/transformers/master">
    </a>
    <a href="https://github.com/huggingface/transformers/blob/master/LICENSE">
        <img alt="GitHub" src="https://img.shields.io/github/license/huggingface/transformers.svg?color=blue">
    </a>
    <a href="https://huggingface.co/transformers/index.html">
        <img alt="Documentation" src="https://img.shields.io/website/http/huggingface.co/transformers/index.html.svg?down_color=red&down_message=offline&up_message=online">
    </a>
    <a href="https://github.com/huggingface/transformers/releases">
        <img alt="GitHub release" src="https://img.shields.io/github/release/huggingface/transformers.svg">
    </a>
 </p>
-[![CircleCI](https://circleci.com/gh/huggingface/pytorch-transformers.svg?style=svg)](https://circleci.com/gh/huggingface/pytorch-transformers)
+<h3 align="center">
 <p>State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch
 </h3>
-PyTorch-Transformers (formerly known as `pytorch-pretrained-bert`) is a library of state-of-the-art pre-trained models for Natural Language Processing (NLP).
+🤗 Transformers (formerly known as `pytorch-transformers` and `pytorch-pretrained-bert`) provides state-of-the-art general-purpose architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet...) for Natural Language Understanding (NLU) and Natural Language Generation (NLG) with 32+ pretrained models in 100+ languages and deep interoperability between TensorFlow 2.0 and PyTorch.
-The library currently contains PyTorch implementations, pre-trained model weights, usage scripts and conversion utilities for the following models:
+### Features
-1. **[BERT](https://github.com/google-research/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
+- As easy to use as pytorch-transformers
-2. **[GPT](https://github.com/openai/finetune-transformer-lm)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
+- As powerful and concise as Keras
-3. **[GPT-2](https://blog.openai.com/better-language-models/)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
+- High performance on NLU and NLG tasks
-4. **[Transformer-XL](https://github.com/kimiyoung/transformer-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
+- Low barrier to entry for educators and practitioners
-5. **[XLNet](https://github.com/zihangdai/xlnet/)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
+
-6. **[XLM](https://github.com/facebookresearch/XLM/)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
+State-of-the-art NLP for everyone
-7. **[RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/roberta)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
+- Deep learning researchers
-8. **[DistilBERT](https://github.com/huggingface/pytorch-transformers/tree/master/examples/distillation)** (from HuggingFace), released together with the blogpost [Smaller, faster, cheaper, lighter: Introducing DistilBERT, a distilled version of BERT](https://medium.com/huggingface/distilbert-8cf3380435b5
+- Hands-on practitioners
-) by Victor Sanh, Lysandre Debut and Thomas Wolf.
+- AI/ML/NLP teachers and educators
 Lower compute costs, smaller carbon footprint
 - Researchers can share trained models instead of always retraining
 - Practitioners can reduce compute time and production costs
 - 8 architectures with over 30 pretrained models, some in more than 100 languages
 Choose the right framework for every part of a model's lifetime
 - Train state-of-the-art models in 3 lines of code
 - Deep interoperability between TensorFlow 2.0 and PyTorch models
 - Move a single model between TF2.0/PyTorch frameworks at will
 - Seamlessly pick the right framework for training, evaluation, production
 These implementations have been tested on several datasets (see the example scripts) and should match the performances of the original implementations (e.g. ~93 F1 on SQuAD for BERT Whole-Word-Masking, ~88 F1 on RocStories for OpenAI GPT, ~18.3 perplexity on WikiText 103 for Transformer-XL, ~0.916 Pearson R coefficient on STS-B for XLNet). You can find more details on the performances in the Examples section of the [documentation](https://huggingface.co/pytorch-transformers/examples.html).
 | Section | Description |
 |-|-|
 | [Installation](#installation) | How to install the package |
 | [Model architectures](#model-architectures) | Architectures (with pretrained weights) |
 | [Online demo](#online-demo) | Experimenting with this repo’s text generation capabilities |
 | [Quick tour: Usage](#quick-tour) | Tokenizers & models usage: Bert and GPT-2 |
 | [Quick tour: TF 2.0 and PyTorch ](#Quick-tour-TF-20-training-and-PyTorch-interoperability) | Train a TF 2.0 model in 10 lines of code, load it in PyTorch |
 | [Quick tour: Fine-tuning/usage scripts](#quick-tour-of-the-fine-tuningusage-scripts) | Using provided scripts: GLUE, SQuAD and Text generation |
-| [Migrating from pytorch-pretrained-bert to pytorch-transformers](#Migrating-from-pytorch-pretrained-bert-to-pytorch-transformers) | Migrating your code from pytorch-pretrained-bert to pytorch-transformers |
+| [Migrating from pytorch-transformers to transformers](#Migrating-from-pytorch-transformers-to-transformers) | Migrating your code from pytorch-transformers to transformers |
-| [Documentation](https://huggingface.co/pytorch-transformers/) | Full API documentation and more |
+| [Migrating from pytorch-pretrained-bert to transformers](#Migrating-from-pytorch-pretrained-bert-to-transformers) | Migrating your code from pytorch-pretrained-bert to transformers |
 | [Documentation](https://huggingface.co/transformers/) | Full API documentation and more |
 ## Installation
-This repo is tested on Python 2.7 and 3.5+ (examples are tested only on python 3.5+) and PyTorch 1.0.0+
+This repo is tested on Python 2.7 and 3.5+ (examples are tested only on python 3.5+), PyTorch 1.0.0+ and TensorFlow 2.0.0-rc1
 ### With pip
-PyTorch-Transformers can be installed by pip as follows:
+First you need to install one of, or both, TensorFlow 2.0 and PyTorch.
 Please refer to the [TensorFlow installation page](https://www.tensorflow.org/install/pip#tensorflow-2.0-rc-is-available) and/or [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) regarding the specific install command for your platform.
 When TensorFlow 2.0 and/or PyTorch has been installed, 🤗 Transformers can be installed using pip as follows:
 ```bash
-pip install pytorch-transformers
+pip install transformers
 ```
 ### From source
-Clone the repository and run:
+Here also, you first need to install one of, or both, TensorFlow 2.0 and PyTorch.
 Please refer to the [TensorFlow installation page](https://www.tensorflow.org/install/pip#tensorflow-2.0-rc-is-available) and/or [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) regarding the specific install command for your platform.
 When TensorFlow 2.0 and/or PyTorch has been installed, you can install from source by cloning the repository and running:
 ```bash
 pip install [--editable] .
@@ -49,14 +88,16 @@ pip install [--editable] .
 ### Tests
-A series of tests is included for the library and the example scripts. Library tests can be found in the [tests folder](https://github.com/huggingface/pytorch-transformers/tree/master/pytorch_transformers/tests) and examples tests in the [examples folder](https://github.com/huggingface/pytorch-transformers/tree/master/examples).
+A series of tests are included for the library and the example scripts. Library tests can be found in the [tests folder](https://github.com/huggingface/transformers/tree/master/transformers/tests) and examples tests in the [examples folder](https://github.com/huggingface/transformers/tree/master/examples).
 These tests can be run using `pytest` (install pytest if needed with `pip install pytest`).
 Depending on which framework is installed (TensorFlow 2.0 and/or PyTorch), the irrelevant tests will be skipped. Ensure that both frameworks are installed if you want to execute all tests.
 You can run the tests from the root of the cloned repository with the commands:
 ```bash
-python -m pytest -sv ./pytorch_transformers/tests/
+python -m pytest -sv ./transformers/tests/
 python -m pytest -sv ./examples/
 ```
@@ -66,8 +107,23 @@ You should check out our [`swift-coreml-transformers`](https://github.com/huggin
 It contains an example of a conversion script from a Pytorch trained Transformer model (here, `GPT-2`) to a CoreML model that runs on iOS devices.
-At some point in the future, you'll be able to seamlessly move from pre-training or fine-tuning models in PyTorch to productizing them in CoreML,
+At some point in the future, you'll be able to seamlessly move from pre-training or fine-tuning models to productizing them in CoreML, or prototype a model or an app in CoreML then research its hyperparameters or architecture from TensorFlow 2.0 and/or PyTorch. Super exciting!
-or prototype a model or an app in CoreML then research its hyperparameters or architecture from PyTorch. Super exciting!
+
 ## Model architectures
 🤗 Transformers currently provides 8 NLU/NLG architectures:
 1. **[BERT](https://github.com/google-research/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
 2. **[GPT](https://github.com/openai/finetune-transformer-lm)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
 3. **[GPT-2](https://blog.openai.com/better-language-models/)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
 4. **[Transformer-XL](https://github.com/kimiyoung/transformer-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
 5. **[XLNet](https://github.com/zihangdai/xlnet/)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
 6. **[XLM](https://github.com/facebookresearch/XLM/)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
 7. **[RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/roberta)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
 8. **[DistilBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation)** (from HuggingFace), released together with the blogpost [Smaller, faster, cheaper, lighter: Introducing DistilBERT, a distilled version of BERT](https://medium.com/huggingface/distilbert-8cf3380435b5
 ) by Victor Sanh, Lysandre Debut and Thomas Wolf.
 These implementations have been tested on several datasets (see the example scripts) and should match the performances of the original implementations (e.g. ~93 F1 on SQuAD for BERT Whole-Word-Masking, ~88 F1 on RocStories for OpenAI GPT, ~18.3 perplexity on WikiText 103 for Transformer-XL, ~0.916 Pearson R coefficient on STS-B for XLNet). You can find more details on the performances in the Examples section of the [documentation](https://huggingface.co/transformers/examples.html).
 ## Online demo
@@ -80,22 +136,25 @@ You can use it to experiment with completions generated by `GPT2Model`, `Transfo
 ## Quick tour
-Let's do a very quick overview of PyTorch-Transformers. Detailed examples for each model architecture (Bert, GPT, GPT-2, Transformer-XL, XLNet and XLM) can be found in the [full documentation](https://huggingface.co/pytorch-transformers/).
+Let's do a very quick overview of the model architectures in 🤗 Transformers. Detailed examples for each model architecture (Bert, GPT, GPT-2, Transformer-XL, XLNet and XLM) can be found in the [full documentation](https://huggingface.co/transformers/).
 ```python
 import torch
-from pytorch_transformers import *
+from transformers import *
-# PyTorch-Transformers has a unified API
+# Transformers has a unified API
-# for 7 transformer architectures and 30 pretrained weights.
+# for 8 transformer architectures and 30 pretrained weights.
 #          Model          | Tokenizer          | Pretrained weights shortcut
-MODELS = [(BertModel,       BertTokenizer,      'bert-base-uncased'),
+MODELS = [(BertModel,       BertTokenizer,       'bert-base-uncased'),
-          (OpenAIGPTModel,  OpenAIGPTTokenizer, 'openai-gpt'),
+          (OpenAIGPTModel,  OpenAIGPTTokenizer,  'openai-gpt'),
-          (GPT2Model,       GPT2Tokenizer,      'gpt2'),
+          (GPT2Model,       GPT2Tokenizer,       'gpt2'),
-          (TransfoXLModel,  TransfoXLTokenizer, 'transfo-xl-wt103'),
+          (TransfoXLModel,  TransfoXLTokenizer,  'transfo-xl-wt103'),
-          (XLNetModel,      XLNetTokenizer,     'xlnet-base-cased'),
+          (XLNetModel,      XLNetTokenizer,      'xlnet-base-cased'),
-          (XLMModel,        XLMTokenizer,       'xlm-mlm-enfr-1024'),
+          (XLMModel,        XLMTokenizer,        'xlm-mlm-enfr-1024'),
-          (RobertaModel,    RobertaTokenizer,   'roberta-base')]
+          (DistilBertModel, DistilBertTokenizer, 'distilbert-base-uncased'),
          (RobertaModel,    RobertaTokenizer,    'roberta-base')]
 # To use TensorFlow 2.0 versions of the models, simply prefix the class names with 'TF', e.g. `TFRobertaModel` is the TF 2.0 counterpart of the PyTorch model `RobertaModel`
 # Let's encode some text in a sequence of hidden-states using each model:
 for model_class, tokenizer_class, pretrained_weights in MODELS:
@@ -121,24 +180,71 @@ for model_class in BERT_MODEL_CLASSES:
    # Load pretrained model/tokenizer
    model = model_class.from_pretrained('bert-base-uncased')
-# Models can return full list of hidden-states & attentions weights at each layer
+    # Models can return full list of hidden-states & attentions weights at each layer
-model = model_class.from_pretrained(pretrained_weights,
+    model = model_class.from_pretrained(pretrained_weights,
-                                    output_hidden_states=True,
+                                        output_hidden_states=True,
-                                    output_attentions=True)
+                                        output_attentions=True)
-input_ids = torch.tensor([tokenizer.encode("Let's see all hidden-states and attentions on this text")])
+    input_ids = torch.tensor([tokenizer.encode("Let's see all hidden-states and attentions on this text")])
-all_hidden_states, all_attentions = model(input_ids)[-2:]
+    all_hidden_states, all_attentions = model(input_ids)[-2:]
-# Models are compatible with Torchscript
+    # Models are compatible with Torchscript
-model = model_class.from_pretrained(pretrained_weights, torchscript=True)
+    model = model_class.from_pretrained(pretrained_weights, torchscript=True)
-traced_model = torch.jit.trace(model, (input_ids,))
+    traced_model = torch.jit.trace(model, (input_ids,))
-# Simple serialization for models and tokenizers
+    # Simple serialization for models and tokenizers
-model.save_pretrained('./directory/to/save/')  # save
+    model.save_pretrained('./directory/to/save/')  # save
-model = model_class.from_pretrained('./directory/to/save/')  # re-load
+    model = model_class.from_pretrained('./directory/to/save/')  # re-load
-tokenizer.save_pretrained('./directory/to/save/')  # save
+    tokenizer.save_pretrained('./directory/to/save/')  # save
-tokenizer = tokenizer_class.from_pretrained('./directory/to/save/')  # re-load
+    tokenizer = BertTokenizer.from_pretrained('./directory/to/save/')  # re-load
-# SOTA examples for GLUE, SQUAD, text generation...
+    # SOTA examples for GLUE, SQUAD, text generation...
 ```
 ## Quick tour TF 2.0 training and PyTorch interoperability
 Let's do a quick example of how a TensorFlow 2.0 model can be trained in 12 lines of code with 🤗 Transformers and then loaded in PyTorch for fast inspection/tests.
 ```python
 import tensorflow as tf
 import tensorflow_datasets
 from transformers import *
 # Load dataset, tokenizer, model from pretrained model/vocabulary
 tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
 model = TFBertForSequenceClassification.from_pretrained('bert-base-cased')
 data = tensorflow_datasets.load('glue/mrpc')
 # Prepare dataset for GLUE as a tf.data.Dataset instance
 train_dataset = glue_convert_examples_to_features(data['train'], tokenizer, max_length=128, task='mrpc')
 valid_dataset = glue_convert_examples_to_features(data['validation'], tokenizer, max_length=128, task='mrpc')
 train_dataset = train_dataset.shuffle(100).batch(32).repeat(2)
 valid_dataset = valid_dataset.batch(64)
 # Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule 
 optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
 loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
 metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
 model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
 # Train and evaluate using tf.keras.Model.fit()
 history = model.fit(train_dataset, epochs=2, steps_per_epoch=115,
                    validation_data=valid_dataset, validation_steps=7)
 # Load the TensorFlow model in PyTorch for inspection
 model.save_pretrained('./save/')
 pytorch_model = BertForSequenceClassification.from_pretrained('./save/', from_tf=True)
 # Quickly test a few predictions - MRPC is a paraphrasing task, let's see if our model learned the task
 sentence_0 = "This research was consistent with his findings."
 sentence_1 = "His findings were compatible with this research."
 sentence_2 = "His findings were not compatible with this research."
 inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors='pt')
 inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors='pt')
 pred_1 = pytorch_model(**inputs_1)[0].argmax().item()
 pred_2 = pytorch_model(**inputs_2)[0].argmax().item()
 print("sentence_1 is", "a paraphrase" if pred_1 else "not a paraphrase", "of sentence_0")
 print("sentence_2 is", "a paraphrase" if pred_2 else "not a paraphrase", "of sentence_0")
 ```
 ## Quick tour of the fine-tuning/usage scripts
@@ -288,7 +394,7 @@ This is the model provided as `bert-large-uncased-whole-word-masking-finetuned-s
 ### `run_generation.py`: Text generation with GPT, GPT-2, Transformer-XL and XLNet
 A conditional generation script is also included to generate text from a prompt.
-The generation script includes the [tricks](https://github.com/rusiaaman/XLNet-gen#methodology) proposed by Aman Rusia to get high quality generation with memory models like Transformer-XL and XLNet (include a predefined text to make short inputs longer).
+The generation script includes the [tricks](https://github.com/rusiaaman/XLNet-gen#methodology) proposed by Aman Rusia to get high-quality generation with memory models like Transformer-XL and XLNet (include a predefined text to make short inputs longer).
 Here is how to run the script with the small version of OpenAI GPT-2 model:
@@ -299,19 +405,32 @@ python ./examples/run_generation.py \
    --model_name_or_path=gpt2 \
 ```
-## Migrating from pytorch-pretrained-bert to pytorch-transformers
+## Migrating from pytorch-transformers to transformers
-Here is a quick summary of what you should take care of when migrating from `pytorch-pretrained-bert` to `pytorch-transformers`
+Here is a quick summary of what you should take care of when migrating from `pytorch-transformers` to `transformers`.
 ### Positional order of some models' keywords inputs (`attention_mask`, `token_type_ids`...) changed
 To be able to use Torchscript (see #1010, #1204 and #1195) the specific order of some models **keywords inputs** (`attention_mask`, `token_type_ids`...) has been changed.
 If you used to call the models with keyword names for keyword arguments, e.g. `model(inputs_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)`, this should not cause any change.
 If you used to call the models with positional inputs for keyword arguments, e.g. `model(inputs_ids, attention_mask, token_type_ids)`, you may have to double check the exact order of input arguments.
 ## Migrating from pytorch-pretrained-bert to transformers
 Here is a quick summary of what you should take care of when migrating from `pytorch-pretrained-bert` to `transformers`.
 ### Models always output `tuples`
-The main breaking change when migrating from `pytorch-pretrained-bert` to `pytorch-transformers` is that the models forward method always outputs a `tuple` with various elements depending on the model and the configuration parameters.
+The main breaking change when migrating from `pytorch-pretrained-bert` to `transformers` is that the models forward method always outputs a `tuple` with various elements depending on the model and the configuration parameters.
-The exact content of the tuples for each model are detailed in the models' docstrings and the [documentation](https://huggingface.co/pytorch-transformers/).
+The exact content of the tuples for each model is detailed in the models' docstrings and the [documentation](https://huggingface.co/transformers/).
 In pretty much every case, you will be fine by taking the first element of the output as the output you previously used in `pytorch-pretrained-bert`.
-Here is a `pytorch-pretrained-bert` to `pytorch-transformers` conversion example for a `BertForSequenceClassification` classification model:
+Here is a `pytorch-pretrained-bert` to `transformers` conversion example for a `BertForSequenceClassification` classification model:
 ```python
 # Let's load our model
@@ -320,11 +439,11 @@ model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
 # If you used to have this line in pytorch-pretrained-bert:
 loss = model(input_ids, labels=labels)
-# Now just use this line in pytorch-transformers to extract the loss from the output tuple:
+# Now just use this line in transformers to extract the loss from the output tuple:
 outputs = model(input_ids, labels=labels)
 loss = outputs[0]
-# In pytorch-transformers you can also have access to the logits:
+# In transformers you can also have access to the logits:
 loss, logits = outputs[:2]
 # And even the attention weights if you configure the model to output them (and other outputs too, see the docstrings and documentation)
@@ -333,13 +452,17 @@ outputs = model(input_ids, labels=labels)
 loss, logits, attentions = outputs
 ```
 ### Using hidden states
 By enabling the configuration option `output_hidden_states`, it was possible to retrieve the last hidden states of the encoder. In `pytorch-transformers` as well as `transformers` the return value has changed slightly: `all_hidden_states` now also includes the hidden state of the embeddings in addition to those of the encoding layers. This allows users to easily access the embeddings final state.
 ### Serialization
-Breaking change in the `from_pretrained()`method:
+Breaking change in the `from_pretrained()` method:
 1. Models are now set in evaluation mode by default when instantiated with the `from_pretrained()` method. To train them don't forget to set them back in training mode (`model.train()`) to activate the dropout modules.
-2. The additional `*input` and `**kwargs` arguments supplied to the `from_pretrained()` method used to be directly passed to the underlying model's class `__init__()` method. They are now used to update the model configuration attribute instead which can break derived model classes build based on the previous `BertForSequenceClassification` examples. We are working on a way to mitigate this breaking change in [#866](https://github.com/huggingface/pytorch-transformers/pull/866) by forwarding the the model `__init__()` method (i) the provided positional arguments and (ii) the keyword arguments which do not match any configuration class attributes.
+2. The additional `*input` and `**kwargs` arguments supplied to the `from_pretrained()` method used to be directly passed to the underlying model's class `__init__()` method. They are now used to update the model configuration attribute instead which can break derived model classes built based on the previous `BertForSequenceClassification` examples. We are working on a way to mitigate this breaking change in [#866](https://github.com/huggingface/transformers/pull/866) by forwarding to the model `__init__()` method (i) the provided positional arguments and (ii) the keyword arguments which do not match any configuration class attributes.
 Also, while not a breaking change, the serialization methods have been standardized and you probably should switch to the new method `save_pretrained(save_directory)` if you were using any other serialization method before.
@@ -396,7 +519,7 @@ for batch in train_data:
    loss.backward()
    optimizer.step()
-### In PyTorch-Transformers, optimizer and schedules are splitted and instantiated like this:
+### In Transformers, optimizer and schedules are split and instantiated like this:
 optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False)  # To reproduce BertAdam specific behavior set correct_bias=False
 scheduler = WarmupLinearSchedule(optimizer, warmup_steps=num_warmup_steps, t_total=num_total_steps)  # PyTorch scheduler
 ### and used like this:
@@ -411,4 +534,4 @@ for batch in train_data:
 ## Citation
-At the moment, there is no paper associated to PyTorch-Transformers but we are working on preparing one. In the meantime, please include a mention of the library and a link to the present repository if you use this work in a published or open-source project.
+At the moment, there is no paper associated with Transformers but we are working on preparing one. In the meantime, please include a mention of the library and a link to the present repository if you use this work in a published or open-source project.
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -2,6 +2,6 @@ FROM pytorch/pytorch:latest
 RUN git clone https://github.com/NVIDIA/apex.git && cd apex && python setup.py install --cuda_ext --cpp_ext
-RUN pip install pytorch_transformers
+RUN pip install transformers
 WORKDIR /workspace
--- a/docs/README.md
+++ b/docs/README.md
@@ -34,11 +34,11 @@ pip install recommonmark
 ## Building the documentation
-Make sure that there is a symlink from the `example` file (in /examples) inside the source folder. Run the followig 
+Make sure that there is a symlink from the `example` file (in /examples) inside the source folder. Run the following 
 command to generate it:
 ```bash
-ln -s ../../examples/README.md source/examples.md
+ln -s ../../examples/README.md examples.md
 ```
 Once you have set up `sphinx`, you can build the documentation by running the following command in the `/docs` folder:
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -26,4 +26,7 @@ sphinxcontrib-jsmath==1.0.1
 sphinxcontrib-qthelp==1.0.2
 sphinxcontrib-serializinghtml==1.1.3
 urllib3==1.25.3
-sphinx-markdown-tables==0.0.9
+sphinx-markdown-tables==0.0.9
 numpy==1.17.2
 tensorflow==2.0.0rc2
 torch==1.2.0
--- a/docs/source/_static/js/custom.js
+++ b/docs/source/_static/js/custom.js
--- a/docs/source/bertology.rst
+++ b/docs/source/bertology.rst
@@ -15,4 +15,4 @@ In order to help this new field develop, we have included a few additional featu
 * accessing all the attention weights for each head of BERT/GPT/GPT-2,
 * retrieving heads output values and gradients to be able to compute head importance score and prune head as explained in https://arxiv.org/abs/1905.10650.
-To help you understand and use these features, we have added a specific example script: `bertology.py <https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_bertology.py>`_ while extract information and prune a model pre-trained on GLUE.
+To help you understand and use these features, we have added a specific example script: `bertology.py <https://github.com/huggingface/transformers/blob/master/examples/run_bertology.py>`_ which extracts information and prunes a model pre-trained on GLUE.
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -19,14 +19,14 @@ sys.path.insert(0, os.path.abspath('../..'))
 # -- Project information -----------------------------------------------------
-project = u'pytorch-transformers'
+project = u'transformers'
 copyright = u'2019, huggingface'
 author = u'huggingface'
 # The short X.Y version
 version = u''
 # The full version, including alpha/beta/rc tags
-release = u'1.2.0'
+release = u'2.0.0'
 # -- General configuration ---------------------------------------------------
@@ -109,7 +109,7 @@ html_static_path = ['_static']
 # -- Options for HTMLHelp output ---------------------------------------------
 # Output file base name for HTML help builder.
-htmlhelp_basename = 'pytorch-transformersdoc'
+htmlhelp_basename = 'transformersdoc'
 # -- Options for LaTeX output ------------------------------------------------
@@ -136,7 +136,7 @@ latex_elements = {
 # (source start file, target name, title,
 #  author, documentclass [howto, manual, or own class]).
 latex_documents = [
-    (master_doc, 'pytorch-transformers.tex', u'pytorch-transformers Documentation',
+    (master_doc, 'transformers.tex', u'transformers Documentation',
     u'huggingface', 'manual'),
 ]
@@ -146,7 +146,7 @@ latex_documents = [
 # One entry per manual page. List of tuples
 # (source start file, name, description, authors, manual section).
 man_pages = [
-    (master_doc, 'pytorch-transformers', u'pytorch-transformers Documentation',
+    (master_doc, 'transformers', u'transformers Documentation',
     [author], 1)
 ]
@@ -157,8 +157,8 @@ man_pages = [
 # (source start file, target name, title, author,
 #  dir menu entry, description, category)
 texinfo_documents = [
-    (master_doc, 'pytorch-transformers', u'pytorch-transformers Documentation',
+    (master_doc, 'transformers', u'transformers Documentation',
-     author, 'pytorch-transformers', 'One line description of project.',
+     author, 'transformers', 'One line description of project.',
     'Miscellaneous'),
 ]
--- a/docs/source/converting_tensorflow_models.rst
+++ b/docs/source/converting_tensorflow_models.rst
@@ -6,7 +6,7 @@ A command-line interface is provided to convert original Bert/GPT/GPT-2/Transfor
 BERT
 ^^^^
-You can convert any TensorFlow checkpoint for BERT (in particular `the pre-trained models released by Google <https://github.com/google-research/bert#pre-trained-models>`_\ ) in a PyTorch save file by using the `convert_tf_checkpoint_to_pytorch.py <https://github.com/huggingface/pytorch-transformers/blob/master/pytorch_transformers/convert_tf_checkpoint_to_pytorch.py>`_ script.
+You can convert any TensorFlow checkpoint for BERT (in particular `the pre-trained models released by Google <https://github.com/google-research/bert#pre-trained-models>`_\ ) in a PyTorch save file by using the `convert_tf_checkpoint_to_pytorch.py <https://github.com/huggingface/transformers/blob/master/transformers/convert_tf_checkpoint_to_pytorch.py>`_ script.
 This CLI takes as input a TensorFlow checkpoint (three files starting with ``bert_model.ckpt``\ ) and the associated configuration file (\ ``bert_config.json``\ ), and creates a PyTorch model for this configuration, loads the weights from the TensorFlow checkpoint in the PyTorch model and saves the resulting model in a standard PyTorch save file that can be imported using ``torch.load()`` (see examples in `run_bert_extract_features.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_extract_features.py>`_\ , `run_bert_classifier.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_classifier.py>`_ and `run_bert_squad.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_squad.py>`_\ ).
@@ -20,7 +20,7 @@ Here is an example of the conversion process for a pre-trained ``BERT-Base Uncas
   export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12
-   pytorch_transformers bert \
+   transformers bert \
     $BERT_BASE_DIR/bert_model.ckpt \
     $BERT_BASE_DIR/bert_config.json \
     $BERT_BASE_DIR/pytorch_model.bin
@@ -36,7 +36,7 @@ Here is an example of the conversion process for a pre-trained OpenAI GPT model,
   export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights
-   pytorch_transformers gpt \
+   transformers gpt \
     $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \
     $PYTORCH_DUMP_OUTPUT \
     [OPENAI_GPT_CONFIG]
@@ -50,7 +50,7 @@ Here is an example of the conversion process for a pre-trained OpenAI GPT-2 mode
   export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/gpt2/pretrained/weights
-   pytorch_transformers gpt2 \
+   transformers gpt2 \
     $OPENAI_GPT2_CHECKPOINT_PATH \
     $PYTORCH_DUMP_OUTPUT \
     [OPENAI_GPT2_CONFIG]
@@ -64,7 +64,7 @@ Here is an example of the conversion process for a pre-trained Transformer-XL mo
   export TRANSFO_XL_CHECKPOINT_FOLDER_PATH=/path/to/transfo/xl/checkpoint
-   pytorch_transformers transfo_xl \
+   transformers transfo_xl \
     $TRANSFO_XL_CHECKPOINT_FOLDER_PATH \
     $PYTORCH_DUMP_OUTPUT \
     [TRANSFO_XL_CONFIG]
@@ -80,7 +80,7 @@ Here is an example of the conversion process for a pre-trained XLNet model, fine
   export TRANSFO_XL_CHECKPOINT_PATH=/path/to/xlnet/checkpoint
   export TRANSFO_XL_CONFIG_PATH=/path/to/xlnet/config
-   pytorch_transformers xlnet \
+   transformers xlnet \
     $TRANSFO_XL_CHECKPOINT_PATH \
     $TRANSFO_XL_CONFIG_PATH \
     $PYTORCH_DUMP_OUTPUT \
@@ -96,6 +96,6 @@ Here is an example of the conversion process for a pre-trained XLM model:
   export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint
-   pytorch_transformers xlm \
+   transformers xlm \
     $XLM_CHECKPOINT_PATH \
     $PYTORCH_DUMP_OUTPUT \
--- a/docs/source/examples.md
+++ b/docs/source/examples.md
@@ -0,0 +1 @@
 ../../examples/README.md
--- a/docs/source/imgs/transformers_logo_name.png
+++ b/docs/source/imgs/transformers_logo_name.png
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -1,9 +1,43 @@
-Pytorch-Transformers
+Transformers
 ================================================================================================================================================
-PyTorch-Transformers is a library of state-of-the-art pre-trained models for Natural Language Processing (NLP).
+🤗 Transformers (formerly known as `pytorch-transformers` and `pytorch-pretrained-bert`) provides general-purpose architectures
 (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet...) for Natural Language Understanding (NLU) and Natural Language Generation
 (NLG) with 32+ pretrained models in 100+ languages and deep interoperability between TensorFlow 2.0 and PyTorch.
-The library currently contains PyTorch implementations, pre-trained model weights, usage scripts and conversion utilities for the following models:
+This is the documentation of our repository `transformers <https://github.com/huggingface/transformers>`__.
 Features
 ---------------------------------------------------
 - As easy to use as pytorch-transformers
 - As powerful and concise as Keras
 - High performance on NLU and NLG tasks
 - Low barrier to entry for educators and practitioners
 State-of-the-art NLP for everyone:
 - Deep learning researchers
 - Hands-on practitioners
 - AI/ML/NLP teachers and educators
 Lower compute costs, smaller carbon footprint:
 - Researchers can share trained models instead of always retraining
 - Practitioners can reduce compute time and production costs
 - 8 architectures with over 30 pretrained models, some in more than 100 languages
 Choose the right framework for every part of a model's lifetime:
 - Train state-of-the-art models in 3 lines of code
 - Deep interoperability between TensorFlow 2.0 and PyTorch models
 - Move a single model between TF2.0/PyTorch frameworks at will
 - Seamlessly pick the right framework for training, evaluation, production
 Contents
 ---------------------------------
 The library currently contains PyTorch and TensorFlow implementations, pre-trained model weights, usage scripts and conversion utilities for the following models:
 1. `BERT <https://github.com/google-research/bert>`_ (from Google) released with the paper `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding <https://arxiv.org/abs/1810.04805>`_ by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
 2. `GPT <https://github.com/openai/finetune-transformer-lm>`_ (from OpenAI) released with the paper `Improving Language Understanding by Generative Pre-Training <https://blog.openai.com/language-unsupervised>`_ by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
@@ -12,7 +46,8 @@ The library currently contains PyTorch implementations, pre-trained model weight
 5. `XLNet <https://github.com/zihangdai/xlnet>`_ (from Google/CMU) released with the paper `XLNet: Generalized Autoregressive Pretraining for Language Understanding <https://arxiv.org/abs/1906.08237>`_ by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
 6. `XLM <https://github.com/facebookresearch/XLM>`_ (from Facebook) released together with the paper `Cross-lingual Language Model Pretraining <https://arxiv.org/abs/1901.07291>`_ by Guillaume Lample and Alexis Conneau.
 7. `RoBERTa <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`_ (from Facebook), released together with the paper `RoBERTa: A Robustly Optimized BERT Pretraining Approach <https://arxiv.org/abs/1907.11692>`_ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
-8. `DistilBERT <https://huggingface.co/pytorch-transformers/model_doc/distilbert.html>`_ (from HuggingFace) released together with the blog post `Smaller, faster, cheaper, lighter: Introducing DistilBERT, a distilled version of BERT <https://medium.com/huggingface/distilbert-8cf3380435b5>`_ by Victor Sanh, Lysandre Debut and Thomas Wolf.
+8. `DistilBERT <https://huggingface.co/transformers/model_doc/distilbert.html>`_ (from HuggingFace) released together with the blog post `Smaller, faster, cheaper, lighter: Introducing DistilBERT, a distilled version of BERT <https://medium.com/huggingface/distilbert-8cf3380435b5>`_ by Victor Sanh, Lysandre Debut and Thomas Wolf.
 .. toctree::
    :maxdepth: 2
@@ -37,6 +72,7 @@ The library currently contains PyTorch implementations, pre-trained model weight
    main_classes/model
    main_classes/tokenizer
    main_classes/optimizer_schedules
    main_classes/processors
 .. toctree::
    :maxdepth: 2
--- a/docs/source/installation.rst
+++ b/docs/source/installation.rst
@@ -1,7 +1,7 @@
 Installation
 ================================================
-PyTorch-Transformers is tested on Python 2.7 and 3.5+ (examples are tested only on python 3.5+) and PyTorch 1.1.0
+Transformers is tested on Python 2.7 and 3.5+ (examples are tested only on python 3.5+) and PyTorch 1.1.0
 With pip
 ^^^^^^^^
@@ -10,7 +10,7 @@ PyTorch Transformers can be installed using pip as follows:
 .. code-block:: bash
-   pip install pytorch-transformers
+   pip install transformers
 From source
 ^^^^^^^^^^^
@@ -19,15 +19,15 @@ To install from source, clone the repository and install with:
 .. code-block:: bash
-    git clone https://github.com/huggingface/pytorch-transformers.git
+    git clone https://github.com/huggingface/transformers.git
-    cd pytorch-transformers
+    cd transformers
    pip install [--editable] .
 Tests
 ^^^^^
-An extensive test suite is included to test the library behavior and several examples. Library tests can be found in the `tests folder <https://github.com/huggingface/pytorch-transformers/tree/master/pytorch_transformers/tests>`_ and examples tests in the `examples folder <https://github.com/huggingface/pytorch-transformers/tree/master/examples>`_.
+An extensive test suite is included to test the library behavior and several examples. Library tests can be found in the `tests folder <https://github.com/huggingface/transformers/tree/master/transformers/tests>`_ and examples tests in the `examples folder <https://github.com/huggingface/transformers/tree/master/examples>`_.
 Tests can be run using `pytest` (install pytest if needed with `pip install pytest`).
@@ -35,7 +35,7 @@ Run all the tests from the root of the cloned repository with the commands:
 .. code-block:: bash
-    python -m pytest -sv ./pytorch_transformers/tests/
+    python -m pytest -sv ./transformers/tests/
    python -m pytest -sv ./examples/
--- a/docs/source/main_classes/configuration.rst
+++ b/docs/source/main_classes/configuration.rst
@@ -6,5 +6,5 @@ The base class ``PretrainedConfig`` implements the common methods for loading/sa
 ``PretrainedConfig``
 ~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.PretrainedConfig
+.. autoclass:: transformers.PretrainedConfig
    :members:
--- a/docs/source/main_classes/model.rst
+++ b/docs/source/main_classes/model.rst
@@ -11,5 +11,11 @@ The base class ``PreTrainedModel`` implements the common methods for loading/sav
 ``PreTrainedModel``
 ~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.PreTrainedModel
+.. autoclass:: transformers.PreTrainedModel
    :members:
 ``TFPreTrainedModel``
 ~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFPreTrainedModel
    :members:
--- a/docs/source/main_classes/optimizer_schedules.rst
+++ b/docs/source/main_classes/optimizer_schedules.rst
@@ -9,7 +9,7 @@ The ``.optimization`` module provides:
 ``AdamW``
 ~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.AdamW
+.. autoclass:: transformers.AdamW
    :members:
 Schedules
@@ -18,11 +18,11 @@ Schedules
 Learning Rate Schedules
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-.. autoclass:: pytorch_transformers.ConstantLRSchedule
+.. autoclass:: transformers.ConstantLRSchedule
    :members:
-.. autoclass:: pytorch_transformers.WarmupConstantSchedule
+.. autoclass:: transformers.WarmupConstantSchedule
    :members:
 .. image:: /imgs/warmup_constant_schedule.png
@@ -30,7 +30,7 @@ Learning Rate Schedules
    :alt:
-.. autoclass:: pytorch_transformers.WarmupCosineSchedule
+.. autoclass:: transformers.WarmupCosineSchedule
    :members:
 .. image:: /imgs/warmup_cosine_schedule.png
@@ -38,7 +38,7 @@ Learning Rate Schedules
    :alt:
-.. autoclass:: pytorch_transformers.WarmupCosineWithHardRestartsSchedule
+.. autoclass:: transformers.WarmupCosineWithHardRestartsSchedule
    :members:
 .. image:: /imgs/warmup_cosine_hard_restarts_schedule.png
@@ -47,7 +47,7 @@ Learning Rate Schedules
-.. autoclass:: pytorch_transformers.WarmupLinearSchedule
+.. autoclass:: transformers.WarmupLinearSchedule
    :members:
 .. image:: /imgs/warmup_linear_schedule.png
--- a/docs/source/main_classes/processors.rst
+++ b/docs/source/main_classes/processors.rst
@@ -0,0 +1,58 @@
 Processors
 ----------------------------------------------------
 This library includes processors for several traditional tasks. These processors can be used to process a dataset into
 examples that can be fed to a model.
 Processors
 ~~~~~~~~~~~~~~~~~~~~~
 All processors follow the same architecture which is that of the
 :class:`~transformers.data.processors.utils.DataProcessor`. The processor returns a list
 of :class:`~transformers.data.processors.utils.InputExample`. These
 :class:`~transformers.data.processors.utils.InputExample` can be converted to
 :class:`~transformers.data.processors.utils.InputFeatures` in order to be fed to the model.
 .. autoclass:: transformers.data.processors.utils.DataProcessor
    :members:
 .. autoclass:: transformers.data.processors.utils.InputExample
    :members:
 .. autoclass:: transformers.data.processors.utils.InputFeatures
    :members:
 GLUE
 ~~~~~~~~~~~~~~~~~~~~~
 `General Language Understanding Evaluation (GLUE) <https://gluebenchmark.com/>`__ is a benchmark that evaluates
 the performance of models across a diverse set of existing NLU tasks. It was released together with the paper
 `GLUE: A multi-task benchmark and analysis platform for natural language understanding <https://openreview.net/pdf?id=rJ4km2R5t7>`__.
 This library hosts a total of 10 processors for the following tasks: MRPC, MNLI, MNLI (mismatched),
 CoLA, SST2, STSB, QQP, QNLI, RTE and WNLI.
 Those processors are:
    - :class:`~transformers.data.processors.utils.MrpcProcessor`
    - :class:`~transformers.data.processors.utils.MnliProcessor`
    - :class:`~transformers.data.processors.utils.MnliMismatchedProcessor`
    - :class:`~transformers.data.processors.utils.ColaProcessor`
    - :class:`~transformers.data.processors.utils.Sst2Processor`
    - :class:`~transformers.data.processors.utils.StsbProcessor`
    - :class:`~transformers.data.processors.utils.QqpProcessor`
    - :class:`~transformers.data.processors.utils.QnliProcessor`
    - :class:`~transformers.data.processors.utils.RteProcessor`
    - :class:`~transformers.data.processors.utils.WnliProcessor`
 Additionally, the following method can be used to load values from a data file and convert them to a list of
 :class:`~transformers.data.processors.utils.InputExample`.
 .. automethod:: transformers.data.processors.glue.glue_convert_examples_to_features
 Example usage
 ^^^^^^^^^^^^^^^^^^^^^^^^^
 An example using these processors is given in the
 `run_glue.py <https://github.com/huggingface/transformers/blob/master/examples/run_glue.py>`__ script.
--- a/docs/source/main_classes/tokenizer.rst
+++ b/docs/source/main_classes/tokenizer.rst
@@ -12,5 +12,5 @@ The base class ``PreTrainedTokenizer`` implements the common methods for loading
 ``PreTrainedTokenizer``
 ~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.PreTrainedTokenizer
+.. autoclass:: transformers.PreTrainedTokenizer
    :members:
--- a/docs/source/migration.md
+++ b/docs/source/migration.md
@@ -1,17 +1,17 @@
 # Migrating from pytorch-pretrained-bert
-Here is a quick summary of what you should take care of when migrating from `pytorch-pretrained-bert` to `pytorch-transformers`
+Here is a quick summary of what you should take care of when migrating from `pytorch-pretrained-bert` to `transformers`
 ### Models always output `tuples`
-The main breaking change when migrating from `pytorch-pretrained-bert` to `pytorch-transformers` is that the models forward method always outputs a `tuple` with various elements depending on the model and the configuration parameters.
+The main breaking change when migrating from `pytorch-pretrained-bert` to `transformers` is that the models forward method always outputs a `tuple` with various elements depending on the model and the configuration parameters.
-The exact content of the tuples for each model are detailled in the models' docstrings and the [documentation](https://huggingface.co/pytorch-transformers/).
+The exact content of the tuples for each model is detailed in the models' docstrings and the [documentation](https://huggingface.co/transformers/).
 In pretty much every case, you will be fine by taking the first element of the output as the output you previously used in `pytorch-pretrained-bert`.
-Here is a `pytorch-pretrained-bert` to `pytorch-transformers` conversion example for a `BertForSequenceClassification` classification model:
+Here is a `pytorch-pretrained-bert` to `transformers` conversion example for a `BertForSequenceClassification` classification model:
 ```python
 # Let's load our model
@@ -20,11 +20,11 @@ model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
 # If you used to have this line in pytorch-pretrained-bert:
 loss = model(input_ids, labels=labels)
-# Now just use this line in pytorch-transformers to extract the loss from the output tuple:
+# Now just use this line in transformers to extract the loss from the output tuple:
 outputs = model(input_ids, labels=labels)
 loss = outputs[0]
-# In pytorch-transformers you can also have access to the logits:
+# In transformers you can also have access to the logits:
 loss, logits = outputs[:2]
 # And even the attention weights if you configure the model to output them (and other outputs too, see the docstrings and documentation)
@@ -96,7 +96,7 @@ for batch in train_data:
    loss.backward()
    optimizer.step()
-### In PyTorch-Transformers, optimizer and schedules are splitted and instantiated like this:
+### In Transformers, optimizer and schedules are split and instantiated like this:
 optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False)  # To reproduce BertAdam specific behavior set correct_bias=False
 scheduler = WarmupLinearSchedule(optimizer, warmup_steps=num_warmup_steps, t_total=num_total_steps)  # PyTorch scheduler
 ### and used like this:
--- a/docs/source/model_doc/auto.rst
+++ b/docs/source/model_doc/auto.rst
@@ -11,19 +11,19 @@ Instantiating one of ``AutoModel``, ``AutoConfig`` and ``AutoTokenizer`` will di
 ``AutoConfig``
 ~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.AutoConfig
+.. autoclass:: transformers.AutoConfig
    :members:
 ``AutoModel``
 ~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.AutoModel
+.. autoclass:: transformers.AutoModel
    :members:
 ``AutoTokenizer``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.AutoTokenizer
+.. autoclass:: transformers.AutoTokenizer
    :members:
--- a/docs/source/model_doc/bert.rst
+++ b/docs/source/model_doc/bert.rst
@@ -4,69 +4,125 @@ BERT
 ``BertConfig``
 ~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.BertConfig
+.. autoclass:: transformers.BertConfig
    :members:
 ``BertTokenizer``
 ~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.BertTokenizer
+.. autoclass:: transformers.BertTokenizer
    :members:
 ``BertModel``
 ~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.BertModel
+.. autoclass:: transformers.BertModel
    :members:
 ``BertForPreTraining``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.BertForPreTraining
+.. autoclass:: transformers.BertForPreTraining
    :members:
 ``BertForMaskedLM``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.BertForMaskedLM
+.. autoclass:: transformers.BertForMaskedLM
    :members:
 ``BertForNextSentencePrediction``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.BertForNextSentencePrediction
+.. autoclass:: transformers.BertForNextSentencePrediction
    :members:
 ``BertForSequenceClassification``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.BertForSequenceClassification
+.. autoclass:: transformers.BertForSequenceClassification
    :members:
 ``BertForMultipleChoice``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.BertForMultipleChoice
+.. autoclass:: transformers.BertForMultipleChoice
    :members:
 ``BertForTokenClassification``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.BertForTokenClassification
+.. autoclass:: transformers.BertForTokenClassification
    :members:
 ``BertForQuestionAnswering``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.BertForQuestionAnswering
+.. autoclass:: transformers.BertForQuestionAnswering
    :members:
 ``TFBertModel``
 ~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFBertModel
    :members:
 ``TFBertForPreTraining``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFBertForPreTraining
    :members:
 ``TFBertForMaskedLM``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFBertForMaskedLM
    :members:
 ``TFBertForNextSentencePrediction``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFBertForNextSentencePrediction
    :members:
 ``TFBertForSequenceClassification``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFBertForSequenceClassification
    :members:
 ``TFBertForMultipleChoice``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFBertForMultipleChoice
    :members:
 ``TFBertForTokenClassification``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFBertForTokenClassification
    :members:
 ``TFBertForQuestionAnswering``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFBertForQuestionAnswering
    :members:
--- a/docs/source/model_doc/distilbert.rst
+++ b/docs/source/model_doc/distilbert.rst
@@ -4,40 +4,67 @@ DistilBERT
 ``DistilBertConfig``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.DistilBertConfig
+.. autoclass:: transformers.DistilBertConfig
    :members:
 ``DistilBertTokenizer``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.DistilBertTokenizer
+.. autoclass:: transformers.DistilBertTokenizer
    :members:
 ``DistilBertModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.DistilBertModel
+.. autoclass:: transformers.DistilBertModel
    :members:
 ``DistilBertForMaskedLM``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.DistilBertForMaskedLM
+.. autoclass:: transformers.DistilBertForMaskedLM
    :members:
 ``DistilBertForSequenceClassification``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.DistilBertForSequenceClassification
+.. autoclass:: transformers.DistilBertForSequenceClassification
    :members:
 ``DistilBertForQuestionAnswering``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.DistilBertForQuestionAnswering
+.. autoclass:: transformers.DistilBertForQuestionAnswering
    :members:
 ``TFDistilBertModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFDistilBertModel
    :members:
 ``TFDistilBertForMaskedLM``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFDistilBertForMaskedLM
    :members:
 ``TFDistilBertForSequenceClassification``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFDistilBertForSequenceClassification
    :members:
 ``TFDistilBertForQuestionAnswering``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFDistilBertForQuestionAnswering
    :members:
--- a/docs/source/model_doc/gpt.rst
+++ b/docs/source/model_doc/gpt.rst
@@ -4,33 +4,54 @@ OpenAI GPT
 ``OpenAIGPTConfig``
 ~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.OpenAIGPTConfig
+.. autoclass:: transformers.OpenAIGPTConfig
    :members:
 ``OpenAIGPTTokenizer``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.OpenAIGPTTokenizer
+.. autoclass:: transformers.OpenAIGPTTokenizer
    :members:
 ``OpenAIGPTModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.OpenAIGPTModel
+.. autoclass:: transformers.OpenAIGPTModel
    :members:
 ``OpenAIGPTLMHeadModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.OpenAIGPTLMHeadModel
+.. autoclass:: transformers.OpenAIGPTLMHeadModel
    :members:
 ``OpenAIGPTDoubleHeadsModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.OpenAIGPTDoubleHeadsModel
+.. autoclass:: transformers.OpenAIGPTDoubleHeadsModel
    :members:
 ``TFOpenAIGPTModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFOpenAIGPTModel
    :members:
 ``TFOpenAIGPTLMHeadModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFOpenAIGPTLMHeadModel
    :members:
 ``TFOpenAIGPTDoubleHeadsModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFOpenAIGPTDoubleHeadsModel
    :members:
--- a/docs/source/model_doc/gpt2.rst
+++ b/docs/source/model_doc/gpt2.rst
@@ -4,33 +4,54 @@ OpenAI GPT2
 ``GPT2Config``
 ~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.GPT2Config
+.. autoclass:: transformers.GPT2Config
    :members:
 ``GPT2Tokenizer``
 ~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.GPT2Tokenizer
+.. autoclass:: transformers.GPT2Tokenizer
    :members:
 ``GPT2Model``
 ~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.GPT2Model
+.. autoclass:: transformers.GPT2Model
    :members:
 ``GPT2LMHeadModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.GPT2LMHeadModel
+.. autoclass:: transformers.GPT2LMHeadModel
    :members:
 ``GPT2DoubleHeadsModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.GPT2DoubleHeadsModel
+.. autoclass:: transformers.GPT2DoubleHeadsModel
    :members:
 ``TFGPT2Model``
 ~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFGPT2Model
    :members:
 ``TFGPT2LMHeadModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFGPT2LMHeadModel
    :members:
 ``TFGPT2DoubleHeadsModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFGPT2DoubleHeadsModel
    :members:
--- a/docs/source/model_doc/roberta.rst
+++ b/docs/source/model_doc/roberta.rst
@@ -4,33 +4,54 @@ RoBERTa
 ``RobertaConfig``
 ~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.RobertaConfig
+.. autoclass:: transformers.RobertaConfig
    :members:
 ``RobertaTokenizer``
 ~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.RobertaTokenizer
+.. autoclass:: transformers.RobertaTokenizer
    :members:
 ``RobertaModel``
 ~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.RobertaModel
+.. autoclass:: transformers.RobertaModel
    :members:
 ``RobertaForMaskedLM``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.RobertaForMaskedLM
+.. autoclass:: transformers.RobertaForMaskedLM
    :members:
 ``RobertaForSequenceClassification``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.RobertaForSequenceClassification
+.. autoclass:: transformers.RobertaForSequenceClassification
    :members:
 ``TFRobertaModel``
 ~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFRobertaModel
    :members:
 ``TFRobertaForMaskedLM``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFRobertaForMaskedLM
    :members:
 ``TFRobertaForSequenceClassification``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFRobertaForSequenceClassification
    :members:
--- a/docs/source/model_doc/transformerxl.rst
+++ b/docs/source/model_doc/transformerxl.rst
@@ -5,26 +5,40 @@ Transformer XL
 ``TransfoXLConfig``
 ~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.TransfoXLConfig
+.. autoclass:: transformers.TransfoXLConfig
    :members:
 ``TransfoXLTokenizer``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.TransfoXLTokenizer
+.. autoclass:: transformers.TransfoXLTokenizer
    :members:
 ``TransfoXLModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.TransfoXLModel
+.. autoclass:: transformers.TransfoXLModel
    :members:
 ``TransfoXLLMHeadModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.TransfoXLLMHeadModel
+.. autoclass:: transformers.TransfoXLLMHeadModel
    :members:
 ``TFTransfoXLModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFTransfoXLModel
    :members:
 ``TFTransfoXLLMHeadModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFTransfoXLLMHeadModel
    :members:
--- a/docs/source/model_doc/xlm.rst
+++ b/docs/source/model_doc/xlm.rst
@@ -4,38 +4,66 @@ XLM
 ``XLMConfig``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.XLMConfig
+.. autoclass:: transformers.XLMConfig
    :members:
 ``XLMTokenizer``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.XLMTokenizer
+.. autoclass:: transformers.XLMTokenizer
    :members:
 ``XLMModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.XLMModel
+.. autoclass:: transformers.XLMModel
    :members:
 ``XLMWithLMHeadModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.XLMWithLMHeadModel
+.. autoclass:: transformers.XLMWithLMHeadModel
    :members:
 ``XLMForSequenceClassification``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.XLMForSequenceClassification
+.. autoclass:: transformers.XLMForSequenceClassification
    :members:
 ``XLMForQuestionAnswering``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.XLMForQuestionAnswering
+.. autoclass:: transformers.XLMForQuestionAnswering
    :members:
 ``TFXLMModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFXLMModel
    :members:
 ``TFXLMWithLMHeadModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFXLMWithLMHeadModel
    :members:
 ``TFXLMForSequenceClassification``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFXLMForSequenceClassification
    :members:
 ``TFXLMForQuestionAnsweringSimple``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFXLMForQuestionAnsweringSimple
    :members:
--- a/docs/source/model_doc/xlnet.rst
+++ b/docs/source/model_doc/xlnet.rst
@@ -4,40 +4,68 @@ XLNet
 ``XLNetConfig``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.XLNetConfig
+.. autoclass:: transformers.XLNetConfig
    :members:
 ``XLNetTokenizer``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.XLNetTokenizer
+.. autoclass:: transformers.XLNetTokenizer
    :members:
 ``XLNetModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.XLNetModel
+.. autoclass:: transformers.XLNetModel
    :members:
 ``XLNetLMHeadModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.XLNetLMHeadModel
+.. autoclass:: transformers.XLNetLMHeadModel
    :members:
 ``XLNetForSequenceClassification``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.XLNetForSequenceClassification
+.. autoclass:: transformers.XLNetForSequenceClassification
    :members:
 ``XLNetForQuestionAnswering``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: pytorch_transformers.XLNetForQuestionAnswering
+.. autoclass:: transformers.XLNetForQuestionAnswering
    :members:
 ``TFXLNetModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFXLNetModel
    :members:
 ``TFXLNetLMHeadModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFXLNetLMHeadModel
    :members:
 ``TFXLNetForSequenceClassification``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFXLNetForSequenceClassification
    :members:
 ``TFXLNetForQuestionAnsweringSimple``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.TFXLNetForQuestionAnsweringSimple
    :members:
--- a/docs/source/notebooks.rst
+++ b/docs/source/notebooks.rst
@@ -1,16 +1,16 @@
 Notebooks
 ================================================
-We include `three Jupyter Notebooks <https://github.com/huggingface/pytorch-transformers/tree/master/notebooks>`_ that can be used to check that the predictions of the PyTorch model are identical to the predictions of the original TensorFlow model.
+We include `three Jupyter Notebooks <https://github.com/huggingface/transformers/tree/master/notebooks>`_ that can be used to check that the predictions of the PyTorch model are identical to the predictions of the original TensorFlow model.
 *
-  The first NoteBook (\ `Comparing-TF-and-PT-models.ipynb <https://github.com/huggingface/pytorch-transformers/blob/master/notebooks/Comparing-TF-and-PT-models.ipynb>`_\ ) extracts the hidden states of a full sequence on each layers of the TensorFlow and the PyTorch models and computes the standard deviation between them. In the given example, we get a standard deviation of 1.5e-7 to 9e-7 on the various hidden state of the models.
+  The first NoteBook (\ `Comparing-TF-and-PT-models.ipynb <https://github.com/huggingface/transformers/blob/master/notebooks/Comparing-TF-and-PT-models.ipynb>`_\ ) extracts the hidden states of a full sequence on each layer of the TensorFlow and the PyTorch models and computes the standard deviation between them. In the given example, we get a standard deviation of 1.5e-7 to 9e-7 on the various hidden states of the models.
 *
-  The second NoteBook (\ `Comparing-TF-and-PT-models-SQuAD.ipynb <https://github.com/huggingface/pytorch-transformers/blob/master/notebooks/Comparing-TF-and-PT-models-SQuAD.ipynb>`_\ ) compares the loss computed by the TensorFlow and the PyTorch models for identical initialization of the fine-tuning layer of the ``BertForQuestionAnswering`` and computes the standard deviation between them. In the given example, we get a standard deviation of 2.5e-7 between the models.
+  The second NoteBook (\ `Comparing-TF-and-PT-models-SQuAD.ipynb <https://github.com/huggingface/transformers/blob/master/notebooks/Comparing-TF-and-PT-models-SQuAD.ipynb>`_\ ) compares the loss computed by the TensorFlow and the PyTorch models for identical initialization of the fine-tuning layer of the ``BertForQuestionAnswering`` and computes the standard deviation between them. In the given example, we get a standard deviation of 2.5e-7 between the models.
 *
-  The third NoteBook (\ `Comparing-TF-and-PT-models-MLM-NSP.ipynb <https://github.com/huggingface/pytorch-transformers/blob/master/notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb>`_\ ) compares the predictions computed by the TensorFlow and the PyTorch models for masked token language modeling using the pre-trained masked language modeling model.
+  The third NoteBook (\ `Comparing-TF-and-PT-models-MLM-NSP.ipynb <https://github.com/huggingface/transformers/blob/master/notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb>`_\ ) compares the predictions computed by the TensorFlow and the PyTorch models for masked token language modeling using the pre-trained masked language modeling model.
 Please follow the instructions given in the notebooks to run and modify them.
--- a/docs/source/pretrained_models.rst
+++ b/docs/source/pretrained_models.rst
@@ -44,15 +44,15 @@ Here is the full list of the currently provided pretrained models together with
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``bert-large-uncased-whole-word-masking-finetuned-squad``  | | 24-layer, 1024-hidden, 16-heads, 340M parameters.                                                                                   |
 |                   |                                                            | | The ``bert-large-uncased-whole-word-masking`` model fine-tuned on SQuAD                                                             |
-|                   |                                                            | (see details of fine-tuning in the `example section <https://github.com/huggingface/pytorch-transformers/tree/master/examples>`__).   |
+|                   |                                                            | (see details of fine-tuning in the `example section <https://github.com/huggingface/transformers/tree/master/examples>`__).           |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``bert-large-cased-whole-word-masking-finetuned-squad``    | | 24-layer, 1024-hidden, 16-heads, 340M parameters                                                                                    |
 |                   |                                                            | | The ``bert-large-cased-whole-word-masking`` model fine-tuned on SQuAD                                                               |
-|                   |                                                            | (see `details of fine-tuning in the example section <https://huggingface.co/pytorch-transformers/examples.html>`__)                   |
+|                   |                                                            | (see `details of fine-tuning in the example section <https://huggingface.co/transformers/examples.html>`__)                           |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``bert-base-cased-finetuned-mrpc``                         | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
 |                   |                                                            | | The ``bert-base-cased`` model fine-tuned on MRPC                                                                                    |
-|                   |                                                            | (see `details of fine-tuning in the example section <https://huggingface.co/pytorch-transformers/examples.html>`__)                   |
+|                   |                                                            | (see `details of fine-tuning in the example section <https://huggingface.co/transformers/examples.html>`__)                           |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 | GPT               | ``openai-gpt``                                             | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
 |                   |                                                            | | OpenAI GPT English model                                                                                                            |
@@ -120,4 +120,4 @@ Here is the full list of the currently provided pretrained models together with
 |                   |                                                            | (see `details <https://medium.com/huggingface/distilbert-8cf3380435b5>`__)                                                            |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-.. <https://huggingface.co/pytorch-transformers/examples.html>`__
+.. <https://huggingface.co/transformers/examples.html>`__
--- a/docs/source/quickstart.md
+++ b/docs/source/quickstart.md
@@ -2,7 +2,7 @@
 ## Philosophy
-PyTorch-Transformers is an opinionated library built for NLP researchers seeking to use/study/extend large-scale transformers models.
+Transformers is an opinionated library built for NLP researchers seeking to use/study/extend large-scale transformers models.
 The library was designed with two strong goals in mind:
@@ -19,12 +19,12 @@ The library was designed with two strong goals in mind:
 A few other goals:
- expose the models internals as consistently as possible:
+- expose the models' internals as consistently as possible:
  - we give access, using a single API to the full hidden-states and attention weights,
  - tokenizer and base model's API are standardized to easily switch between models.
- incorporate a subjective selection of promising tools for fine-tuning/investiguating these models:
+- incorporate a subjective selection of promising tools for fine-tuning/investigating these models:
  - a simple/consistent way to add new tokens to the vocabulary and embeddings for fine-tuning,
  - simple ways to mask and prune transformer heads.
@@ -33,13 +33,13 @@ A few other goals:
 The library is build around three type of classes for each models:
- **model classes** which are PyTorch models (`torch.nn.Modules`) of the 6 models architectures currently provided in the library, e.g. `BertModel`
+- **model classes** which are PyTorch models (`torch.nn.Modules`) of the 8 model architectures currently provided in the library, e.g. `BertModel`
 - **configuration classes** which store all the parameters required to build a model, e.g. `BertConfig`. You don't always need to instantiate these your-self, in particular if you are using a pretrained model without any modification, creating the model will automatically take care of instantiating the configuration (which is part of the model)
 - **tokenizer classes** which store the vocabulary for each model and provide methods for encoding/decoding strings in list of token embeddings indices to be fed to a model, e.g. `BertTokenizer`
 All these classes can be instantiated from pretrained instances and saved locally using two methods:
- `from_pretrained()` let you instantiate a model/configuration/tokenizer from a pretrained version either provided by the library itself (currently 27 models are provided as listed [here](https://huggingface.co/pytorch-transformers/pretrained_models.html)) or stored locally (or on a server) by the user,
+- `from_pretrained()` lets you instantiate a model/configuration/tokenizer from a pretrained version either provided by the library itself (currently 27 models are provided as listed [here](https://huggingface.co/transformers/pretrained_models.html)) or stored locally (or on a server) by the user,
 - `save_pretrained()` let you save a model/configuration/tokenizer locally so that it can be reloaded using `from_pretrained()`.
 We'll finish this quickstart tour by going through a few simple quick-start examples to see how we can instantiate and use these classes. The rest of the documentation is organized in two parts:
@@ -51,7 +51,7 @@ We'll finish this quickstart tour by going through a few simple quick-start exam
 Here are two examples showcasing a few `Bert` and `GPT2` classes and pre-trained models.
-See full API reference for examples for each model classe.
+See full API reference for examples for each model class.
 ### BERT example
@@ -59,7 +59,7 @@ Let's start by preparing a tokenized input (a list of token embeddings indices t
 ```python
 import torch
-from pytorch_transformers import BertTokenizer, BertModel, BertForMaskedLM
+from transformers import BertTokenizer, BertModel, BertForMaskedLM
 # OPTIONAL: if you want to have more information on what's happening under the hood, activate the logger as follows
 import logging
@@ -93,8 +93,8 @@ Let's see how we can use `BertModel` to encode our inputs in hidden-states:
 # Load pre-trained model (weights)
 model = BertModel.from_pretrained('bert-base-uncased')
-# Set the model in evaluation mode to desactivate the DropOut modules
+# Set the model in evaluation mode to deactivate the DropOut modules
-# This is IMPORTANT to have reproductible results during evaluation!
+# This is IMPORTANT to have reproducible results during evaluation!
 model.eval()
 # If you have a GPU, put everything on cuda
@@ -106,7 +106,7 @@ model.to('cuda')
 with torch.no_grad():
    # See the models docstrings for the detail of the inputs
    outputs = model(tokens_tensor, token_type_ids=segments_tensors)
-    # PyTorch-Transformers models always output tuples.
+    # Transformers models always output tuples.
    # See the models docstrings for the detail of all the outputs
    # In our case, the first element is the hidden state of the last layer of the Bert model
    encoded_layers = outputs[0]
@@ -145,7 +145,7 @@ First let's prepare a tokenized input from our text string using `GPT2Tokenizer`
 ```python
 import torch
-from pytorch_transformers import GPT2Tokenizer, GPT2LMHeadModel
+from transformers import GPT2Tokenizer, GPT2LMHeadModel
 # OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
 import logging
@@ -168,8 +168,8 @@ Let's see how to use `GPT2LMHeadModel` to generate the next token following our
 # Load pre-trained model (weights)
 model = GPT2LMHeadModel.from_pretrained('gpt2')
-# Set the model in evaluation mode to desactivate the DropOut modules
+# Set the model in evaluation mode to deactivate the DropOut modules
-# This is IMPORTANT to have reproductible results during evaluation!
+# This is IMPORTANT to have reproducible results during evaluation!
 model.eval()
 # If you have a GPU, put everything on cuda
--- a/docs/source/serialization.rst
+++ b/docs/source/serialization.rst
@@ -45,7 +45,7 @@ where
    * ``bert_config.json`` or ``openai_gpt_config.json`` a configuration file for the model, and
    * ``pytorch_model.bin`` a PyTorch dump of a pre-trained instance of ``BertForPreTraining``\ , ``OpenAIGPTModel``\ , ``TransfoXLModel``\ , ``GPT2LMHeadModel`` (saved with the usual ``torch.save()``\ )
-  If ``PRE_TRAINED_MODEL_NAME_OR_PATH`` is a shortcut name, the pre-trained weights will be downloaded from AWS S3 (see the links `here <https://github.com/huggingface/pytorch-transformers/blob/master/pytorch_transformers/modeling_bert.py>`__\ ) and stored in a cache folder to avoid future download (the cache folder can be found at ``~/.pytorch_pretrained_bert/``\ ).
+  If ``PRE_TRAINED_MODEL_NAME_OR_PATH`` is a shortcut name, the pre-trained weights will be downloaded from AWS S3 (see the links `here <https://github.com/huggingface/transformers/blob/master/transformers/modeling_bert.py>`__\ ) and stored in a cache folder to avoid future download (the cache folder can be found at ``~/.pytorch_pretrained_bert/``\ ).
 *
  ``cache_dir`` can be an optional path to a specific directory to download and cache the pre-trained model weights. This option is useful in particular when you are using distributed training: to avoid concurrent access to the same weights you can set for example ``cache_dir='./pretrained_model_{}'.format(args.local_rank)`` (see the section on distributed training for more information).
@@ -122,7 +122,7 @@ Here is the recommended way of saving the model, configuration and vocabulary to
 .. code-block:: python
-   from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
+   from transformers import WEIGHTS_NAME, CONFIG_NAME
   output_dir = "./models/"
--- a/docs/source/torchscript.rst
+++ b/docs/source/torchscript.rst
@@ -12,7 +12,7 @@ According to Pytorch's documentation: "TorchScript is a way to create serializab
 Pytorch's two modules `JIT and TRACE <https://pytorch.org/docs/stable/jit.html>`_ allow the developer to export
 their model to be re-used in other programs, such as efficiency-oriented C++ programs.
-We have provided an interface that allows the export of `pytorch-transformers` models to TorchScript so that they can
+We have provided an interface that allows the export of `transformers` models to TorchScript so that they can
 be reused in a different environment than a Pytorch-based python program. Here we explain how to use our models so that
 they can be exported, and what to be mindful of when using these models with TorchScript.
@@ -74,7 +74,7 @@ according to a ``BertConfig`` class and then saved to disk under the filename ``
 .. code-block:: python
-    from pytorch_transformers import BertModel, BertTokenizer, BertConfig
+    from transformers import BertModel, BertTokenizer, BertConfig
    import torch
    enc = BertTokenizer.from_pretrained("bert-base-uncased")
--- a/examples/README.md
+++ b/examples/README.md
@@ -13,7 +13,7 @@ similar API between the different models.
 ## Language model fine-tuning
-Based on the script [`run_lm_finetuning.py`](https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_lm_finetuning.py).
+Based on the script [`run_lm_finetuning.py`](https://github.com/huggingface/transformers/blob/master/examples/run_lm_finetuning.py).
 Fine-tuning the library models for language modeling on a text dataset for GPT, GPT-2, BERT and RoBERTa (DistilBERT 
 to be added soon). GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa 
@@ -75,7 +75,7 @@ python run_lm_finetuning.py \
 ## Language generation
-Based on the script [`run_generation.py`](https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_generation.py).
+Based on the script [`run_generation.py`](https://github.com/huggingface/transformers/blob/master/examples/run_generation.py).
 Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, Transformer-XL and XLNet.
 A similar script is used for our official demo [Write With Transfomer](https://transformer.huggingface.co), where you
@@ -91,26 +91,26 @@ python run_generation.py \
 ## GLUE
-Based on the script [`run_glue.py`](https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_glue.py).
+Based on the script [`run_glue.py`](https://github.com/huggingface/transformers/blob/master/examples/run_glue.py).
 Fine-tuning the library models for sequence classification on the GLUE benchmark: [General Language Understanding 
 Evaluation](https://gluebenchmark.com/). This script can fine-tune the following models: BERT, XLM, XLNet and RoBERTa. 
 GLUE is made up of a total of 9 different tasks. We get the following results on the dev set of the benchmark with an
-uncased  BERT base model (the checkpoint `bert-base-uncased`). All experiments ran on 8  V100 GPUs with a total train
+uncased  BERT base model (the checkpoint `bert-base-uncased`). All experiments ran on 8 V100 GPUs with a total train
 batch size of 24. Some of these tasks have a small dataset and training can lead to high variance in the results
 between different runs. We report the median on 5 runs (with different seeds) for each of the metrics.
 | Task  | Metric                       | Result      |
 |-------|------------------------------|-------------|
-| CoLA  | Matthew's corr               | 55.75       |
+| CoLA  | Matthew's corr               | 48.87       |
-| SST-2 | Accuracy                     | 92.09       |
+| SST-2 | Accuracy                     | 91.74       |
-| MRPC  | F1/Accuracy                  | 90.48/86.27 |
+| MRPC  | F1/Accuracy                  | 90.70/86.27 |
-| STS-B | Person/Spearman corr.        | 89.03/88.64 |
+| STS-B | Pearson/Spearman corr.       | 91.39/91.04 |
-| QQP   | Accuracy/F1                  | 90.92/87.72 |
+| QQP   | Accuracy/F1                  | 90.79/87.66 |
-| MNLI  | Matched acc./Mismatched acc. | 83.74/84.06 |
+| MNLI  | Matched acc./Mismatched acc. | 83.70/84.83 |
-| QNLI  | Accuracy                     | 91.07       |
+| QNLI  | Accuracy                     | 89.31       |
-| RTE   | Accuracy                     | 68.59       |
+| RTE   | Accuracy                     | 71.43       |
 | WNLI  | Accuracy                     | 43.66       |
 Some of these results are significantly different from the ones reported on the test set
@@ -319,7 +319,7 @@ eval_loss = 0.44457291918821606
 ## SQuAD
-Based on the script [`run_squad.py`](https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_squad.py).
+Based on the script [`run_squad.py`](https://github.com/huggingface/transformers/blob/master/examples/run_squad.py).
 #### Fine-tuning on SQuAD
--- a/examples/contrib/run_openai_gpt.py
+++ b/examples/contrib/run_openai_gpt.py
@@ -39,7 +39,7 @@ import torch
 from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
-from pytorch_transformers import (OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer,
+from transformers import (OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer,
                                     AdamW, cached_path, WEIGHTS_NAME, CONFIG_NAME,
                                     WarmupLinearSchedule)
--- a/examples/contrib/run_swag.py
+++ b/examples/contrib/run_swag.py
@@ -35,10 +35,10 @@ from tqdm import tqdm, trange
 from tensorboardX import SummaryWriter
-from pytorch_transformers import (WEIGHTS_NAME, BertConfig,
+from transformers import (WEIGHTS_NAME, BertConfig,
                                  BertForMultipleChoice, BertTokenizer)
-from pytorch_transformers import AdamW, WarmupLinearSchedule
+from transformers import AdamW, WarmupLinearSchedule
 logger = logging.getLogger(__name__)
@@ -365,7 +365,7 @@ def train(args, train_dataset, model, tokenizer):
            #     inputs.update({'cls_index': batch[5],
            #                    'p_mask':       batch[6]})
            outputs = model(**inputs)
-            loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)
+            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
            if args.n_gpu > 1:
                loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training
@@ -647,7 +647,7 @@ def main():
        if args.eval_all_checkpoints:
            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
-            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce model loading logs
+            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce model loading logs
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
--- a/examples/contrib/run_transfo_xl.py
+++ b/examples/contrib/run_transfo_xl.py
@@ -28,7 +28,7 @@ import math
 import torch
-from pytorch_transformers import TransfoXLLMHeadModel, TransfoXLCorpus, TransfoXLTokenizer
+from transformers import TransfoXLLMHeadModel, TransfoXLCorpus, TransfoXLTokenizer
 logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
--- a/examples/distillation/README.md
+++ b/examples/distillation/README.md
@@ -2,25 +2,34 @@
 This folder contains the original code used to train DistilBERT as well as examples showcasing how to use DistilBERT.
 **2019, September 19th - Update:** We fixed bugs in the code and released an upadted version of the weights trained with a modification of the distillation loss. DistilBERT now reaches 97% of `BERT-base`'s performance on GLUE, and 86.9 F1 score on SQuAD v1.1 dev set (compared to 88.5 for `BERT-base`). We will publish a formal write-up of our approach in the near future!
 ## What is DistilBERT
-DistilBERT stands for Distillated-BERT. DistilBERT is a small, fast, cheap and light Transformer model based on Bert architecture. It has 40% less parameters than `bert-base-uncased`, runs 60% faster while preserving over 95% of Bert's performances as measured on the GLUE language understanding benchmark. DistilBERT is trained using knowledge distillation, a technique to compress a large model called the teacher into a smaller model called the student. By distillating Bert, we obtain a smaller Transformer model that bears a lot of similarities with the original BERT model while being lighter, smaller and faster to run. DistilBERT is thus an interesting option to put large-scaled trained Transformer model into production.
+DistilBERT stands for Distilled-BERT. DistilBERT is a small, fast, cheap and light Transformer model based on the BERT architecture. It has 40% fewer parameters than `bert-base-uncased` and runs 60% faster while preserving 97% of BERT's performance as measured on the GLUE language understanding benchmark. DistilBERT is trained using knowledge distillation, a technique to compress a large model called the teacher into a smaller model called the student. By distilling BERT, we obtain a smaller Transformer model that bears a lot of similarities with the original BERT model while being lighter, smaller and faster to run. DistilBERT is thus an interesting option for putting large-scale trained Transformer models into production.
 For more information on DistilBERT, please refer to our [detailed blog post](https://medium.com/huggingface/smaller-faster-cheaper-lighter-introducing-distilbert-a-distilled-version-of-bert-8cf3380435b5
-).
+). *Please note that we will publish a formal write-up with updated and more complete results in the near future (September 19th).*
 Here's the updated results on the dev sets of GLUE:
 | Model      | Macro-score | CoLA | MNLI | MRPC | QNLI | QQP | RTE | SST-2 | STS-B | WNLI |
 | :---:      |    :---:    | :---:| :---:| :---:| :---:| :---:| :---:| :---:| :---:| :---:|
 | BERT-base  |  **77.6**   | 48.9 | 84.3 | 88.6 | 89.3 | 89.5 | 71.3 | 91.7 | 91.2 | 43.7 |
 | DistilBERT |  **75.2**   | 49.1 | 81.8 | 90.2 | 87.0 | 89.2 | 62.9 | 92.7 | 90.7 | 44.4 |
 ## Setup
 This part of the library has only be tested with Python3.6+. There are few specific dependencies to install before launching a distillation, you can install them with the command `pip install -r requirements.txt`. 
-**Important note:** The training scripts have been updated to support PyTorch v1.2.0 (there are breakings changes compared to v1.1.0). It is important to note that there is a small internal bug in the current version of PyTorch available on pip that causes a memory leak in our training/distillation. It has been recently fixed and will likely be integrated into the next release. For the moment, we recommend to [compile PyTorch from source](https://github.com/pytorch/pytorch#from-source). Please refer to [issue 1179](https://github.com/huggingface/pytorch-transformers/issues/1179) for more details.
+**Important note:** The training scripts have been updated to support PyTorch v1.2.0 (there are breaking changes compared to v1.1.0). It is important to note that there is a small internal bug in the current version of PyTorch available on pip that causes a memory leak in our training/distillation. It has been recently fixed and will likely be integrated into the next release. For the moment, we recommend [compiling PyTorch from source](https://github.com/pytorch/pytorch#from-source). Please refer to [issue 1179](https://github.com/huggingface/transformers/issues/1179) for more details.
 ## How to use DistilBERT
-PyTorch-Transformers includes two pre-trained DistilBERT models, currently only provided for English (we are investigating the possibility to train and release a multilingual version of DistilBERT):
+Transformers includes two pre-trained DistilBERT models, currently only provided for English (we are investigating the possibility to train and release a multilingual version of DistilBERT):
 - `distilbert-base-uncased`: DistilBERT English language model pretrained on the same data used to pretrain Bert (concatenation of the Toronto Book Corpus and full English Wikipedia) using distillation with the supervision of the `bert-base-uncased` version of Bert. The model has 6 layers, 768 dimension and 12 heads, totalizing 66M parameters.
- `distilbert-base-uncased-distilled-squad`: A finetuned version of `distilbert-base-uncased` finetuned using (a second step of) knwoledge distillation on SQuAD 1.0. This model reaches a F1 score of 86.2 on the dev set (for comparison, Bert `bert-base-uncased` version reaches a 88.5 F1 score).
+- `distilbert-base-uncased-distilled-squad`: A version of `distilbert-base-uncased` fine-tuned using (a second step of) knowledge distillation on SQuAD 1.0. This model reaches an F1 score of 86.9 on the dev set (for comparison, Bert `bert-base-uncased` version reaches an 88.5 F1 score).
 Using DistilBERT is very similar to using BERT. DistilBERT share the same tokenizer as BERT's `bert-base-uncased` even though we provide a link to this tokenizer under the `DistilBertTokenizer` name to have a consistent naming between the library models.
--- a/examples/distillation/dataset.py
+++ b/examples/distillation/dataset.py
@@ -92,11 +92,11 @@ class Dataset:
        Too short sequences are simply removed. This could be tunedd.
        """
        init_size = len(self)
-        indices = self.lengths > 5
+        indices = self.lengths > 11
        self.token_ids = self.token_ids[indices]
        self.lengths = self.lengths[indices]
        new_size = len(self)
-        logger.info(f'Remove {init_size - new_size} too short (<=5 tokens) sequences.')
+        logger.info(f'Remove {init_size - new_size} too short (<=11 tokens) sequences.')
    def print_statistics(self):
        """
--- a/examples/distillation/distiller.py
+++ b/examples/distillation/distiller.py
@@ -18,15 +18,18 @@
 import os
 import math
 import psutil
 import time
 from tensorboardX import SummaryWriter
 from tqdm import trange, tqdm
 import numpy as np
 import psutil
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from torch.optim import AdamW
-from pytorch_transformers import AdamW, WarmupLinearSchedule
+from transformers import WarmupLinearSchedule
 from utils import logger
 from dataset import Dataset
@@ -58,10 +61,12 @@ class Distiller:
        self.alpha_ce = params.alpha_ce
        self.alpha_mlm = params.alpha_mlm
        self.alpha_mse = params.alpha_mse
        self.alpha_cos = params.alpha_cos
        assert self.alpha_ce >= 0.
        assert self.alpha_mlm >= 0.
        assert self.alpha_mse >= 0.
-        assert self.alpha_ce + self.alpha_mlm + self.alpha_mse > 0.
+        assert self.alpha_cos >= 0.
        assert self.alpha_ce + self.alpha_mlm + self.alpha_mse + self.alpha_cos > 0.
        self.mlm_mask_prop = params.mlm_mask_prop
        assert 0.0 <= self.mlm_mask_prop <= 1.0
@@ -81,17 +86,21 @@ class Distiller:
        self.last_loss = 0
        self.last_loss_ce = 0
        self.last_loss_mlm = 0
-        self.last_loss_mse = 0
+        if self.alpha_mse > 0.: self.last_loss_mse = 0
        if self.alpha_cos > 0.: self.last_loss_cos = 0
        self.last_log = 0
        self.ce_loss_fct = nn.KLDivLoss(reduction='batchmean')
        self.mlm_loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
-        self.mse_loss_fct = nn.MSELoss(reduction='sum')
+        if self.alpha_mse > 0.:
            self.mse_loss_fct = nn.MSELoss(reduction='sum')
        if self.alpha_cos > 0.:
            self.cosine_loss_fct = nn.CosineEmbeddingLoss(reduction='mean')
        logger.info('--- Initializing model optimizer')
        assert params.gradient_accumulation_steps >= 1
        self.num_steps_epoch = int(len(self.dataloader) / params.batch_size) + 1
        num_train_optimization_steps = int(self.num_steps_epoch / params.gradient_accumulation_steps * params.n_epoch) + 1
        warmup_steps = math.ceil(num_train_optimization_steps * params.warmup_prop)
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
@@ -104,9 +113,11 @@ class Distiller:
                               lr=params.learning_rate,
                               eps=params.adam_epsilon,
                               betas=(0.9, 0.98))
        warmup_steps = math.ceil(num_train_optimization_steps * params.warmup_prop)
        self.scheduler = WarmupLinearSchedule(self.optimizer,
-                                              warmup_steps=warmup_steps,
+                                                warmup_steps=warmup_steps,
-                                              t_total=num_train_optimization_steps)
+                                                t_total=num_train_optimization_steps)
        if self.fp16:
            try:
@@ -272,11 +283,14 @@ class Distiller:
        The real training loop.
        """
        if self.is_master: logger.info('Starting training')
        self.last_log = time.time()
        self.student.train()
        self.teacher.eval()
        for _ in range(self.params.n_epoch):
            if self.is_master: logger.info(f'--- Starting epoch {self.epoch}/{self.params.n_epoch-1}')
            if self.multi_gpu:
                torch.distributed.barrier()
            iter_bar = trange(self.num_steps_epoch, desc="-Iter", disable=self.params.local_rank not in [-1, 0])
            for __ in range(self.num_steps_epoch):
@@ -314,9 +328,9 @@ class Distiller:
        attention_mask: `torch.tensor(bs, seq_length)` - The attention mask for self attention.
        mlm_labels: `torch.tensor(bs, seq_length)` - The masked language modeling labels.
        """
-        s_logits = self.student(input_ids=input_ids, attention_mask=attention_mask)[0]     # (bs, seq_length, voc_size)
+        s_logits, s_hidden_states = self.student(input_ids=input_ids, attention_mask=attention_mask)     # (bs, seq_length, voc_size)
        with torch.no_grad():
-            t_logits = self.teacher(input_ids=input_ids, attention_mask=attention_mask)[0] # (bs, seq_length, voc_size)
+            t_logits, t_hidden_states = self.teacher(input_ids=input_ids, attention_mask=attention_mask) # (bs, seq_length, voc_size)
        assert s_logits.size() == t_logits.size()
        #https://github.com/peterliht/knowledge-distillation-pytorch/blob/master/model/net.py#L100
@@ -340,6 +354,22 @@ class Distiller:
        if self.alpha_mse > 0.:
            loss_mse = self.mse_loss_fct(s_logits_slct, t_logits_slct)/s_logits_slct.size(0) # Reproducing batchmean reduction
            loss += self.alpha_mse * loss_mse
        if self.alpha_cos > 0.:
            s_hidden_states = s_hidden_states[-1]                              # (bs, seq_length, dim)
            t_hidden_states = t_hidden_states[-1]                              # (bs, seq_length, dim)
            mask = attention_mask.unsqueeze(-1).expand_as(s_hidden_states)     # (bs, seq_length, dim)
            assert s_hidden_states.size() == t_hidden_states.size()
            dim = s_hidden_states.size(-1)
            s_hidden_states_slct = torch.masked_select(s_hidden_states, mask)        # (bs * seq_length * dim)
            s_hidden_states_slct = s_hidden_states_slct.view(-1, dim)                # (bs * seq_length, dim)
            t_hidden_states_slct = torch.masked_select(t_hidden_states, mask)        # (bs * seq_length * dim)
            t_hidden_states_slct = t_hidden_states_slct.view(-1, dim)                # (bs * seq_length, dim)
            target = s_hidden_states_slct.new(s_hidden_states_slct.size(0)).fill_(1) # (bs * seq_length,)
            loss_cos = self.cosine_loss_fct(s_hidden_states_slct, t_hidden_states_slct, target)
            loss += self.alpha_cos * loss_cos
        self.total_loss_epoch += loss.item()
        self.last_loss = loss.item()
@@ -348,6 +378,8 @@ class Distiller:
            self.last_loss_mlm = loss_mlm.item()
        if self.alpha_mse > 0.:
            self.last_loss_mse = loss_mse.item()
        if self.alpha_cos > 0.:
            self.last_loss_cos = loss_cos.item()
        self.optimize(loss)
@@ -396,6 +428,7 @@ class Distiller:
        if self.n_total_iter % self.params.log_interval == 0:
            self.log_tensorboard()
            self.last_log = time.time()
        if self.n_total_iter % self.params.checkpoint_interval == 0:
            self.save_checkpoint()
@@ -421,9 +454,12 @@ class Distiller:
            self.tensorboard.add_scalar(tag="losses/loss_mlm", scalar_value=self.last_loss_mlm, global_step=self.n_total_iter)
        if self.alpha_mse > 0.:
            self.tensorboard.add_scalar(tag="losses/loss_mse", scalar_value=self.last_loss_mse, global_step=self.n_total_iter)
        if self.alpha_cos > 0.:
            self.tensorboard.add_scalar(tag="losses/loss_cos", scalar_value=self.last_loss_cos, global_step=self.n_total_iter)
        self.tensorboard.add_scalar(tag="learning_rate/lr", scalar_value=self.scheduler.get_lr()[0], global_step=self.n_total_iter)
        self.tensorboard.add_scalar(tag="global/memory_usage", scalar_value=psutil.virtual_memory()._asdict()['used']/1_000_000, global_step=self.n_total_iter)
        self.tensorboard.add_scalar(tag="global/speed", scalar_value=time.time()-self.last_log, global_step=self.n_total_iter)
    def end_epoch(self):
        """
--- a/examples/distillation/requirements.txt
+++ b/examples/distillation/requirements.txt
@@ -2,3 +2,5 @@ gitpython==3.0.2
 tensorboard>=1.14.0
 tensorboardX==1.8
 psutil==5.6.3
 scipy==1.3.1
 pytorch_transformers==1.2.0
--- a/examples/distillation/scripts/binarized_data.py
+++ b/examples/distillation/scripts/binarized_data.py
@@ -20,7 +20,7 @@ import pickle
 import random
 import time
 import numpy as np
-from pytorch_transformers import BertTokenizer
+from transformers import BertTokenizer, RobertaTokenizer
 import logging
 logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
@@ -32,16 +32,21 @@ def main():
    parser = argparse.ArgumentParser(description="Preprocess the data to avoid re-doing it several times by (tokenization + token_to_ids).")
    parser.add_argument('--file_path', type=str, default='data/dump.txt',
                        help='The path to the data.')
-    parser.add_argument('--bert_tokenizer', type=str, default='bert-base-uncased',
+    parser.add_argument('--tokenizer_type', type=str, default='bert', choices=['bert', 'roberta'])
    parser.add_argument('--tokenizer_name', type=str, default='bert-base-uncased',
                        help="The tokenizer to use.")
    parser.add_argument('--dump_file', type=str, default='data/dump',
                        help='The dump file prefix.')
    args = parser.parse_args()
-    logger.info(f'Loading Tokenizer ({args.bert_tokenizer})')
+    logger.info(f'Loading Tokenizer ({args.tokenizer_name})')
-    bert_tokenizer = BertTokenizer.from_pretrained(args.bert_tokenizer)
+    if args.tokenizer_type == 'bert':
-
+        tokenizer = BertTokenizer.from_pretrained(args.tokenizer_name)
    elif args.tokenizer_type == 'roberta':
        tokenizer = RobertaTokenizer.from_pretrained(args.tokenizer_name)
    bos = tokenizer.special_tokens_map['bos_token'] # `[CLS]` for bert, `<s>` for roberta
    sep = tokenizer.special_tokens_map['sep_token'] # `[SEP]` for bert, `</s>` for roberta
    logger.info(f'Loading text from {args.file_path}')
    with open(args.file_path, 'r', encoding='utf8') as fp:
@@ -56,8 +61,8 @@ def main():
    interval = 10000
    start = time.time()
    for text in data:
-        text = f'[CLS] {text.strip()} [SEP]'
+        text = f'{bos} {text.strip()} {sep}'
-        token_ids = bert_tokenizer.encode(text)
+        token_ids = tokenizer.encode(text)
        rslt.append(token_ids)
        iter += 1
@@ -69,7 +74,7 @@ def main():
    logger.info(f'{len(data)} examples processed.')
-    dp_file = f'{args.dump_file}.{args.bert_tokenizer}.pickle'
+    dp_file = f'{args.dump_file}.{args.tokenizer_name}.pickle'
    rslt_ = [np.uint16(d) for d in rslt]
    random.shuffle(rslt_)
    logger.info(f'Dump to {dp_file}')
--- a/examples/distillation/scripts/extract_for_distil.py
+++ b/examples/distillation/scripts/extract_for_distil.py
@@ -15,59 +15,73 @@
 """
 Preprocessing script before training DistilBERT.
 """
-from pytorch_transformers import BertForPreTraining
+from transformers import BertForMaskedLM, RobertaForMaskedLM
 import torch
 import argparse
 if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description="Extraction some layers of the full BertForPreTraining for Transfer Learned Distillation")
+    parser = argparse.ArgumentParser(description="Extraction some layers of the full BertForMaskedLM or RObertaForMaskedLM for Transfer Learned Distillation")
-    parser.add_argument("--bert_model", default='bert-base-uncased', type=str)
+    parser.add_argument("--model_type", default="bert", choices=["bert", "roberta"])
-    parser.add_argument("--dump_checkpoint", default='serialization_dir/transfer_learning_checkpoint_0247911.pth', type=str)
+    parser.add_argument("--model_name", default='bert-base-uncased', type=str)
    parser.add_argument("--dump_checkpoint", default='serialization_dir/tf_bert-base-uncased_0247911.pth', type=str)
    parser.add_argument("--vocab_transform", action='store_true')
    args = parser.parse_args()
-    model = BertForPreTraining.from_pretrained(args.bert_model)
+    if args.model_type == 'bert':
        model = BertForMaskedLM.from_pretrained(args.model_name)
        prefix = 'bert'
    elif args.model_type == 'roberta':
        model = RobertaForMaskedLM.from_pretrained(args.model_name)
        prefix = 'roberta'
    state_dict = model.state_dict()
    compressed_sd = {}
    for w in ['word_embeddings', 'position_embeddings']:
        compressed_sd[f'distilbert.embeddings.{w}.weight'] = \
-            state_dict[f'bert.embeddings.{w}.weight']
+            state_dict[f'{prefix}.embeddings.{w}.weight']
    for w in ['weight', 'bias']:
        compressed_sd[f'distilbert.embeddings.LayerNorm.{w}'] = \
-            state_dict[f'bert.embeddings.LayerNorm.{w}']
+            state_dict[f'{prefix}.embeddings.LayerNorm.{w}']
    std_idx = 0
    for teacher_idx in [0, 2, 4, 7, 9, 11]:
        for w in ['weight', 'bias']:
            compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.q_lin.{w}'] = \
-                state_dict[f'bert.encoder.layer.{teacher_idx}.attention.self.query.{w}']
+                state_dict[f'{prefix}.encoder.layer.{teacher_idx}.attention.self.query.{w}']
            compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.k_lin.{w}'] = \
-                state_dict[f'bert.encoder.layer.{teacher_idx}.attention.self.key.{w}']
+                state_dict[f'{prefix}.encoder.layer.{teacher_idx}.attention.self.key.{w}']
            compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.v_lin.{w}'] = \
-                state_dict[f'bert.encoder.layer.{teacher_idx}.attention.self.value.{w}']
+                state_dict[f'{prefix}.encoder.layer.{teacher_idx}.attention.self.value.{w}']
            compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.out_lin.{w}'] = \
-                state_dict[f'bert.encoder.layer.{teacher_idx}.attention.output.dense.{w}']
+                state_dict[f'{prefix}.encoder.layer.{teacher_idx}.attention.output.dense.{w}']
            compressed_sd[f'distilbert.transformer.layer.{std_idx}.sa_layer_norm.{w}'] = \
-                state_dict[f'bert.encoder.layer.{teacher_idx}.attention.output.LayerNorm.{w}']
+                state_dict[f'{prefix}.encoder.layer.{teacher_idx}.attention.output.LayerNorm.{w}']
            compressed_sd[f'distilbert.transformer.layer.{std_idx}.ffn.lin1.{w}'] = \
-                state_dict[f'bert.encoder.layer.{teacher_idx}.intermediate.dense.{w}']
+                state_dict[f'{prefix}.encoder.layer.{teacher_idx}.intermediate.dense.{w}']
            compressed_sd[f'distilbert.transformer.layer.{std_idx}.ffn.lin2.{w}'] = \
-                state_dict[f'bert.encoder.layer.{teacher_idx}.output.dense.{w}']
+                state_dict[f'{prefix}.encoder.layer.{teacher_idx}.output.dense.{w}']
            compressed_sd[f'distilbert.transformer.layer.{std_idx}.output_layer_norm.{w}'] = \
-                state_dict[f'bert.encoder.layer.{teacher_idx}.output.LayerNorm.{w}']
+                state_dict[f'{prefix}.encoder.layer.{teacher_idx}.output.LayerNorm.{w}']
        std_idx += 1
-    compressed_sd[f'vocab_projector.weight'] = state_dict[f'cls.predictions.decoder.weight']
+    if args.model_type == 'bert':
-    compressed_sd[f'vocab_projector.bias'] = state_dict[f'cls.predictions.bias']
+        compressed_sd[f'vocab_projector.weight'] = state_dict[f'cls.predictions.decoder.weight']
-    if args.vocab_transform:
+        compressed_sd[f'vocab_projector.bias'] = state_dict[f'cls.predictions.bias']
-        for w in ['weight', 'bias']:
+        if args.vocab_transform:
-            compressed_sd[f'vocab_transform.{w}'] = state_dict[f'cls.predictions.transform.dense.{w}']
+            for w in ['weight', 'bias']:
-            compressed_sd[f'vocab_layer_norm.{w}'] = state_dict[f'cls.predictions.transform.LayerNorm.{w}']
+                compressed_sd[f'vocab_transform.{w}'] = state_dict[f'cls.predictions.transform.dense.{w}']
                compressed_sd[f'vocab_layer_norm.{w}'] = state_dict[f'cls.predictions.transform.LayerNorm.{w}']
    elif args.model_type == 'roberta':
        compressed_sd[f'vocab_projector.weight'] = state_dict[f'lm_head.decoder.weight']
        compressed_sd[f'vocab_projector.bias'] = state_dict[f'lm_head.bias']
        if args.vocab_transform:
            for w in ['weight', 'bias']:
                compressed_sd[f'vocab_transform.{w}'] = state_dict[f'lm_head.dense.{w}']
                compressed_sd[f'vocab_layer_norm.{w}'] = state_dict[f'lm_head.layer_norm.{w}']
    print(f'N layers selected for distillation: {std_idx}')
    print(f'Number of params transfered for distillation: {len(compressed_sd.keys())}')
--- a/examples/distillation/train.py
+++ b/examples/distillation/train.py
@@ -23,8 +23,8 @@ import shutil
 import numpy as np
 import torch
-from pytorch_transformers import BertTokenizer, BertForMaskedLM
+from transformers import BertTokenizer, BertForMaskedLM, RobertaTokenizer, RobertaForMaskedLM
-from pytorch_transformers import DistilBertForMaskedLM, DistilBertConfig
+from transformers import DistilBertForMaskedLM, DistilBertConfig
 from distiller import Distiller
 from utils import git_log, logger, init_gpu_params, set_seed
@@ -70,8 +70,10 @@ def main():
                        help="Load student initialization checkpoint.")
    parser.add_argument("--from_pretrained_config", default=None, type=str,
                        help="Load student initialization architecture config.")
-    parser.add_argument("--bert_model", default='bert-base-uncased', type=str,
+    parser.add_argument("--teacher_type", default="bert", choices=["bert", "roberta"],
-                        help="The teacher BERT model.")
+                        help="Teacher type (BERT, RoBERTa).")
    parser.add_argument("--teacher_name", default="bert-base-uncased", type=str,
                        help="The teacher model.")
    parser.add_argument("--temperature", default=2., type=float,
                        help="Temperature for the softmax temperature.")
@@ -81,6 +83,8 @@ def main():
                        help="Linear weight for the MLM loss. Must be >=0.")
    parser.add_argument("--alpha_mse", default=0.0, type=float,
                        help="Linear weight of the MSE loss. Must be >=0.")
    parser.add_argument("--alpha_cos", default=0.0, type=float,
                        help="Linear weight of the cosine embedding loss. Must be >=0.")
    parser.add_argument("--mlm_mask_prop", default=0.15, type=float,
                        help="Proportion of tokens for which we need to make a prediction.")
    parser.add_argument("--word_mask", default=0.8, type=float,
@@ -165,11 +169,14 @@ def main():
    ### TOKENIZER ###
-    bert_tokenizer = BertTokenizer.from_pretrained(args.bert_model)
+    if args.teacher_type == 'bert':
        tokenizer = BertTokenizer.from_pretrained(args.teacher_name)
    elif args.teacher_type == 'roberta':
        tokenizer = RobertaTokenizer.from_pretrained(args.teacher_name)
    special_tok_ids = {}
-    for tok_name, tok_symbol in bert_tokenizer.special_tokens_map.items():
+    for tok_name, tok_symbol in tokenizer.special_tokens_map.items():
-        idx = bert_tokenizer.all_special_tokens.index(tok_symbol)
+        idx = tokenizer.all_special_tokens.index(tok_symbol)
-        special_tok_ids[tok_name] = bert_tokenizer.all_special_ids[idx]
+        special_tok_ids[tok_name] = tokenizer.all_special_ids[idx]
    logger.info(f'Special tokens {special_tok_ids}')
    args.special_tok_ids = special_tok_ids
@@ -197,16 +204,17 @@ def main():
    ## STUDENT ##
    if args.from_pretrained_weights is not None:
-        assert os.path.isfile(os.path.join(args.from_pretrained_weights))
+        assert os.path.isfile(args.from_pretrained_weights)
-        assert os.path.isfile(os.path.join(args.from_pretrained_config))
+        assert os.path.isfile(args.from_pretrained_config)
        logger.info(f'Loading pretrained weights from {args.from_pretrained_weights}')
        logger.info(f'Loading pretrained config from {args.from_pretrained_config}')
        stu_architecture_config = DistilBertConfig.from_json_file(args.from_pretrained_config)
        stu_architecture_config.output_hidden_states = True
        student = DistilBertForMaskedLM.from_pretrained(args.from_pretrained_weights,
-                                                     config=stu_architecture_config)
+                                                        config=stu_architecture_config)
    else:
        args.vocab_size_or_config_json_file = args.vocab_size
-        stu_architecture_config = DistilBertConfig(**vars(args))
+        stu_architecture_config = DistilBertConfig(**vars(args), output_hidden_states=True)
        student = DistilBertForMaskedLM(stu_architecture_config)
@@ -216,10 +224,13 @@ def main():
    ## TEACHER ##
-    teacher = BertForMaskedLM.from_pretrained(args.bert_model)
+    if args.teacher_type == 'bert':
        teacher = BertForMaskedLM.from_pretrained(args.teacher_name, output_hidden_states=True)
    elif args.teacher_type == 'roberta':
        teacher = RobertaForMaskedLM.from_pretrained(args.teacher_name, output_hidden_states=True)
    if args.n_gpu > 0:
        teacher.to(f'cuda:{args.local_rank}')
-    logger.info(f'Teacher loaded from {args.bert_model}.')
+    logger.info(f'Teacher loaded from {args.teacher_name}.')
    ## DISTILLER ##
    torch.cuda.empty_cache()
--- a/examples/run_bertology.py
+++ b/examples/run_bertology.py
@@ -32,7 +32,7 @@ from torch.utils.data import DataLoader, SequentialSampler, TensorDataset, Subse
 from torch.utils.data.distributed import DistributedSampler
 from torch.nn import CrossEntropyLoss, MSELoss
-from pytorch_transformers import (WEIGHTS_NAME,
+from transformers import (WEIGHTS_NAME,
                                  BertConfig, BertForSequenceClassification, BertTokenizer,
                                  XLMConfig, XLMForSequenceClassification, XLMTokenizer,
                                  XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer)
--- a/examples/run_generation.py
+++ b/examples/run_generation.py
@@ -26,12 +26,12 @@ import torch
 import torch.nn.functional as F
 import numpy as np
-from pytorch_transformers import GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig
+from transformers import GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig
-from pytorch_transformers import GPT2LMHeadModel, GPT2Tokenizer
+from transformers import GPT2LMHeadModel, GPT2Tokenizer
-from pytorch_transformers import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer
+from transformers import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer
-from pytorch_transformers import XLNetLMHeadModel, XLNetTokenizer
+from transformers import XLNetLMHeadModel, XLNetTokenizer
-from pytorch_transformers import TransfoXLLMHeadModel, TransfoXLTokenizer
+from transformers import TransfoXLLMHeadModel, TransfoXLTokenizer
 logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -31,7 +31,7 @@ from torch.utils.data.distributed import DistributedSampler
 from tensorboardX import SummaryWriter
 from tqdm import tqdm, trange
-from pytorch_transformers import (WEIGHTS_NAME, BertConfig,
+from transformers import (WEIGHTS_NAME, BertConfig,
                                  BertForSequenceClassification, BertTokenizer,
                                  RobertaConfig,
                                  RobertaForSequenceClassification,
@@ -39,12 +39,17 @@ from pytorch_transformers import (WEIGHTS_NAME, BertConfig,
                                  XLMConfig, XLMForSequenceClassification,
                                  XLMTokenizer, XLNetConfig,
                                  XLNetForSequenceClassification,
-                                  XLNetTokenizer)
+                                  XLNetTokenizer,
                                  DistilBertConfig,
                                  DistilBertForSequenceClassification,
                                  DistilBertTokenizer)
-from pytorch_transformers import AdamW, WarmupLinearSchedule
+from transformers import AdamW, WarmupLinearSchedule
-from utils_glue import (compute_metrics, convert_examples_to_features,
+from transformers import glue_compute_metrics as compute_metrics
-                        output_modes, processors)
+from transformers import glue_output_modes as output_modes
 from transformers import glue_processors as processors
 from transformers import glue_convert_examples_to_features as convert_examples_to_features
 logger = logging.getLogger(__name__)
@@ -55,6 +60,7 @@ MODEL_CLASSES = {
    'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
    'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
    'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
    'distilbert': (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer)
 }
@@ -128,10 +134,11 @@ def train(args, train_dataset, model, tokenizer):
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1],
                      'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM and RoBERTa don't use segment_ids
                      'labels':         batch[3]}
            if args.model_type != 'distilbert':
                inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None  # XLM, DistilBERT and RoBERTa don't use segment_ids
            outputs = model(**inputs)
-            loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)
+            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
            if args.n_gpu > 1:
                loss = loss.mean() # mean() to average on multi-gpu parallel training
@@ -148,8 +155,8 @@ def train(args, train_dataset, model, tokenizer):
            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                scheduler.step()  # Update learning rate schedule
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1
@@ -218,8 +225,9 @@ def evaluate(args, model, tokenizer, prefix=""):
            with torch.no_grad():
                inputs = {'input_ids':      batch[0],
                          'attention_mask': batch[1],
                          'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM and RoBERTa don't use segment_ids
                          'labels':         batch[3]}
                if args.model_type != 'distilbert':
                    inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None  # XLM, DistilBERT and RoBERTa don't use segment_ids
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]
@@ -272,15 +280,14 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
            # HACK(label indices are swapped in RoBERTa pretrained model)
            label_list[1], label_list[2] = label_list[2], label_list[1] 
        examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
-        features = convert_examples_to_features(examples, label_list, args.max_seq_length, tokenizer, output_mode,
+        features = convert_examples_to_features(examples,
-            cls_token_at_end=bool(args.model_type in ['xlnet']),            # xlnet has a cls token at the end
+                                                tokenizer,
-            cls_token=tokenizer.cls_token,
+                                                label_list=label_list,
-            cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0,
+                                                max_length=args.max_seq_length,
-            sep_token=tokenizer.sep_token,
+                                                output_mode=output_mode,
-            sep_token_extra=bool(args.model_type in ['roberta']),           # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
+                                                pad_on_left=bool(args.model_type in ['xlnet']),                 # pad on the left for xlnet
-            pad_on_left=bool(args.model_type in ['xlnet']),                 # pad on the left for xlnet
+                                                pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
-            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
+                                                pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
@@ -291,14 +298,14 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
-    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
+    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
-    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
+    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
-        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
+        all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    elif output_mode == "regression":
-        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float)
+        all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
-    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
+    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
    return dataset
@@ -478,7 +485,7 @@ def main():
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
-            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
+            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
--- a/examples/run_lm_finetuning.py
+++ b/examples/run_lm_finetuning.py
@@ -35,11 +35,12 @@ from torch.utils.data.distributed import DistributedSampler
 from tensorboardX import SummaryWriter
 from tqdm import tqdm, trange
-from pytorch_transformers import (WEIGHTS_NAME, AdamW, WarmupLinearSchedule,
+from transformers import (WEIGHTS_NAME, AdamW, WarmupLinearSchedule,
                                  BertConfig, BertForMaskedLM, BertTokenizer,
                                  GPT2Config, GPT2LMHeadModel, GPT2Tokenizer,
                                  OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer,
-                                  RobertaConfig, RobertaForMaskedLM, RobertaTokenizer)
+                                  RobertaConfig, RobertaForMaskedLM, RobertaTokenizer,
                                  DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer)
 logger = logging.getLogger(__name__)
@@ -49,7 +50,8 @@ MODEL_CLASSES = {
    'gpt2': (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer),
    'openai-gpt': (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
    'bert': (BertConfig, BertForMaskedLM, BertTokenizer),
-    'roberta': (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer)
+    'roberta': (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer),
    'distilbert': (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer)
 }
@@ -57,7 +59,7 @@ class TextDataset(Dataset):
    def __init__(self, tokenizer, file_path='train', block_size=512):
        assert os.path.isfile(file_path)
        directory, filename = os.path.split(file_path)
-        cached_features_file = os.path.join(directory, f'cached_lm_{block_size}_{filename}')
+        cached_features_file = os.path.join(directory, 'cached_lm_{}_{}'.format(block_size, filename))
        if os.path.exists(cached_features_file):
            logger.info("Loading features from cached file %s", cached_features_file)
@@ -72,9 +74,8 @@ class TextDataset(Dataset):
            tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
-            while len(tokenized_text) >= block_size:  # Truncate in block of block_size
+            for i in range(0, len(tokenized_text)-block_size+1, block_size): # Truncate in block of block_size
-                self.examples.append(tokenizer.add_special_tokens_single_sentence(tokenized_text[:block_size]))
+                self.examples.append(tokenizer.add_special_tokens_single_sequence(tokenized_text[i:i+block_size]))
                tokenized_text = tokenized_text[block_size:]
            # Note that we are losing the last truncated example here for the sake of simplicity (no padding).
            # If your dataset is small, first you should look for a bigger one :-) and second you
            # can change this behavior by adding (model-specific) padding.
@@ -186,7 +187,7 @@ def train(args, train_dataset, model, tokenizer):
            labels = labels.to(args.device)
            model.train()
            outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels)
-            loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)
+            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
@@ -380,7 +381,7 @@ def main():
    parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
    args = parser.parse_args()
-    if args.model_type in ["bert", "roberta"] and not args.mlm:
+    if args.model_type in ["bert", "roberta", "distilbert"] and not args.mlm:
        raise ValueError("BERT and RoBERTa do not have LM heads but masked LM heads. They must be run using the --mlm "
                         "flag (masked language modeling).")
    if args.eval_data_file is None and args.do_eval:
@@ -479,7 +480,7 @@ def main():
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
-            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
+            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
--- a/examples/run_multiple_choice.py
+++ b/examples/run_multiple_choice.py
@@ -32,13 +32,13 @@ from torch.utils.data.distributed import DistributedSampler
 from tensorboardX import SummaryWriter
 from tqdm import tqdm, trange
-from pytorch_transformers import (WEIGHTS_NAME, BertConfig,
+from transformers import (WEIGHTS_NAME, BertConfig,
                                  BertForMultipleChoice, BertTokenizer,
                                  XLNetConfig, XLNetForMultipleChoice,
                                  XLNetTokenizer, RobertaConfig,
                                  RobertaForMultipleChoice, RobertaTokenizer)
-from pytorch_transformers import AdamW, WarmupLinearSchedule
+from transformers import AdamW, WarmupLinearSchedule
 from utils_multiple_choice import (convert_examples_to_features, processors)
@@ -141,7 +141,7 @@ def train(args, train_dataset, model, tokenizer):
                      'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM don't use segment_ids
                      'labels':         batch[3]}
            outputs = model(**inputs)
-            loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)
+            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
            if args.n_gpu > 1:
                loss = loss.mean() # mean() to average on multi-gpu parallel training
@@ -508,7 +508,7 @@ def main():
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
-            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
+            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
@@ -524,7 +524,7 @@ def main():
        checkpoints = [args.output_dir]
        # if args.eval_all_checkpoints: # can not use this to do test!!
        #     checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
-        #     logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
+        #     logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -32,14 +32,15 @@ from tqdm import tqdm, trange
 from tensorboardX import SummaryWriter
-from pytorch_transformers import (WEIGHTS_NAME, BertConfig,
+from transformers import (WEIGHTS_NAME, BertConfig,
                                  BertForQuestionAnswering, BertTokenizer,
                                  XLMConfig, XLMForQuestionAnswering,
                                  XLMTokenizer, XLNetConfig,
                                  XLNetForQuestionAnswering,
-                                  XLNetTokenizer)
+                                  XLNetTokenizer,
                                  DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)
-from pytorch_transformers import AdamW, WarmupLinearSchedule
+from transformers import AdamW, WarmupLinearSchedule
 from utils_squad import (read_squad_examples, convert_examples_to_features,
                         RawResult, write_predictions,
@@ -59,6 +60,7 @@ MODEL_CLASSES = {
    'bert': (BertConfig, BertForQuestionAnswering, BertTokenizer),
    'xlnet': (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer),
    'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer),
    'distilbert': (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)
 }
 def set_seed(args):
@@ -140,7 +142,7 @@ def train(args, train_dataset, model, tokenizer):
                inputs.update({'cls_index': batch[5],
                               'p_mask':       batch[6]})
            outputs = model(**inputs)
-            loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)
+            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
            if args.n_gpu > 1:
                loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training
@@ -508,7 +510,7 @@ def main():
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
-            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce model loading logs
+            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce model loading logs
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
--- a/examples/run_tf_glue.py
+++ b/examples/run_tf_glue.py
@@ -0,0 +1,40 @@
 # Example script: fine-tune BERT on the GLUE MRPC task with tf.keras, then reload the
 # saved TensorFlow weights into the PyTorch model class and try it on two sentence pairs.
 # Every statement runs at module level, in order; there is no main() guard.
 import tensorflow as tf
 import tensorflow_datasets
 from transformers import BertTokenizer, TFBertForSequenceClassification, glue_convert_examples_to_features, BertForSequenceClassification
 # Load the tokenizer and the TensorFlow model from the 'bert-base-cased' pretrained
 # vocabulary/weights, and the 'glue/mrpc' data through tensorflow_datasets
 tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
 model = TFBertForSequenceClassification.from_pretrained('bert-base-cased')
 data = tensorflow_datasets.load('glue/mrpc')
 # Prepare dataset for GLUE as a tf.data.Dataset instance: convert the 'train' and
 # 'validation' splits into features for the 'mrpc' task
 # NOTE(review): the positional 128 is presumably the maximum sequence length - confirm
 # against the signature of glue_convert_examples_to_features
 train_dataset = glue_convert_examples_to_features(data['train'], tokenizer, 128, 'mrpc')
 valid_dataset = glue_convert_examples_to_features(data['validation'], tokenizer, 128, 'mrpc')
 # Training data: shuffle with a buffer of 100 examples, batch by 32, repeat the data twice
 train_dataset = train_dataset.shuffle(100).batch(32).repeat(2)
 # Validation data: only batched (64 examples per batch), not shuffled
 valid_dataset = valid_dataset.batch(64)
 # Prepare training: compile the tf.keras model with optimizer, loss and metric.
 # Adam is given a fixed learning rate of 3e-5 (no learning rate schedule is configured
 # here) and clipnorm=1.0
 optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
 # from_logits=True: the loss is applied to raw (unnormalized) model outputs
 loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
 metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
 model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
 # Train and evaluate using tf.keras.Model.fit(): 2 epochs of 115 training steps each,
 # with 7 validation steps
 # NOTE(review): 115 and 7 look like the number of 32- and 64-example batches in the MRPC
 # train/validation splits - confirm against the dataset sizes
 history = model.fit(train_dataset, epochs=2, steps_per_epoch=115,
                    validation_data=valid_dataset, validation_steps=7)
 # Save the fine-tuned TensorFlow model to ./save/ and load it in PyTorch for inspection
 # (from_tf=True tells from_pretrained that the saved weights are TensorFlow weights)
 model.save_pretrained('./save/')
 pytorch_model = BertForSequenceClassification.from_pretrained('./save/', from_tf=True)
 # Quickly test a few predictions - MRPC is a paraphrasing task, let's see if our model learned the task
 sentence_0 = "This research was consistent with his findings."
 sentence_1 = "His findings were compatible with this research."
 sentence_2 = "His findings were not compatible with this research."
 # Encode each (sentence_0, candidate) pair with special tokens, returning PyTorch tensors ('pt')
 inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors='pt')
 inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors='pt')
 # The prediction is the index of the largest value in the first element of the model output
 pred_1 = pytorch_model(**inputs_1)[0].argmax().item()
 pred_2 = pytorch_model(**inputs_2)[0].argmax().item()
 # Any non-zero predicted index is reported as "a paraphrase", index 0 as "not a paraphrase"
 print("sentence_1 is", "a paraphrase" if pred_1 else "not a paraphrase", "of sentence_0")
 print("sentence_2 is", "a paraphrase" if pred_2 else "not a paraphrase", "of sentence_0")
--- a/examples/utils_squad.py
+++ b/examples/utils_squad.py
@@ -24,7 +24,7 @@ import math
 import collections
 from io import open
-from pytorch_transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize
+from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize
 # Required by XLNet evaluation method to compute optimal threshold (see write_predictions_extended() method)
 from utils_squad_evaluate import find_all_best_thresh_v2, make_qid_to_has_ans, get_raw_scores
--- a/hubconf.py
+++ b/hubconf.py
@@ -1,7 +1,7 @@
-from pytorch_transformers import (
+from transformers import (
    AutoTokenizer, AutoConfig, AutoModel, AutoModelWithLMHead, AutoModelForSequenceClassification, AutoModelForQuestionAnswering
 )
-from pytorch_transformers.file_utils import add_start_docstrings
+from transformers.file_utils import add_start_docstrings
 dependencies = ['torch', 'tqdm', 'boto3', 'requests', 'regex', 'sentencepiece', 'sacremoses']
@@ -11,12 +11,12 @@ def config(*args, **kwargs):
                # Using torch.hub !
                import torch
-                config = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased')  # Download configuration from S3 and cache.
+                config = torch.hub.load('huggingface/transformers', 'config', 'bert-base-uncased')  # Download configuration from S3 and cache.
-                config = torch.hub.load('huggingface/pytorch-transformers', 'config', './test/bert_saved_model/')  # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
+                config = torch.hub.load('huggingface/transformers', 'config', './test/bert_saved_model/')  # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
-                config = torch.hub.load('huggingface/pytorch-transformers', 'config', './test/bert_saved_model/my_configuration.json')
+                config = torch.hub.load('huggingface/transformers', 'config', './test/bert_saved_model/my_configuration.json')
-                config = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased', output_attention=True, foo=False)
+                config = torch.hub.load('huggingface/transformers', 'config', 'bert-base-uncased', output_attention=True, foo=False)
                assert config.output_attention == True
-                config, unused_kwargs = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased', output_attention=True, foo=False, return_unused_kwargs=True)
+                config, unused_kwargs = torch.hub.load('huggingface/transformers', 'config', 'bert-base-uncased', output_attention=True, foo=False, return_unused_kwargs=True)
                assert config.output_attention == True
                assert unused_kwargs == {'foo': False}
@@ -31,8 +31,8 @@ def tokenizer(*args, **kwargs):
        # Using torch.hub !
        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'bert-base-uncased')    # Download vocabulary from S3 and cache.
+        tokenizer = torch.hub.load('huggingface/transformers', 'tokenizer', 'bert-base-uncased')    # Download vocabulary from S3 and cache.
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', './test/bert_saved_model/')  # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`
+        tokenizer = torch.hub.load('huggingface/transformers', 'tokenizer', './test/bert_saved_model/')  # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`
    """
@@ -45,13 +45,13 @@ def model(*args, **kwargs):
            # Using torch.hub !
            import torch
-            model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
+            model = torch.hub.load('huggingface/transformers', 'model', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
-            model = torch.hub.load('huggingface/pytorch-transformers', 'model', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+            model = torch.hub.load('huggingface/transformers', 'model', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-            model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-uncased', output_attention=True)  # Update configuration during loading
+            model = torch.hub.load('huggingface/transformers', 'model', 'bert-base-uncased', output_attention=True)  # Update configuration during loading
            assert model.config.output_attention == True
            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
-            model = torch.hub.load('huggingface/pytorch-transformers', 'model', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+            model = torch.hub.load('huggingface/transformers', 'model', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
        """
@@ -63,13 +63,13 @@ def modelWithLMHead(*args, **kwargs):
        # Using torch.hub !
        import torch
-        model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
+        model = torch.hub.load('huggingface/transformers', 'modelWithLMHead', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
-        model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+        model = torch.hub.load('huggingface/transformers', 'modelWithLMHead', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-        model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', 'bert-base-uncased', output_attention=True)  # Update configuration during loading
+        model = torch.hub.load('huggingface/transformers', 'modelWithLMHead', 'bert-base-uncased', output_attention=True)  # Update configuration during loading
        assert model.config.output_attention == True
        # Loading from a TF checkpoint file instead of a PyTorch model (slower)
        config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
-        model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+        model = torch.hub.load('huggingface/transformers', 'modelWithLMHead', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
    """
    return AutoModelWithLMHead.from_pretrained(*args, **kwargs)
@@ -81,13 +81,13 @@ def modelForSequenceClassification(*args, **kwargs):
            # Using torch.hub !
            import torch
-            model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
+            model = torch.hub.load('huggingface/transformers', 'modelForSequenceClassification', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
-            model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+            model = torch.hub.load('huggingface/transformers', 'modelForSequenceClassification', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-            model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', 'bert-base-uncased', output_attention=True)  # Update configuration during loading
+            model = torch.hub.load('huggingface/transformers', 'modelForSequenceClassification', 'bert-base-uncased', output_attention=True)  # Update configuration during loading
            assert model.config.output_attention == True
            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
-            model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+            model = torch.hub.load('huggingface/transformers', 'modelForSequenceClassification', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
        """
@@ -100,13 +100,13 @@ def modelForQuestionAnswering(*args, **kwargs):
        # Using torch.hub !
        import torch
-        model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
+        model = torch.hub.load('huggingface/transformers', 'modelForQuestionAnswering', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
-        model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+        model = torch.hub.load('huggingface/transformers', 'modelForQuestionAnswering', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-        model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', 'bert-base-uncased', output_attention=True)  # Update configuration during loading
+        model = torch.hub.load('huggingface/transformers', 'modelForQuestionAnswering', 'bert-base-uncased', output_attention=True)  # Update configuration during loading
        assert model.config.output_attention == True
        # Loading from a TF checkpoint file instead of a PyTorch model (slower)
        config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
-        model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+        model = torch.hub.load('huggingface/transformers', 'modelForQuestionAnswering', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
    """
    return AutoModelForQuestionAnswering.from_pretrained(*args, **kwargs)
--- a/pytorch_transformers/init.py
+++ b/pytorch_transformers/init.py
@@ -1,75 +0,0 @@
 __version__ = "1.2.0"
 # Work around to update TensorFlow's absl.logging threshold which alters the
 # default Python logging output behavior when present.
 # see: https://github.com/abseil/abseil-py/issues/99
 # and: https://github.com/tensorflow/tensorflow/issues/26691#issuecomment-500369493
 try:
    import absl.logging
    absl.logging.set_verbosity('info')
    absl.logging.set_stderrthreshold('info')
    absl.logging._warn_preinit_stderr = False
 except:
    pass
 # Tokenizer
 from .tokenization_utils import (PreTrainedTokenizer)
 from .tokenization_auto import AutoTokenizer
 from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer
 from .tokenization_openai import OpenAIGPTTokenizer
 from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus)
 from .tokenization_gpt2 import GPT2Tokenizer
 from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE
 from .tokenization_xlm import XLMTokenizer
 from .tokenization_roberta import RobertaTokenizer
 from .tokenization_distilbert import DistilBertTokenizer
 # Configurations
 from .configuration_utils import PretrainedConfig
 from .configuration_auto import AutoConfig
 from .configuration_bert import BertConfig, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_openai import OpenAIGPTConfig, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_transfo_xl import TransfoXLConfig, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_gpt2 import GPT2Config, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_xlnet import XLNetConfig, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_xlm import XLMConfig, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_roberta import RobertaConfig, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
 # Modeling
 from .modeling_utils import (PreTrainedModel, prune_layer, Conv1D)
 from .modeling_auto import (AutoModel, AutoModelForSequenceClassification, AutoModelForQuestionAnswering,
                            AutoModelWithLMHead)
 from .modeling_bert import (BertPreTrainedModel, BertModel, BertForPreTraining,
                            BertForMaskedLM, BertForNextSentencePrediction,
                            BertForSequenceClassification, BertForMultipleChoice,
                            BertForTokenClassification, BertForQuestionAnswering,
                            load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP)
 from .modeling_openai import (OpenAIGPTPreTrainedModel, OpenAIGPTModel,
                              OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel,
                              load_tf_weights_in_openai_gpt, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP)
 from .modeling_transfo_xl import (TransfoXLPreTrainedModel, TransfoXLModel, TransfoXLLMHeadModel,
                                  load_tf_weights_in_transfo_xl, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP)
 from .modeling_gpt2 import (GPT2PreTrainedModel, GPT2Model,
                            GPT2LMHeadModel, GPT2DoubleHeadsModel,
                            load_tf_weights_in_gpt2, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
 from .modeling_xlnet import (XLNetPreTrainedModel, XLNetModel, XLNetLMHeadModel,
                             XLNetForSequenceClassification, XLNetForQuestionAnswering, XLNetForMultipleChoice,
                             load_tf_weights_in_xlnet, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
 from .modeling_xlm import (XLMPreTrainedModel , XLMModel,
                           XLMWithLMHeadModel, XLMForSequenceClassification,
                           XLMForQuestionAnswering, XLM_PRETRAINED_MODEL_ARCHIVE_MAP)
 from .modeling_roberta import (RobertaForMaskedLM, RobertaModel, RobertaForSequenceClassification,
                               RobertaForMultipleChoice, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
 from .modeling_distilbert import (DistilBertForMaskedLM, DistilBertModel,
                               DistilBertForSequenceClassification, DistilBertForQuestionAnswering,
                               DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
 # Optimization
 from .optimization import (AdamW, ConstantLRSchedule, WarmupConstantSchedule, WarmupCosineSchedule,
                           WarmupCosineWithHardRestartsSchedule, WarmupLinearSchedule)
 # Files and general utilities
 from .file_utils import (PYTORCH_TRANSFORMERS_CACHE, PYTORCH_PRETRAINED_BERT_CACHE,
                         cached_path, add_start_docstrings, add_end_docstrings,
                         WEIGHTS_NAME, TF_WEIGHTS_NAME, CONFIG_NAME)
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,3 @@
 # PyTorch
 torch>=1.0.0
 # progress bars in model download and training scripts
 tqdm
 # Accessing files from S3 directly.
--- a/setup.py
+++ b/setup.py
@@ -13,11 +13,11 @@ To create the package for pypi.
 4. Build both the sources and the wheel. Do not change anything in setup.py between
   creating the wheel and the source distribution (obviously).
-   For the wheel, run: "python setup.py bdist_wheel" in the top level allennlp directory.
+   For the wheel, run: "python setup.py bdist_wheel" in the top level directory.
   (this will build a wheel for the python version you use to build it - make sure you use python 3.x).
   For the sources, run: "python setup.py sdist"
-   You should now have a /dist directory with both .whl and .tar.gz source versions of allennlp.
+   You should now have a /dist directory with both .whl and .tar.gz source versions.
 5. Check that everything looks correct by uploading the package to the pypi test server:
@@ -25,7 +25,7 @@ To create the package for pypi.
   (PyPI suggests using twine, as other methods upload files via plaintext.)
   Check that you can install it in a virtualenv by running:
-   pip install -i https://testpypi.python.org/pypi pytorch-transformers
+   pip install -i https://testpypi.python.org/pypi transformers
 6. Upload the final version to actual pypi:
   twine upload dist/* -r pypi
@@ -37,20 +37,19 @@ from io import open
 from setuptools import find_packages, setup
 setup(
-    name="pytorch_transformers",
+    name="transformers",
-    version="1.2.0",
+    version="2.0.0",
-    author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Google AI Language Team Authors, Open AI team Authors",
+    author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Google AI Language Team Authors, Open AI team Authors, Facebook AI Authors, Carnegie Mellon University Authors",
    author_email="thomas@huggingface.co",
-    description="Repository of pre-trained NLP Transformer models: BERT & RoBERTa, GPT & GPT-2, Transformer-XL, XLNet and XLM",
+    description="State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch",
    long_description=open("README.md", "r", encoding='utf-8').read(),
    long_description_content_type="text/markdown",
-    keywords='NLP deep learning transformer pytorch BERT GPT GPT-2 google openai CMU',
+    keywords='NLP deep learning transformer pytorch tensorflow BERT GPT GPT-2 google openai CMU',
    license='Apache',
-    url="https://github.com/huggingface/pytorch-transformers",
+    url="https://github.com/huggingface/transformers",
    packages=find_packages(exclude=["*.tests", "*.tests.*",
                                    "tests.*", "tests"]),
-    install_requires=['torch>=1.0.0',
+    install_requires=['numpy',
                      'numpy',
                      'boto3',
                      'requests',
                      'tqdm',
@@ -59,7 +58,7 @@ setup(
                      'sacremoses'],
    entry_points={
      'console_scripts': [
-        "pytorch_transformers=pytorch_transformers.__main__:main",
+        "transformers=transformers.__main__:main",
      ]
    },
    # python_requires='>=3.5.0',
--- a/transformers/init.py
+++ b/transformers/init.py
@@ -0,0 +1,165 @@
 __version__ = "2.0.0"
 # Best-effort workaround: when absl is installed (it ships with TensorFlow) its
 # logging setup alters the default Python logging output, so reset its thresholds.
 # see: https://github.com/abseil/abseil-py/issues/99
 # and: https://github.com/tensorflow/tensorflow/issues/26691#issuecomment-500369493
 try:
    import absl.logging
    absl.logging.set_verbosity('info')
    absl.logging.set_stderrthreshold('info')
    absl.logging._warn_preinit_stderr = False
 except Exception:  # absl missing or API changed: skip, but let KeyboardInterrupt/SystemExit propagate
    pass
 import logging
 logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
 # Files and general utilities
 from .file_utils import (TRANSFORMERS_CACHE, PYTORCH_TRANSFORMERS_CACHE, PYTORCH_PRETRAINED_BERT_CACHE,
                         cached_path, add_start_docstrings, add_end_docstrings,
                         WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, CONFIG_NAME,
                         is_tf_available, is_torch_available)
 from .data import (is_sklearn_available,
                   InputExample, InputFeatures, DataProcessor,
                   glue_output_modes, glue_convert_examples_to_features,
                   glue_processors, glue_tasks_num_labels)
 if is_sklearn_available():
    from .data import glue_compute_metrics
 # Tokenizers
 from .tokenization_utils import (PreTrainedTokenizer)
 from .tokenization_auto import AutoTokenizer
 from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer
 from .tokenization_openai import OpenAIGPTTokenizer
 from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus)
 from .tokenization_gpt2 import GPT2Tokenizer
 from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE
 from .tokenization_xlm import XLMTokenizer
 from .tokenization_roberta import RobertaTokenizer
 from .tokenization_distilbert import DistilBertTokenizer
 # Configurations
 from .configuration_utils import PretrainedConfig
 from .configuration_auto import AutoConfig
 from .configuration_bert import BertConfig, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_openai import OpenAIGPTConfig, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_transfo_xl import TransfoXLConfig, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_gpt2 import GPT2Config, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_xlnet import XLNetConfig, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_xlm import XLMConfig, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_roberta import RobertaConfig, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
 # Modeling
 if is_torch_available():
    from .modeling_utils import (PreTrainedModel, prune_layer, Conv1D)
    from .modeling_auto import (AutoModel, AutoModelForSequenceClassification, AutoModelForQuestionAnswering,
                                AutoModelWithLMHead)
    from .modeling_bert import (BertPreTrainedModel, BertModel, BertForPreTraining,
                                BertForMaskedLM, BertForNextSentencePrediction,
                                BertForSequenceClassification, BertForMultipleChoice,
                                BertForTokenClassification, BertForQuestionAnswering,
                                load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP)
    from .modeling_openai import (OpenAIGPTPreTrainedModel, OpenAIGPTModel,
                                OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel,
                                load_tf_weights_in_openai_gpt, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP)
    from .modeling_transfo_xl import (TransfoXLPreTrainedModel, TransfoXLModel, TransfoXLLMHeadModel,
                                    load_tf_weights_in_transfo_xl, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP)
    from .modeling_gpt2 import (GPT2PreTrainedModel, GPT2Model,
                                GPT2LMHeadModel, GPT2DoubleHeadsModel,
                                load_tf_weights_in_gpt2, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
    from .modeling_xlnet import (XLNetPreTrainedModel, XLNetModel, XLNetLMHeadModel,
                                XLNetForSequenceClassification, XLNetForMultipleChoice,
                                XLNetForQuestionAnsweringSimple, XLNetForQuestionAnswering,
                                load_tf_weights_in_xlnet, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
    from .modeling_xlm import (XLMPreTrainedModel , XLMModel,
                            XLMWithLMHeadModel, XLMForSequenceClassification,
                            XLMForQuestionAnswering, XLMForQuestionAnsweringSimple,
                            XLM_PRETRAINED_MODEL_ARCHIVE_MAP)
    from .modeling_roberta import (RobertaForMaskedLM, RobertaModel,
                                RobertaForSequenceClassification, RobertaForMultipleChoice,
                                ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
    from .modeling_distilbert import (DistilBertForMaskedLM, DistilBertModel,
                                DistilBertForSequenceClassification, DistilBertForQuestionAnswering,
                                DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
    # Optimization
    from .optimization import (AdamW, ConstantLRSchedule, WarmupConstantSchedule, WarmupCosineSchedule,
                               WarmupCosineWithHardRestartsSchedule, WarmupLinearSchedule)
 # TensorFlow
 if is_tf_available():
    from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary
    from .modeling_tf_auto import (TFAutoModel, TFAutoModelForSequenceClassification, TFAutoModelForQuestionAnswering,
                                   TFAutoModelWithLMHead)
    from .modeling_tf_bert import (TFBertPreTrainedModel, TFBertMainLayer, TFBertEmbeddings,
                                   TFBertModel, TFBertForPreTraining,
                                   TFBertForMaskedLM, TFBertForNextSentencePrediction,
                                   TFBertForSequenceClassification, TFBertForMultipleChoice,
                                   TFBertForTokenClassification, TFBertForQuestionAnswering,
                                   load_bert_pt_weights_in_tf2,
                                   TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP)
    from .modeling_tf_gpt2 import (TFGPT2PreTrainedModel, TFGPT2MainLayer,
                                   TFGPT2Model, TFGPT2LMHeadModel, TFGPT2DoubleHeadsModel,
                                   load_gpt2_pt_weights_in_tf2,
                                   TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
    from .modeling_tf_openai import (TFOpenAIGPTPreTrainedModel, TFOpenAIGPTMainLayer,
                                     TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel, TFOpenAIGPTDoubleHeadsModel,
                                     load_openai_gpt_pt_weights_in_tf2,
                                     TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP)
    from .modeling_tf_transfo_xl import (TFTransfoXLPreTrainedModel, TFTransfoXLMainLayer,
                                         TFTransfoXLModel, TFTransfoXLLMHeadModel,
                                         load_transfo_xl_pt_weights_in_tf2,
                                         TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP)
    from .modeling_tf_xlnet import (TFXLNetPreTrainedModel, TFXLNetMainLayer,
                                    TFXLNetModel, TFXLNetLMHeadModel,
                                    TFXLNetForSequenceClassification,
                                    TFXLNetForQuestionAnsweringSimple,
                                    load_xlnet_pt_weights_in_tf2,
                                    TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
    from .modeling_tf_xlm import (TFXLMPreTrainedModel, TFXLMMainLayer,
                                  TFXLMModel, TFXLMWithLMHeadModel,
                                  TFXLMForSequenceClassification,
                                  TFXLMForQuestionAnsweringSimple,
                                  load_xlm_pt_weights_in_tf2,
                                  TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP)
    from .modeling_tf_roberta import (TFRobertaPreTrainedModel, TFRobertaMainLayer,
                                      TFRobertaModel, TFRobertaForMaskedLM,
                                      TFRobertaForSequenceClassification,
                                      load_roberta_pt_weights_in_tf2,
                                      TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
    from .modeling_tf_distilbert import (TFDistilBertPreTrainedModel, TFDistilBertMainLayer,
                                         TFDistilBertModel, TFDistilBertForMaskedLM,
                                         TFDistilBertForSequenceClassification,
                                         TFDistilBertForQuestionAnswering,
                                         load_distilbert_pt_weights_in_tf2,
                                         TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
 # TF 2.0 <=> PyTorch conversion utilities
 if is_tf_available() and is_torch_available():
    from .modeling_tf_pytorch_utils import (convert_tf_weight_name_to_pt_weight_name,
                                            load_pytorch_checkpoint_in_tf2_model,
                                            load_pytorch_weights_in_tf2_model,
                                            load_pytorch_model_in_tf2_model,
                                            load_tf2_checkpoint_in_pytorch_model,
                                            load_tf2_weights_in_pytorch_model,
                                            load_tf2_model_in_pytorch_model)
 if not is_tf_available() and not is_torch_available():  # neither backend importable
    logger.warning("Neither PyTorch nor TensorFlow >= 2.0 have been found. "
                   "Models won't be available and only tokenizers, configuration "
                   "and file/data utilities can be used.")
--- a/pytorch_transformers/__main__.py
+++ b/pytorch_transformers/__main__.py
@@ -3,36 +3,37 @@ def main():
    import sys
    if (len(sys.argv) < 4 or len(sys.argv) > 6) or sys.argv[1] not in ["bert", "gpt", "transfo_xl", "gpt2", "xlnet", "xlm"]:
        print(
-        "Should be used as one of: \n"
+        "This command line utility let you convert original (author released) model checkpoint to pytorch.\n"
-        ">> pytorch_transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT, \n"
+        "It should be used as one of: \n"
-        ">> pytorch_transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG], \n"
+        ">> transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT, \n"
-        ">> pytorch_transformers transfo_xl TF_CHECKPOINT_OR_DATASET PYTORCH_DUMP_OUTPUT [TF_CONFIG] or \n"
+        ">> transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG], \n"
-        ">> pytorch_transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [GPT2_CONFIG] or \n"
+        ">> transformers transfo_xl TF_CHECKPOINT_OR_DATASET PYTORCH_DUMP_OUTPUT [TF_CONFIG] or \n"
-        ">> pytorch_transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME] or \n"
+        ">> transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [GPT2_CONFIG] or \n"
-        ">> pytorch_transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT")
+        ">> transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME] or \n"
        ">> transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT")
    else:
        if sys.argv[1] == "bert":
            try:
-                from .convert_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch
+                from .convert_bert_original_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch
            except ImportError:
-                print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
+                print("transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
                    "In that case, it requires TensorFlow to be installed. Please see "
                    "https://www.tensorflow.org/install/ for installation instructions.")
                raise
            if len(sys.argv) != 5:
                # pylint: disable=line-too-long
-                print("Should be used as `pytorch_transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`")
+                print("Should be used as `transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`")
            else:
                PYTORCH_DUMP_OUTPUT = sys.argv.pop()
                TF_CONFIG = sys.argv.pop()
                TF_CHECKPOINT = sys.argv.pop()
                convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT)
        elif sys.argv[1] == "gpt":
-            from .convert_openai_checkpoint_to_pytorch import convert_openai_checkpoint_to_pytorch
+            from .convert_openai_original_tf_checkpoint_to_pytorch import convert_openai_checkpoint_to_pytorch
            if len(sys.argv) < 4 or len(sys.argv) > 5:
                # pylint: disable=line-too-long
-                print("Should be used as `pytorch_transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`")
+                print("Should be used as `transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`")
            else:
                OPENAI_GPT_CHECKPOINT_FOLDER_PATH = sys.argv[2]
                PYTORCH_DUMP_OUTPUT = sys.argv[3]
@@ -45,15 +46,15 @@ def main():
                                                    PYTORCH_DUMP_OUTPUT)
        elif sys.argv[1] == "transfo_xl":
            try:
-                from .convert_transfo_xl_checkpoint_to_pytorch import convert_transfo_xl_checkpoint_to_pytorch
+                from .convert_transfo_xl_original_tf_checkpoint_to_pytorch import convert_transfo_xl_checkpoint_to_pytorch
            except ImportError:
-                print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
+                print("transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
                    "In that case, it requires TensorFlow to be installed. Please see "
                    "https://www.tensorflow.org/install/ for installation instructions.")
                raise
            if len(sys.argv) < 4 or len(sys.argv) > 5:
                # pylint: disable=line-too-long
-                print("Should be used as `pytorch_transformers transfo_xl TF_CHECKPOINT/TF_DATASET_FILE PYTORCH_DUMP_OUTPUT [TF_CONFIG]`")
+                print("Should be used as `transformers transfo_xl TF_CHECKPOINT/TF_DATASET_FILE PYTORCH_DUMP_OUTPUT [TF_CONFIG]`")
            else:
                if 'ckpt' in sys.argv[2].lower():
                    TF_CHECKPOINT = sys.argv[2]
@@ -69,16 +70,16 @@ def main():
                convert_transfo_xl_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT, TF_DATASET_FILE)
        elif sys.argv[1] == "gpt2":
            try:
-                from .convert_gpt2_checkpoint_to_pytorch import convert_gpt2_checkpoint_to_pytorch
+                from .convert_gpt2_original_tf_checkpoint_to_pytorch import convert_gpt2_checkpoint_to_pytorch
            except ImportError:
-                print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
+                print("transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
                    "In that case, it requires TensorFlow to be installed. Please see "
                    "https://www.tensorflow.org/install/ for installation instructions.")
                raise
            if len(sys.argv) < 4 or len(sys.argv) > 5:
                # pylint: disable=line-too-long
-                print("Should be used as `pytorch_transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [TF_CONFIG]`")
+                print("Should be used as `transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [TF_CONFIG]`")
            else:
                TF_CHECKPOINT = sys.argv[2]
                PYTORCH_DUMP_OUTPUT = sys.argv[3]
@@ -89,16 +90,16 @@ def main():
                convert_gpt2_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT)
        elif sys.argv[1] == "xlnet":
            try:
-                from .convert_xlnet_checkpoint_to_pytorch import convert_xlnet_checkpoint_to_pytorch
+                from .convert_xlnet_original_tf_checkpoint_to_pytorch import convert_xlnet_checkpoint_to_pytorch
            except ImportError:
-                print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
+                print("transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
                    "In that case, it requires TensorFlow to be installed. Please see "
                    "https://www.tensorflow.org/install/ for installation instructions.")
                raise
            if len(sys.argv) < 5 or len(sys.argv) > 6:
                # pylint: disable=line-too-long
-                print("Should be used as `pytorch_transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME]`")
+                print("Should be used as `transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME]`")
            else:
                TF_CHECKPOINT = sys.argv[2]
                TF_CONFIG = sys.argv[3]
@@ -113,11 +114,11 @@ def main():
                                                    PYTORCH_DUMP_OUTPUT,
                                                    FINETUNING_TASK)
        elif sys.argv[1] == "xlm":
-            from .convert_xlm_checkpoint_to_pytorch import convert_xlm_checkpoint_to_pytorch
+            from .convert_xlm_original_pytorch_checkpoint_to_pytorch import convert_xlm_checkpoint_to_pytorch
            if len(sys.argv) != 4:
                # pylint: disable=line-too-long
-                print("Should be used as `pytorch_transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT`")
+                print("Should be used as `transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT`")
            else:
                XLM_CHECKPOINT_PATH = sys.argv[2]
                PYTORCH_DUMP_OUTPUT = sys.argv[3]
--- a/pytorch_transformers/configuration_auto.py
+++ b/pytorch_transformers/configuration_auto.py
@@ -31,7 +31,7 @@ logger = logging.getLogger(__name__)
 class AutoConfig(object):
-    r""":class:`~pytorch_transformers.AutoConfig` is a generic configuration class
+    r""":class:`~transformers.AutoConfig` is a generic configuration class
        that will be instantiated as one of the configuration classes of the library
        when created with the `AutoConfig.from_pretrained(pretrained_model_name_or_path)`
        class method.
@@ -76,7 +76,7 @@ class AutoConfig(object):
            pretrained_model_name_or_path: either:
                - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a path to a `directory` containing a configuration file saved using the :func:`~pytorch_transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
+                - a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
                - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
            cache_dir: (`optional`) string:
--- a/pytorch_transformers/configuration_bert.py
+++ b/pytorch_transformers/configuration_bert.py
@@ -45,7 +45,7 @@ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 class BertConfig(PretrainedConfig):
    r"""
-        :class:`~pytorch_transformers.BertConfig` is the configuration class to store the configuration of a
+        :class:`~transformers.BertConfig` is the configuration class to store the configuration of a
        `BertModel`.
@@ -58,7 +58,7 @@ class BertConfig(PretrainedConfig):
            intermediate_size: The size of the "intermediate" (i.e., feed-forward)
                layer in the Transformer encoder.
            hidden_act: The non-linear activation function (function or string) in the
-                encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
+                encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported.
            hidden_dropout_prob: The dropout probabilitiy for all fully connected
                layers in the embeddings, encoder, and pooler.
            attention_probs_dropout_prob: The dropout ratio for the attention
--- a/pytorch_transformers/configuration_distilbert.py
+++ b/pytorch_transformers/configuration_distilbert.py
@@ -37,7 +37,7 @@ class DistilBertConfig(PretrainedConfig):
    def __init__(self,
                 vocab_size_or_config_json_file=30522,
                 max_position_embeddings=512,
-                 sinusoidal_pos_embds=True,
+                 sinusoidal_pos_embds=False,
                 n_layers=6,
                 n_heads=12,
                 dim=768,
--- a/pytorch_transformers/configuration_gpt2.py
+++ b/pytorch_transformers/configuration_gpt2.py
--- a/pytorch_transformers/configuration_openai.py
+++ b/pytorch_transformers/configuration_openai.py
@@ -36,7 +36,6 @@ class OpenAIGPTConfig(PretrainedConfig):
    Args:
        vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file.
        n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLF]', ...)
        n_positions: Number of positional embeddings.
        n_ctx: Size of the causal mask (usually same as n_positions).
        n_embd: Dimensionality of the embeddings and hidden states.
--- a/pytorch_transformers/configuration_roberta.py
+++ b/pytorch_transformers/configuration_roberta.py
--- a/pytorch_transformers/configuration_transfo_xl.py
+++ b/pytorch_transformers/configuration_transfo_xl.py
@@ -95,10 +95,43 @@ class TransfoXLConfig(PretrainedConfig):
                 init_range=0.01,
                 proj_init_std=0.01,
                 init_std=0.02,
                 layer_norm_epsilon=1e-5,
                 **kwargs):
        """Constructs TransfoXLConfig.
        """
        super(TransfoXLConfig, self).__init__(**kwargs)
        self.n_token = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, int) else -1
        self.cutoffs = []
        self.cutoffs.extend(cutoffs)
        self.tie_weight = tie_weight
        if proj_share_all_but_first:
            self.tie_projs = [False] + [True] * len(self.cutoffs)
        else:
            self.tie_projs = [False] + [False] * len(self.cutoffs)
        self.d_model = d_model
        self.d_embed = d_embed
        self.d_head = d_head
        self.d_inner = d_inner
        self.div_val = div_val
        self.pre_lnorm = pre_lnorm
        self.n_layer = n_layer
        self.n_head = n_head
        self.tgt_len = tgt_len
        self.ext_len = ext_len
        self.mem_len = mem_len
        self.same_length = same_length
        self.attn_type = attn_type
        self.clamp_len = clamp_len
        self.sample_softmax = sample_softmax
        self.adaptive = adaptive
        self.dropout = dropout
        self.dropatt = dropatt
        self.untie_r = untie_r
        self.init = init
        self.init_range = init_range
        self.proj_init_std = proj_init_std
        self.init_std = init_std
        self.layer_norm_epsilon = layer_norm_epsilon
        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
                        and isinstance(vocab_size_or_config_json_file, unicode)):
@@ -106,39 +139,7 @@ class TransfoXLConfig(PretrainedConfig):
                json_config = json.loads(reader.read())
            for key, value in json_config.items():
                self.__dict__[key] = value
-        elif isinstance(vocab_size_or_config_json_file, int):
+        elif not isinstance(vocab_size_or_config_json_file, int):
-            self.n_token = vocab_size_or_config_json_file
-            self.cutoffs = []
-            self.cutoffs.extend(cutoffs)
-            self.tie_weight = tie_weight
-            if proj_share_all_but_first:
-                self.tie_projs = [False] + [True] * len(self.cutoffs)
-            else:
-                self.tie_projs = [False] + [False] * len(self.cutoffs)
-            self.d_model = d_model
-            self.d_embed = d_embed
-            self.d_head = d_head
-            self.d_inner = d_inner
-            self.div_val = div_val
-            self.pre_lnorm = pre_lnorm
-            self.n_layer = n_layer
-            self.n_head = n_head
-            self.tgt_len = tgt_len
-            self.ext_len = ext_len
-            self.mem_len = mem_len
-            self.same_length = same_length
-            self.attn_type = attn_type
-            self.clamp_len = clamp_len
-            self.sample_softmax = sample_softmax
-            self.adaptive = adaptive
-            self.dropout = dropout
-            self.dropatt = dropatt
-            self.untie_r = untie_r
-            self.init = init
-            self.init_range = init_range
-            self.proj_init_std = proj_init_std
-            self.init_std = init_std
-        else:
            raise ValueError("First argument must be either a vocabulary size (int)"
                             " or the path to a pretrained model config file (str)")
--- a/pytorch_transformers/configuration_utils.py
+++ b/pytorch_transformers/configuration_utils.py
@@ -54,11 +54,12 @@ class PretrainedConfig(object):
        self.output_attentions = kwargs.pop('output_attentions', False)
        self.output_hidden_states = kwargs.pop('output_hidden_states', False)
        self.torchscript = kwargs.pop('torchscript', False)
        self.use_bfloat16 = kwargs.pop('use_bfloat16', False)
        self.pruned_heads = kwargs.pop('pruned_heads', {})
    def save_pretrained(self, save_directory):
        """ Save a configuration object to the directory `save_directory`, so that it
-            can be re-loaded using the :func:`~pytorch_transformers.PretrainedConfig.from_pretrained` class method.
+            can be re-loaded using the :func:`~transformers.PretrainedConfig.from_pretrained` class method.
        """
        assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved"
@@ -66,16 +67,17 @@ class PretrainedConfig(object):
        output_config_file = os.path.join(save_directory, CONFIG_NAME)
        self.to_json_file(output_config_file)
        logger.info("Configuration saved in {}".format(output_config_file))
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
-        r""" Instantiate a :class:`~pytorch_transformers.PretrainedConfig` (or a derived class) from a pre-trained model configuration.
+        r""" Instantiate a :class:`~transformers.PretrainedConfig` (or a derived class) from a pre-trained model configuration.
        Parameters:
            pretrained_model_name_or_path: either:
                - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a path to a `directory` containing a configuration file saved using the :func:`~pytorch_transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
+                - a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
                - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
            cache_dir: (`optional`) string:
@@ -174,7 +176,7 @@ class PretrainedConfig(object):
        """Constructs a `Config` from a Python dictionary of parameters."""
        config = cls(vocab_size_or_config_json_file=-1)
        for key, value in json_object.items():
-            config.__dict__[key] = value
+            setattr(config, key, value)
        return config
    @classmethod
--- a/pytorch_transformers/configuration_xlm.py
+++ b/pytorch_transformers/configuration_xlm.py
@@ -56,8 +56,6 @@ class XLMConfig(PretrainedConfig):
        dropout: The dropout probabilitiy for all fully connected
            layers in the embeddings, encoder, and pooler.
        dropatt: The dropout ratio for the attention
            probabilities.
        max_position_embeddings: The maximum sequence length that this model might
            ever be used with. Typically set this to something large just in case
            (e.g., 512 or 1024 or 2048).
@@ -66,7 +64,6 @@ class XLMConfig(PretrainedConfig):
        layer_norm_eps: The epsilon used by LayerNorm.
        dropout: float, dropout rate.
        dropatt: float, dropout rate on attention probabilities.
        init: str, the initialization scheme, either "normal" or "uniform".
        init_range: float, initialize the parameters with a uniform distribution
            in [-init_range, init_range]. Only effective when init="uniform".
--- a/pytorch_transformers/configuration_xlnet.py
+++ b/pytorch_transformers/configuration_xlnet.py
@@ -49,14 +49,11 @@ class XLNetConfig(PretrainedConfig):
        dropout: The dropout probabilitiy for all fully connected
            layers in the embeddings, encoder, and pooler.
        dropatt: The dropout ratio for the attention
            probabilities.
        initializer_range: The sttdev of the truncated_normal_initializer for
            initializing all weight matrices.
        layer_norm_eps: The epsilon used by LayerNorm.
        dropout: float, dropout rate.
        dropatt: float, dropout rate on attention probabilities.
        init: str, the initialization scheme, either "normal" or "uniform".
        init_range: float, initialize the parameters with a uniform distribution
            in [-init_range, init_range]. Only effective when init="uniform".
@@ -80,6 +77,7 @@ class XLNetConfig(PretrainedConfig):
                 n_layer=24,
                 n_head=16,
                 d_inner=4096,
                 max_position_embeddings=512,
                 ff_activation="gelu",
                 untie_r=True,
                 attn_type="bi",
@@ -112,7 +110,7 @@ class XLNetConfig(PretrainedConfig):
            with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
                json_config = json.loads(reader.read())
            for key, value in json_config.items():
-                self.__dict__[key] = value
+                setattr(self, key, value)  # copy each JSON entry onto this instance
        elif isinstance(vocab_size_or_config_json_file, int):
            self.n_token = vocab_size_or_config_json_file
            self.d_model = d_model
--- a/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py
+++ b/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py
@@ -21,7 +21,7 @@ from __future__ import print_function
 import argparse
 import torch
-from pytorch_transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert
+from transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert
 import logging
 logging.basicConfig(level=logging.INFO)
--- a/transformers/convert_bert_pytorch_checkpoint_to_original_tf.py
+++ b/transformers/convert_bert_pytorch_checkpoint_to_original_tf.py
@@ -20,7 +20,7 @@ import argparse
 import torch
 import numpy as np
 import tensorflow as tf
-from pytorch_transformers import BertModel
+from transformers import BertModel
 def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str, model_name:str):
--- a/transformers/convert_gpt2_original_tf_checkpoint_to_pytorch.py
+++ b/transformers/convert_gpt2_original_tf_checkpoint_to_pytorch.py
@@ -21,7 +21,7 @@ from io import open
 import torch
-from pytorch_transformers import (CONFIG_NAME, WEIGHTS_NAME,
+from transformers import (CONFIG_NAME, WEIGHTS_NAME,
                                                     GPT2Config,
                                                     GPT2Model,
                                                     load_tf_weights_in_gpt2)
--- a/transformers/convert_openai_original_tf_checkpoint_to_pytorch.py
+++ b/transformers/convert_openai_original_tf_checkpoint_to_pytorch.py
@@ -21,7 +21,7 @@ from io import open
 import torch
-from pytorch_transformers import (CONFIG_NAME, WEIGHTS_NAME,
+from transformers import (CONFIG_NAME, WEIGHTS_NAME,
                                                     OpenAIGPTConfig,
                                                     OpenAIGPTModel,
                                                     load_tf_weights_in_openai_gpt)
--- a/transformers/convert_pytorch_checkpoint_to_tf2.py
+++ b/transformers/convert_pytorch_checkpoint_to_tf2.py
@@ -0,0 +1,233 @@
 # coding=utf-8
 # Copyright 2018 The HuggingFace Inc. team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """ Convert pytorch checkpoints to TensorFlow """
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 import os
 import argparse
 import tensorflow as tf
 from transformers import is_torch_available, cached_path
 from transformers import (BertConfig, TFBertForPreTraining, TFBertForQuestionAnswering, TFBertForSequenceClassification, load_bert_pt_weights_in_tf2, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
                                  GPT2Config, TFGPT2LMHeadModel, load_gpt2_pt_weights_in_tf2, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,
                                  XLNetConfig, TFXLNetLMHeadModel, load_xlnet_pt_weights_in_tf2, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
                                  XLMConfig, TFXLMWithLMHeadModel, load_xlm_pt_weights_in_tf2, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP,
                                  TransfoXLConfig, TFTransfoXLLMHeadModel, load_transfo_xl_pt_weights_in_tf2, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP,
                                  OpenAIGPTConfig, TFOpenAIGPTLMHeadModel, load_openai_gpt_pt_weights_in_tf2, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP,
                                  RobertaConfig, TFRobertaForMaskedLM, TFRobertaForSequenceClassification, load_roberta_pt_weights_in_tf2, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
                                  DistilBertConfig, TFDistilBertForMaskedLM, TFDistilBertForQuestionAnswering, load_distilbert_pt_weights_in_tf2, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP)
 if is_torch_available():
    import torch
    import numpy as np
    from transformers import (BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
                                      GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
                                      XLNetLMHeadModel, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
                                      XLMWithLMHeadModel, XLM_PRETRAINED_MODEL_ARCHIVE_MAP,
                                      TransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP,
                                      OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
                                      RobertaForMaskedLM, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
                                      DistilBertForMaskedLM, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
 else:
    (BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
    GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
    XLNetLMHeadModel, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
    XLMWithLMHeadModel, XLM_PRETRAINED_MODEL_ARCHIVE_MAP,
    TransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP,
    OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
    RobertaForMaskedLM, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
    DistilBertForMaskedLM, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,) = (
        None, None, None, None,
        None, None,
        None, None,
        None, None,
        None, None,
        None, None,
        None, None, None,
        None, None, None,)
 import logging
 logging.basicConfig(level=logging.INFO)
 MODEL_CLASSES = {
    'bert': (BertConfig, TFBertForPreTraining, load_bert_pt_weights_in_tf2, BertForPreTraining, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
    'bert-large-uncased-whole-word-masking-finetuned-squad': (BertConfig, TFBertForQuestionAnswering, load_bert_pt_weights_in_tf2, BertForQuestionAnswering, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
    'bert-large-cased-whole-word-masking-finetuned-squad': (BertConfig, TFBertForQuestionAnswering, load_bert_pt_weights_in_tf2, BertForQuestionAnswering, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
    'bert-base-cased-finetuned-mrpc': (BertConfig, TFBertForSequenceClassification, load_bert_pt_weights_in_tf2, BertForSequenceClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
    'gpt2': (GPT2Config, TFGPT2LMHeadModel, load_gpt2_pt_weights_in_tf2, GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP),
    'xlnet': (XLNetConfig, TFXLNetLMHeadModel, load_xlnet_pt_weights_in_tf2, XLNetLMHeadModel, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP),
    'xlm': (XLMConfig, TFXLMWithLMHeadModel, load_xlm_pt_weights_in_tf2, XLMWithLMHeadModel, XLM_PRETRAINED_MODEL_ARCHIVE_MAP, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP),
    'transfo-xl': (TransfoXLConfig, TFTransfoXLLMHeadModel, load_transfo_xl_pt_weights_in_tf2, TransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP),
    'openai-gpt': (OpenAIGPTConfig, TFOpenAIGPTLMHeadModel, load_openai_gpt_pt_weights_in_tf2, OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP),
    'roberta': (RobertaConfig, TFRobertaForMaskedLM, load_roberta_pt_weights_in_tf2, RobertaForMaskedLM, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP),
    'roberta-large-mnli': (RobertaConfig, TFRobertaForSequenceClassification, load_roberta_pt_weights_in_tf2, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP),
    'distilbert': (DistilBertConfig, TFDistilBertForMaskedLM, load_distilbert_pt_weights_in_tf2, DistilBertForMaskedLM, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
    'distilbert-base-uncased-distilled-squad': (DistilBertConfig, TFDistilBertForQuestionAnswering, load_distilbert_pt_weights_in_tf2, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
 }
 def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file, tf_dump_path, compare_with_pt_model=False, use_cached_models=True):
    """Convert one PyTorch checkpoint into TensorFlow weights saved in HDF5 format.

    Args:
        model_type: Key of ``MODEL_CLASSES`` selecting the config class, TF model class,
            weight-loading function, PyTorch model class and the two archive maps.
        pytorch_checkpoint_path: PyTorch checkpoint to load. If it is a key of the
            selected model archive map, it is first resolved through ``cached_path``.
        config_file: JSON configuration to build the model from. If it is a key of the
            selected config archive map, it is first resolved through ``cached_path``.
        tf_dump_path: Destination passed to ``tf_model.save_weights``.
        compare_with_pt_model: If True, run both models on a fixed dummy batch and
            assert that the first outputs differ by at most 2e-2 (max absolute value).
        use_cached_models: If False, ``cached_path`` is called with ``force_download=True``.

    Raises:
        ValueError: If ``model_type`` is not a key of ``MODEL_CLASSES``.
        AssertionError: If the comparison is requested and the difference exceeds 2e-2.
    """
    if model_type not in MODEL_CLASSES:
        raise ValueError("Unrecognized model type, should be one of {}.".format(list(MODEL_CLASSES.keys())))
    config_class, model_class, loading_fct, pt_model_class, aws_model_maps, aws_config_map = MODEL_CLASSES[model_type]
    # Initialise TF model
    if config_file in aws_config_map:
        config_file = cached_path(aws_config_map[config_file], force_download=not use_cached_models)
    config = config_class.from_json_file(config_file)
    # Ask the model to also return hidden states and attentions.
    config.output_hidden_states = True
    config.output_attentions = True
    print("Building TensorFlow model from configuration: {}".format(str(config)))
    tf_model = model_class(config)
    # Load the PyTorch checkpoint weights into the TF model
    if pytorch_checkpoint_path in aws_model_maps:
        pytorch_checkpoint_path = cached_path(aws_model_maps[pytorch_checkpoint_path], force_download=not use_cached_models)
    tf_model = loading_fct(tf_model, pytorch_checkpoint_path)
    if compare_with_pt_model:
        # Fixed dummy batch of token ids fed identically to both frameworks.
        inputs_list = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
        tf_inputs = tf.constant(inputs_list)
        tfo = tf_model(tf_inputs, training=False)  # build the network
        # NOTE(review): `torch`, `np` and `pt_model_class` are only usable when
        # is_torch_available() was true at import time -- confirm callers guarantee this.
        pt_model = pt_model_class.from_pretrained(None,
                                                  config=config,
                                                  state_dict=torch.load(pytorch_checkpoint_path,
                                                                        map_location='cpu'))
        pt_inputs = torch.tensor(inputs_list)
        with torch.no_grad():
            pto = pt_model(pt_inputs)
        # Only the first output of each model is compared.
        np_pt = pto[0].detach().numpy()
        np_tf = tfo[0].numpy()
        diff = np.amax(np.abs(np_pt - np_tf))
        print("Max absolute difference between models outputs {}".format(diff))
        assert diff <= 2e-2, "Error, model absolute difference is >2e-2"
    # Save the converted TensorFlow weights
    print("Save TensorFlow model to {}".format(tf_dump_path))
    tf_model.save_weights(tf_dump_path, save_format='h5')
 def convert_all_pt_checkpoints_to_tf(args_model_type, tf_dump_path, model_shortcut_names_or_path=None, config_shortcut_names_or_path=None,
                                     compare_with_pt_model=False, use_cached_models=False, only_convert_finetuned_models=False):
    """Convert a set of PyTorch checkpoints to TensorFlow HDF5 weight files.

    Args:
        args_model_type: Key of ``MODEL_CLASSES`` to convert, or None to iterate over
            every key of ``MODEL_CLASSES``.
        tf_dump_path: Existing directory; each converted checkpoint is written to
            ``<tf_dump_path>/<model_shortcut_name>-tf_model.h5``.
        model_shortcut_names_or_path: List of checkpoint shortcut names or paths.
            If None, every key of the model archive map of the current model type is used.
        config_shortcut_names_or_path: List of configuration shortcut names or paths,
            paired position-by-position with the checkpoints. If None, the checkpoint
            names are reused.
        compare_with_pt_model: Forwarded to ``convert_pt_checkpoint_to_tf``.
        use_cached_models: If False, ``cached_path`` is called with ``force_download=True``.
        only_convert_finetuned_models: If True, convert only checkpoints whose name
            contains ``-squad``, ``-mrpc`` or ``-mnli``; otherwise those are skipped.

    Raises:
        AssertionError: If ``tf_dump_path`` is not a directory.
        ValueError: If a model type is not a key of ``MODEL_CLASSES``.
    """
    # Check the parameter itself; the global `args` only exists when run as a script.
    assert os.path.isdir(tf_dump_path), "--tf_dump_path should be a directory"

    if args_model_type is None:
        model_types = list(MODEL_CLASSES.keys())
    else:
        model_types = [args_model_type]

    for j, model_type in enumerate(model_types, start=1):
        print("=" * 100)
        print(" Converting model type {}/{}: {}".format(j, len(model_types), model_type))
        print("=" * 100)
        if model_type not in MODEL_CLASSES:
            raise ValueError("Unrecognized model type {}, should be one of {}.".format(model_type, list(MODEL_CLASSES.keys())))

        config_class, model_class, loading_fct, pt_model_class, aws_model_maps, aws_config_map = MODEL_CLASSES[model_type]

        # Resolve the defaults into locals for each model type. Overwriting the
        # parameters would make every later model type reuse the first type's names.
        if model_shortcut_names_or_path is None:
            model_names = list(aws_model_maps.keys())
        else:
            model_names = model_shortcut_names_or_path
        if config_shortcut_names_or_path is None:
            config_names = model_names
        else:
            config_names = config_shortcut_names_or_path
        checkpoint_pairs = list(zip(model_names, config_names))

        for i, (model_shortcut_name, config_shortcut_name) in enumerate(checkpoint_pairs, start=1):
            print("-" * 100)
            # Fine-tuned checkpoints are registered in MODEL_CLASSES under their own
            # shortcut name; keep that choice local so the outer loop variable is untouched.
            checkpoint_model_type = model_type
            if '-squad' in model_shortcut_name or '-mrpc' in model_shortcut_name or '-mnli' in model_shortcut_name:
                if not only_convert_finetuned_models:
                    print("    Skipping finetuned checkpoint {}".format(model_shortcut_name))
                    continue
                checkpoint_model_type = model_shortcut_name
            elif only_convert_finetuned_models:
                print("    Skipping not finetuned checkpoint {}".format(model_shortcut_name))
                continue
            print("    Converting checkpoint {}/{}: {} - model_type {}".format(i, len(checkpoint_pairs), model_shortcut_name, checkpoint_model_type))
            print("-" * 100)

            if config_shortcut_name in aws_config_map:
                config_file = cached_path(aws_config_map[config_shortcut_name], force_download=not use_cached_models)
            else:
                config_file = cached_path(config_shortcut_name, force_download=not use_cached_models)

            if model_shortcut_name in aws_model_maps:
                model_file = cached_path(aws_model_maps[model_shortcut_name], force_download=not use_cached_models)
            else:
                model_file = cached_path(model_shortcut_name, force_download=not use_cached_models)

            convert_pt_checkpoint_to_tf(checkpoint_model_type,
                                        model_file,
                                        config_file,
                                        os.path.join(tf_dump_path, model_shortcut_name + '-tf_model.h5'),
                                        compare_with_pt_model=compare_with_pt_model)

            # Remove only copies fetched into the cache. A path that comes back
            # unchanged is the caller's own local file and must not be deleted.
            if config_file != config_shortcut_name:
                os.remove(config_file)
            if model_file != model_shortcut_name and model_file != config_file:
                os.remove(model_file)
 if __name__ == "__main__":
    # Command-line entry point: parse the options and convert the selected checkpoints.
    parser = argparse.ArgumentParser()
    ## Required parameters
    parser.add_argument("--tf_dump_path",
                        default = None,
                        type = str,
                        required = True,
                        help = "Path to the output directory where the TensorFlow dump files are written.")
    parser.add_argument("--model_type",
                        default = None,
                        type = str,
                        help = "Model type selected in the list of {}. If not given, will download and convert all the models from AWS.".format(list(MODEL_CLASSES.keys())))
    parser.add_argument("--pytorch_checkpoint_path",
                        default = None,
                        type = str,
                        help = "Path to the PyTorch checkpoint path or shortcut name to download from AWS. "
                               "If not given, will download and convert all the checkpoints from AWS.")
    parser.add_argument("--config_file",
                        default = None,
                        type = str,
                        help = "The config json file corresponding to the pre-trained model. \n"
                               "This specifies the model architecture. If not given and "
                               "--pytorch_checkpoint_path is not given or is a shortcut name, "
                               "use the configuration associated to the shortcut name on the AWS")
    parser.add_argument("--compare_with_pt_model",
                        action='store_true',
                        help = "Compare Tensorflow and PyTorch model predictions.")
    parser.add_argument("--use_cached_models",
                        action='store_true',
                        help = "Use cached models if possible instead of updating to latest checkpoint versions.")
    parser.add_argument("--only_convert_finetuned_models",
                        action='store_true',
                        help = "Only convert finetuned models.")
    args = parser.parse_args()

    # if args.pytorch_checkpoint_path is not None:
    #     convert_pt_checkpoint_to_tf(args.model_type.lower(),
    #                                 args.pytorch_checkpoint_path,
    #                                 args.config_file if args.config_file is not None else args.pytorch_checkpoint_path,
    #                                 args.tf_dump_path,
    #                                 compare_with_pt_model=args.compare_with_pt_model,
    #                                 use_cached_models=args.use_cached_models)
    # else:

    # A single explicit checkpoint is wrapped in a list; None means "all checkpoints".
    checkpoint_names = [args.pytorch_checkpoint_path] if args.pytorch_checkpoint_path is not None else None
    # --config_file describes one specific checkpoint, so forward it only when a
    # checkpoint was given; otherwise the checkpoint names double as config names.
    config_names = None
    if checkpoint_names is not None and args.config_file is not None:
        config_names = [args.config_file]

    convert_all_pt_checkpoints_to_tf(args.model_type.lower() if args.model_type is not None else None,
                                     args.tf_dump_path,
                                     model_shortcut_names_or_path=checkpoint_names,
                                     config_shortcut_names_or_path=config_names,
                                     compare_with_pt_model=args.compare_with_pt_model,
                                     use_cached_models=args.use_cached_models,
                                     only_convert_finetuned_models=args.only_convert_finetuned_models)
--- a/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py
+++ b/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py
@@ -23,12 +23,12 @@ import torch
 from fairseq.models.roberta import RobertaModel as FairseqRobertaModel
 from fairseq.modules import TransformerSentenceEncoderLayer
-from pytorch_transformers import (BertConfig, BertEncoder,
+from transformers import (BertConfig, BertEncoder,
                                                BertIntermediate, BertLayer,
                                                BertModel, BertOutput,
                                                BertSelfAttention,
                                                BertSelfOutput)
-from pytorch_transformers import (RobertaEmbeddings,
+from transformers import (RobertaEmbeddings,
                                                   RobertaForMaskedLM,
                                                   RobertaForSequenceClassification,
                                                   RobertaModel)
--- a/transformers/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py
+++ b/transformers/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py
@@ -23,12 +23,12 @@ from io import open
 import torch
-import pytorch_transformers.tokenization_transfo_xl as data_utils
+import transformers.tokenization_transfo_xl as data_utils
-from pytorch_transformers import CONFIG_NAME, WEIGHTS_NAME
+from transformers import CONFIG_NAME, WEIGHTS_NAME
-from pytorch_transformers import (TransfoXLConfig, TransfoXLLMHeadModel,
+from transformers import (TransfoXLConfig, TransfoXLLMHeadModel,
                                                      load_tf_weights_in_transfo_xl)
-from pytorch_transformers.tokenization_transfo_xl import (CORPUS_NAME, VOCAB_FILES_NAMES)
+from transformers.tokenization_transfo_xl import (CORPUS_NAME, VOCAB_FILES_NAMES)
 if sys.version_info[0] == 2:
    import cPickle as pickle
--- a/transformers/convert_xlm_original_pytorch_checkpoint_to_pytorch.py
+++ b/transformers/convert_xlm_original_pytorch_checkpoint_to_pytorch.py
@@ -23,8 +23,8 @@ from io import open
 import torch
 import numpy
-from pytorch_transformers import CONFIG_NAME, WEIGHTS_NAME
+from transformers import CONFIG_NAME, WEIGHTS_NAME
-from pytorch_transformers.tokenization_xlm import VOCAB_FILES_NAMES
+from transformers.tokenization_xlm import VOCAB_FILES_NAMES
 import logging
 logging.basicConfig(level=logging.INFO)
@@ -33,7 +33,15 @@ def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_p
    # Load checkpoint
    chkpt = torch.load(xlm_checkpoint_path, map_location='cpu')
-    model = chkpt['model']
+    state_dict = chkpt['model']
    # We have the base model one level deeper than the original XLM repository
    two_levels_state_dict = {}
    for k, v in state_dict.items():
        if 'pred_layer' in k:
            two_levels_state_dict[k] = v
        else:
            two_levels_state_dict['transformer.' + k] = v
    config = chkpt['params']
    config = dict((n, v) for n, v in config.items() if not isinstance(v, (torch.FloatTensor, numpy.ndarray)))
@@ -47,7 +55,7 @@ def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_p
    pytorch_vocab_dump_path = pytorch_dump_folder_path + '/' +  VOCAB_FILES_NAMES['vocab_file']
    print("Save PyTorch model to {}".format(pytorch_weights_dump_path))
-    torch.save(model, pytorch_weights_dump_path)
+    torch.save(two_levels_state_dict, pytorch_weights_dump_path)
    print("Save configuration file to {}".format(pytorch_config_dump_path))
    with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
--- a/transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py
+++ b/transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py
@@ -22,7 +22,7 @@ import os
 import argparse
 import torch
-from pytorch_transformers import (CONFIG_NAME, WEIGHTS_NAME,
+from transformers import (CONFIG_NAME, WEIGHTS_NAME,
                                                    XLNetConfig,
                                                    XLNetLMHeadModel, XLNetForQuestionAnswering,
                                                    XLNetForSequenceClassification,
--- a/transformers/data/init.py
+++ b/transformers/data/init.py
@@ -0,0 +1,6 @@
 from .processors import InputExample, InputFeatures, DataProcessor
 from .processors import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
 from .metrics import is_sklearn_available
 if is_sklearn_available():
    from .metrics import glue_compute_metrics
--- a/transformers/data/metrics/init.py
+++ b/transformers/data/metrics/init.py
@@ -0,0 +1,83 @@
 # coding=utf-8
 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
 # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import csv
 import sys
 import logging
 logger = logging.getLogger(__name__)
 try:
    from scipy.stats import pearsonr, spearmanr
    from sklearn.metrics import matthews_corrcoef, f1_score
    _has_sklearn = True
 except (AttributeError, ImportError) as e:
    logger.warning("To use data.metrics please install scikit-learn. See https://scikit-learn.org/stable/index.html")
    _has_sklearn = False
 def is_sklearn_available():
    """Return the module-level flag recording whether the scipy/scikit-learn imports above succeeded."""
    return _has_sklearn
 if _has_sklearn:
    def simple_accuracy(preds, labels):
        """Return the mean of the element-wise comparison ``preds == labels``."""
        matches = preds == labels
        return matches.mean()
    def acc_and_f1(preds, labels):
        """Return accuracy, F1 and their arithmetic mean as a dict."""
        scores = {
            "acc": simple_accuracy(preds, labels),
            "f1": f1_score(y_true=labels, y_pred=preds),
        }
        # Combined score: plain average of the two metrics.
        scores["acc_and_f1"] = (scores["acc"] + scores["f1"]) / 2
        return scores
    def pearson_and_spearman(preds, labels):
        """Return the Pearson and Spearman correlation coefficients and their arithmetic mean as a dict."""
        # Index 0 of each result is the coefficient; the remainder is discarded.
        scores = {
            "pearson": pearsonr(preds, labels)[0],
            "spearmanr": spearmanr(preds, labels)[0],
        }
        scores["corr"] = (scores["pearson"] + scores["spearmanr"]) / 2
        return scores
    def glue_compute_metrics(task_name, preds, labels):
        """Compute the metric dict for the GLUE task named ``task_name``.

        Raises:
            AssertionError: If ``preds`` and ``labels`` differ in length.
            KeyError: If ``task_name`` is not one of the handled task names.
        """
        assert len(preds) == len(labels)
        if task_name == "cola":
            return {"mcc": matthews_corrcoef(labels, preds)}
        if task_name in ("mrpc", "qqp"):
            return acc_and_f1(preds, labels)
        if task_name == "sts-b":
            return pearson_and_spearman(preds, labels)
        # The remaining tasks report plain accuracy only.
        if task_name in ("sst-2", "mnli", "mnli-mm", "qnli", "rte", "wnli"):
            return {"acc": simple_accuracy(preds, labels)}
        raise KeyError(task_name)
--- a/transformers/data/processors/init.py
+++ b/transformers/data/processors/init.py
@@ -0,0 +1,3 @@
 from .utils import InputExample, InputFeatures, DataProcessor
 from .glue import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
--- a/transformers/data/processors/glue.py
+++ b/transformers/data/processors/glue.py
@@ -13,84 +13,154 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" BERT classification fine-tuning: utilities to work with GLUE tasks """
+""" GLUE processors and helpers """
 from __future__ import absolute_import, division, print_function
 import csv
 import logging
 import os
 import sys
 from io import open
-from scipy.stats import pearsonr, spearmanr
+from .utils import DataProcessor, InputExample, InputFeatures
-from sklearn.metrics import matthews_corrcoef, f1_score
+from ...file_utils import is_tf_available
 if is_tf_available():
    import tensorflow as tf
 logger = logging.getLogger(__name__)
-class InputExample(object):
+def glue_convert_examples_to_features(examples, tokenizer,
-    """A single training/test example for simple sequence classification."""
+                                      max_length=512,
                                      task=None,
                                      label_list=None,
                                      output_mode=None,
                                      pad_on_left=False,
                                      pad_token=0,
                                      pad_token_segment_id=0,
                                      mask_padding_with_zero=True):
    """
    Loads a data file into a list of ``InputFeatures``
-    def __init__(self, guid, text_a, text_b=None, label=None):
+    Args:
-        """Constructs a InputExample.
+        examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples.
        tokenizer: Instance of a tokenizer that will tokenize the examples
        max_length: Maximum example length
        task: GLUE task
        label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method
        output_mode: String indicating the output mode. Either ``regression`` or ``classification``
        pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default)
        pad_token: Padding token
        pad_token_segment_id: The segment ID for the padding token (It is usually 0, but can vary such as for XLNet where it is 4)
        mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values
            and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for
            actual values)
-        Args:
+    Returns:
-            guid: Unique id for the example.
+        If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset``
-            text_a: string. The untokenized text of the first sequence. For single
+        containing the task-specific features. If the input is a list of ``InputExamples``, will return
-            sequence tasks, only this sequence must be specified.
+        a list of task-specific ``InputFeatures`` which can be fed to the model.
            text_b: (Optional) string. The untokenized text of the second sequence.
            Only must be specified for sequence pair tasks.
            label: (Optional) string. The label of the example. This should be
            specified for train and dev examples, but not for test examples.
        """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label
    """
    is_tf_dataset = False
    if is_tf_available() and isinstance(examples, tf.data.Dataset):
        is_tf_dataset = True
-class InputFeatures(object):
+    if task is not None:
-    """A single set of features of data."""
+        processor = glue_processors[task]()
        if label_list is None:
            label_list = processor.get_labels()
            logger.info("Using label list %s for task %s" % (label_list, task))
        if output_mode is None:
            output_mode = glue_output_modes[task]
            logger.info("Using output mode %s for task %s" % (output_mode, task))
-    def __init__(self, input_ids, input_mask, segment_ids, label_id):
+    label_map = {label: i for i, label in enumerate(label_list)}
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id
    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d" % (ex_index))
        if is_tf_dataset:
            example = processor.get_example_from_tensor_dict(example)
-class DataProcessor(object):
+        inputs = tokenizer.encode_plus(
-    """Base class for data converters for sequence classification data sets."""
+            example.text_a,
            example.text_b,
            add_special_tokens=True,
            max_length=max_length,
            truncate_first_sequence=True  # We're truncating the first sequence in priority
        )
        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
-    def get_train_examples(self, data_dir):
+        # The mask has 1 for real tokens and 0 for padding tokens. Only real
-        """Gets a collection of `InputExample`s for the train set."""
+        # tokens are attended to.
-        raise NotImplementedError()
+        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
-    def get_dev_examples(self, data_dir):
+        # Zero-pad up to the sequence length.
-        """Gets a collection of `InputExample`s for the dev set."""
+        padding_length = max_length - len(input_ids)
-        raise NotImplementedError()
+        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
            token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
        else:
            input_ids = input_ids + ([pad_token] * padding_length)
            attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
            token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)
-    def get_labels(self):
+        assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
-        """Gets the list of labels for this data set."""
+        assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length)
-        raise NotImplementedError()
+        assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length)
-    @classmethod
+        if output_mode == "classification":
-    def _read_tsv(cls, input_file, quotechar=None):
+            label = label_map[example.label]
-        """Reads a tab separated value file."""
+        elif output_mode == "regression":
-        with open(input_file, "r", encoding="utf-8-sig") as f:
+            label = float(example.label)
-            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
+        else:
-            lines = []
+            raise KeyError(output_mode)
-            for line in reader:
+
-                if sys.version_info[0] == 2:
+        if ex_index < 5:
-                    line = list(unicode(cell, 'utf-8') for cell in line)
+            logger.info("*** Example ***")
-                lines.append(line)
+            logger.info("guid: %s" % (example.guid))
-            return lines
+            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
            logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))
            logger.info("label: %s (id = %d)" % (example.label, label))
        features.append(
                InputFeatures(input_ids=input_ids,
                              attention_mask=attention_mask,
                              token_type_ids=token_type_ids,
                              label=label))
    if is_tf_available() and is_tf_dataset:
        def gen():
            for ex in features:
                yield  ({'input_ids': ex.input_ids,
                         'attention_mask': ex.attention_mask,
                         'token_type_ids': ex.token_type_ids},
                        ex.label)
        return tf.data.Dataset.from_generator(gen,
            ({'input_ids': tf.int32,
              'attention_mask': tf.int32,
              'token_type_ids': tf.int32},
             tf.int64),
            ({'input_ids': tf.TensorShape([None]),
              'attention_mask': tf.TensorShape([None]),
              'token_type_ids': tf.TensorShape([None])},
             tf.TensorShape([])))
    return features
 class MrpcProcessor(DataProcessor):
    """Processor for the MRPC data set (GLUE version)."""
    def get_example_from_tensor_dict(self, tensor_dict):
        """Build an ``InputExample`` from the 'idx', 'sentence1', 'sentence2' and 'label' entries of ``tensor_dict``."""
        guid = tensor_dict['idx'].numpy()
        first_sentence = tensor_dict['sentence1'].numpy().decode('utf-8')
        second_sentence = tensor_dict['sentence2'].numpy().decode('utf-8')
        # The label is passed on as a string.
        label = str(tensor_dict['label'].numpy())
        return InputExample(guid, first_sentence, second_sentence, label)
    def get_train_examples(self, data_dir):
        """See base class."""
        logger.info("LOOKING AT {}".format(os.path.join(data_dir, "train.tsv")))
@@ -124,6 +194,13 @@ class MrpcProcessor(DataProcessor):
 class MnliProcessor(DataProcessor):
    """Processor for the MultiNLI data set (GLUE version)."""
    def get_example_from_tensor_dict(self, tensor_dict):
        """Build an ``InputExample`` from one MultiNLI record given as a dict of tensors.

        Reads the ``idx``, ``premise``, ``hypothesis`` and ``label`` entries via
        ``.numpy()``; both texts are decoded from UTF-8 bytes and the label is
        converted with ``str``.
        """
        guid = tensor_dict['idx'].numpy()
        premise = tensor_dict['premise'].numpy().decode('utf-8')
        hypothesis = tensor_dict['hypothesis'].numpy().decode('utf-8')
        label = str(tensor_dict['label'].numpy())
        return InputExample(guid=guid,
                            text_a=premise,
                            text_b=hypothesis,
                            label=label)
    def get_train_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
@@ -167,6 +244,13 @@ class MnliMismatchedProcessor(MnliProcessor):
 class ColaProcessor(DataProcessor):
    """Processor for the CoLA data set (GLUE version)."""
    def get_example_from_tensor_dict(self, tensor_dict):
        """Build an ``InputExample`` from one CoLA record given as a dict of tensors.

        Reads the ``idx``, ``sentence`` and ``label`` entries via ``.numpy()``;
        the sentence is decoded from UTF-8 bytes and the label is converted with
        ``str``. The second text slot is always ``None`` (single-sentence task).
        """
        guid = tensor_dict['idx'].numpy()
        sentence = tensor_dict['sentence'].numpy().decode('utf-8')
        label = str(tensor_dict['label'].numpy())
        return InputExample(guid=guid,
                            text_a=sentence,
                            text_b=None,
                            label=label)
    def get_train_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
@@ -196,6 +280,13 @@ class ColaProcessor(DataProcessor):
 class Sst2Processor(DataProcessor):
    """Processor for the SST-2 data set (GLUE version)."""
    def get_example_from_tensor_dict(self, tensor_dict):
        """Build an ``InputExample`` from one SST-2 record given as a dict of tensors.

        Reads the ``idx``, ``sentence`` and ``label`` entries via ``.numpy()``;
        the sentence is decoded from UTF-8 bytes and the label is converted with
        ``str``. The second text slot is always ``None`` (single-sentence task).
        """
        guid = tensor_dict['idx'].numpy()
        sentence = tensor_dict['sentence'].numpy().decode('utf-8')
        label = str(tensor_dict['label'].numpy())
        return InputExample(guid=guid,
                            text_a=sentence,
                            text_b=None,
                            label=label)
    def get_train_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
@@ -227,6 +318,13 @@ class Sst2Processor(DataProcessor):
 class StsbProcessor(DataProcessor):
    """Processor for the STS-B data set (GLUE version)."""
    def get_example_from_tensor_dict(self, tensor_dict):
        """Build an ``InputExample`` from one STS-B record given as a dict of tensors.

        Reads the ``idx``, ``sentence1``, ``sentence2`` and ``label`` entries via
        ``.numpy()``; both sentences are decoded from UTF-8 bytes and the label
        is converted with ``str``.
        """
        guid = tensor_dict['idx'].numpy()
        first_sentence = tensor_dict['sentence1'].numpy().decode('utf-8')
        second_sentence = tensor_dict['sentence2'].numpy().decode('utf-8')
        label = str(tensor_dict['label'].numpy())
        return InputExample(guid=guid,
                            text_a=first_sentence,
                            text_b=second_sentence,
                            label=label)
    def get_train_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
@@ -259,6 +357,13 @@ class StsbProcessor(DataProcessor):
 class QqpProcessor(DataProcessor):
    """Processor for the QQP data set (GLUE version)."""
    def get_example_from_tensor_dict(self, tensor_dict):
        """Build an ``InputExample`` from one QQP record given as a dict of tensors.

        Reads the ``idx``, ``question1``, ``question2`` and ``label`` entries via
        ``.numpy()``; both questions are decoded from UTF-8 bytes and the label
        is converted with ``str``.
        """
        guid = tensor_dict['idx'].numpy()
        first_question = tensor_dict['question1'].numpy().decode('utf-8')
        second_question = tensor_dict['question2'].numpy().decode('utf-8')
        label = str(tensor_dict['label'].numpy())
        return InputExample(guid=guid,
                            text_a=first_question,
                            text_b=second_question,
                            label=label)
    def get_train_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
@@ -294,6 +399,13 @@ class QqpProcessor(DataProcessor):
 class QnliProcessor(DataProcessor):
    """Processor for the QNLI data set (GLUE version)."""
    def get_example_from_tensor_dict(self, tensor_dict):
        """Build an ``InputExample`` from one QNLI record given as a dict of tensors.

        Reads the ``idx``, ``question``, ``sentence`` and ``label`` entries via
        ``.numpy()``; both texts are decoded from UTF-8 bytes and the label is
        converted with ``str``.
        """
        guid = tensor_dict['idx'].numpy()
        question = tensor_dict['question'].numpy().decode('utf-8')
        sentence = tensor_dict['sentence'].numpy().decode('utf-8')
        label = str(tensor_dict['label'].numpy())
        return InputExample(guid=guid,
                            text_a=question,
                            text_b=sentence,
                            label=label)
    def get_train_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
@@ -302,7 +414,7 @@ class QnliProcessor(DataProcessor):
    def get_dev_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
-            self._read_tsv(os.path.join(data_dir, "dev.tsv")), 
+            self._read_tsv(os.path.join(data_dir, "dev.tsv")),
            "dev_matched")
    def get_labels(self):
@@ -327,6 +439,13 @@ class QnliProcessor(DataProcessor):
 class RteProcessor(DataProcessor):
    """Processor for the RTE data set (GLUE version)."""
    def get_example_from_tensor_dict(self, tensor_dict):
        """Build an ``InputExample`` from one RTE record given as a dict of tensors.

        Reads the ``idx``, ``sentence1``, ``sentence2`` and ``label`` entries via
        ``.numpy()``; both sentences are decoded from UTF-8 bytes and the label
        is converted with ``str``.
        """
        guid = tensor_dict['idx'].numpy()
        first_sentence = tensor_dict['sentence1'].numpy().decode('utf-8')
        second_sentence = tensor_dict['sentence2'].numpy().decode('utf-8')
        label = str(tensor_dict['label'].numpy())
        return InputExample(guid=guid,
                            text_a=first_sentence,
                            text_b=second_sentence,
                            label=label)
    def get_train_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
@@ -359,6 +478,13 @@ class RteProcessor(DataProcessor):
 class WnliProcessor(DataProcessor):
    """Processor for the WNLI data set (GLUE version)."""
    def get_example_from_tensor_dict(self, tensor_dict):
        """Build an ``InputExample`` from one WNLI record given as a dict of tensors.

        Reads the ``idx``, ``sentence1``, ``sentence2`` and ``label`` entries via
        ``.numpy()``; both sentences are decoded from UTF-8 bytes and the label
        is converted with ``str``.
        """
        guid = tensor_dict['idx'].numpy()
        first_sentence = tensor_dict['sentence1'].numpy().decode('utf-8')
        second_sentence = tensor_dict['sentence2'].numpy().decode('utf-8')
        label = str(tensor_dict['label'].numpy())
        return InputExample(guid=guid,
                            text_a=first_sentence,
                            text_b=second_sentence,
                            label=label)
    def get_train_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
@@ -387,198 +513,19 @@ class WnliProcessor(DataProcessor):
                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
        return examples
 # Number of labels for each GLUE task, keyed by lower-case task name.
 # NOTE(review): "sts-b" is 1, presumably because it is treated as a
 # regression task - confirm against the output-mode table.
 glue_tasks_num_labels = {
    "cola": 2,
    "mnli": 3,
    "mrpc": 2,
    "sst-2": 2,
    "sts-b": 1,
    "qqp": 2,
    "qnli": 2,
    "rte": 2,
    "wnli": 2,
 }
-def convert_examples_to_features(examples, label_list, max_seq_length,
+glue_processors = {
                                 tokenizer, output_mode,
                                 cls_token_at_end=False,
                                 cls_token='[CLS]',
                                 cls_token_segment_id=1,
                                 sep_token='[SEP]',
                                 sep_token_extra=False,
                                 pad_on_left=False,
                                 pad_token=0,
                                 pad_token_segment_id=0,
                                 sequence_a_segment_id=0, 
                                 sequence_b_segment_id=1,
                                 mask_padding_with_zero=True):
    """ Loads a data file into a list of `InputBatch`s
        `cls_token_at_end` define the location of the CLS token:
            - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
            - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
        `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)
    """
    label_map = {label : i for i, label in enumerate(label_list)}
    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d of %d" % (ex_index, len(examples)))
        tokens_a = tokenizer.tokenize(example.text_a)
        tokens_b = None
        if example.text_b:
            tokens_b = tokenizer.tokenize(example.text_b)
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3". " -4" for RoBERTa.
            special_tokens_count = 4 if sep_token_extra else 3
            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - special_tokens_count)
        else:
            # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
            special_tokens_count = 3 if sep_token_extra else 2
            if len(tokens_a) > max_seq_length - special_tokens_count:
                tokens_a = tokens_a[:(max_seq_length - special_tokens_count)]
        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids:   0   0   0   0  0     0   0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens = tokens_a + [sep_token]
        if sep_token_extra:
            # roberta uses an extra separator b/w pairs of sentences
            tokens += [sep_token]
        segment_ids = [sequence_a_segment_id] * len(tokens)
        if tokens_b:
            tokens += tokens_b + [sep_token]
            segment_ids += [sequence_b_segment_id] * (len(tokens_b) + 1)
        if cls_token_at_end:
            tokens = tokens + [cls_token]
            segment_ids = segment_ids + [cls_token_segment_id]
        else:
            tokens = [cls_token] + tokens
            segment_ids = [cls_token_segment_id] + segment_ids
        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
        # Zero-pad up to the sequence length.
        padding_length = max_seq_length - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask
            segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
        else:
            input_ids = input_ids + ([pad_token] * padding_length)
            input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
            segment_ids = segment_ids + ([pad_token_segment_id] * padding_length)
        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length
        if output_mode == "classification":
            label_id = label_map[example.label]
        elif output_mode == "regression":
            label_id = float(example.label)
        else:
            raise KeyError(output_mode)
        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("tokens: %s" % " ".join(
                    [str(x) for x in tokens]))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
            logger.info("label: %s (id = %d)" % (example.label, label_id))
        features.append(
                InputFeatures(input_ids=input_ids,
                              input_mask=input_mask,
                              segment_ids=segment_ids,
                              label_id=label_id))
    return features
 def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""
    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()
 def simple_accuracy(preds, labels):
    """Return the mean of the element-wise comparison ``preds == labels``.

    NOTE(review): relies on ``preds == labels`` producing an object with a
    ``.mean()`` method (e.g. numpy arrays) - confirm callers never pass lists.
    """
    matches = preds == labels
    return matches.mean()
 def acc_and_f1(preds, labels):
    """Compute accuracy, F1 and their arithmetic mean.

    Returns:
        dict with keys ``"acc"`` (from ``simple_accuracy``), ``"f1"`` (from
        ``f1_score`` with ``labels`` as ground truth) and ``"acc_and_f1"``
        (the average of the two).
    """
    scores = {"acc": simple_accuracy(preds, labels)}
    scores["f1"] = f1_score(y_true=labels, y_pred=preds)
    scores["acc_and_f1"] = (scores["acc"] + scores["f1"]) / 2
    return scores
 def pearson_and_spearman(preds, labels):
    """Compute Pearson and Spearman correlations and their arithmetic mean.

    Returns:
        dict with keys ``"pearson"``, ``"spearmanr"`` (the first element of the
        respective ``pearsonr`` / ``spearmanr`` result) and ``"corr"`` (the
        average of the two coefficients).
    """
    correlations = {
        "pearson": pearsonr(preds, labels)[0],
        "spearmanr": spearmanr(preds, labels)[0],
    }
    correlations["corr"] = (correlations["pearson"] + correlations["spearmanr"]) / 2
    return correlations
 def compute_metrics(task_name, preds, labels):
    """Dispatch to the metric used for the given GLUE task name.

    Args:
        task_name: lower-case task identifier (e.g. ``"cola"``, ``"mnli-mm"``).
        preds: predictions; must have the same length as ``labels``.
        labels: reference labels.

    Returns:
        dict of metric name to value: ``{"mcc": ...}`` for ``"cola"``,
        the ``acc_and_f1`` dict for ``"mrpc"`` / ``"qqp"``, the
        ``pearson_and_spearman`` dict for ``"sts-b"``, and ``{"acc": ...}``
        for every other known task.

    Raises:
        AssertionError: if ``preds`` and ``labels`` differ in length.
        KeyError: if ``task_name`` is not a known task.
    """
    assert len(preds) == len(labels)
    if task_name == "cola":
        return {"mcc": matthews_corrcoef(labels, preds)}
    if task_name in ("mrpc", "qqp"):
        return acc_and_f1(preds, labels)
    if task_name == "sts-b":
        return pearson_and_spearman(preds, labels)
    if task_name in ("sst-2", "mnli", "mnli-mm", "qnli", "rte", "wnli"):
        return {"acc": simple_accuracy(preds, labels)}
    raise KeyError(task_name)
 processors = {
    "cola": ColaProcessor,
    "mnli": MnliProcessor,
    "mnli-mm": MnliMismatchedProcessor,
@@ -591,7 +538,7 @@ processors = {
    "wnli": WnliProcessor,
 }
-output_modes = {
+glue_output_modes = {
    "cola": "classification",
    "mnli": "classification",
    "mnli-mm": "classification",
@@ -603,15 +550,3 @@ output_modes = {
    "rte": "classification",
    "wnli": "classification",
 }
 # Number of labels for each GLUE task, keyed by lower-case task name.
 GLUE_TASKS_NUM_LABELS = {
    "cola": 2,
    "mnli": 3,
    "mrpc": 2,
    "sst-2": 2,
    "sts-b": 1,
    "qqp": 2,
    "qnli": 2,
    "rte": 2,
    "wnli": 2,
 }
--- a/transformers/data/processors/utils.py
+++ b/transformers/data/processors/utils.py
@@ -0,0 +1,120 @@
 # coding=utf-8
 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
 # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import csv
 import sys
 import copy
 import json
 class InputExample(object):
    """
    One raw example (train or test) for a simple sequence classification task.

    Args:
        guid: Unique id for the example.
        text_a: string. Untokenized text of the first sequence; the only text
            needed for single-sequence tasks.
        text_b: (Optional) string. Untokenized text of the second sequence;
            needed only for sequence-pair tasks.
        label: (Optional) string. Label of the example; expected for train and
            dev examples, but not for test examples.
    """

    def __init__(self, guid, text_a, text_b=None, label=None):
        self.guid, self.text_a = guid, text_a
        self.text_b, self.label = text_b, label

    def __repr__(self):
        # The printable form is simply the JSON serialization.
        serialized = self.to_json_string()
        return str(serialized)

    def to_dict(self):
        """Return a deep copy of the instance attributes as a Python dict."""
        return copy.deepcopy(vars(self))

    def to_json_string(self):
        """Return the attributes as indented, key-sorted JSON ending in a newline."""
        payload = json.dumps(self.to_dict(), indent=2, sort_keys=True)
        return payload + "\n"
 class InputFeatures(object):
    """
    The features computed for a single example.

    Args:
        input_ids: Indices of input sequence tokens in the vocabulary.
        attention_mask: Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            Usually ``1`` for tokens that are NOT MASKED, ``0`` for MASKED (padded) tokens.
        token_type_ids: Segment token indices to indicate first and second portions of the inputs.
        label: Label corresponding to the input
    """

    def __init__(self, input_ids, attention_mask, token_type_ids, label):
        self.input_ids, self.attention_mask = input_ids, attention_mask
        self.token_type_ids, self.label = token_type_ids, label

    def __repr__(self):
        # The printable form is simply the JSON serialization.
        serialized = self.to_json_string()
        return str(serialized)

    def to_dict(self):
        """Return a deep copy of the instance attributes as a Python dict."""
        return copy.deepcopy(vars(self))

    def to_json_string(self):
        """Return the attributes as indented, key-sorted JSON ending in a newline."""
        payload = json.dumps(self.to_dict(), indent=2, sort_keys=True)
        return payload + "\n"
 class DataProcessor(object):
    """Abstract base for converters of sequence classification data sets."""

    def get_example_from_tensor_dict(self, tensor_dict):
        """Build an example from a dict of tensorflow tensors.

        Args:
            tensor_dict: Keys and values should match the corresponding Glue
                tensorflow_dataset examples.
        """
        raise NotImplementedError()

    def get_train_examples(self, data_dir):
        """Return a collection of `InputExample`s for the train set."""
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Return a collection of `InputExample`s for the dev set."""
        raise NotImplementedError()

    def get_labels(self):
        """Return the list of labels for this data set."""
        raise NotImplementedError()

    @classmethod
    def _read_tsv(cls, input_file, quotechar=None):
        """Read a tab-separated file and return its rows as a list of lists.

        The file is opened as ``utf-8-sig`` text, so a leading byte-order mark
        is dropped. ``quotechar`` is forwarded to ``csv.reader`` unchanged.
        """
        with open(input_file, "r", encoding="utf-8-sig") as handle:
            rows = []
            for row in csv.reader(handle, delimiter="\t", quotechar=quotechar):
                if sys.version_info[0] == 2:
                    # Python 2 only: turn byte-string cells into unicode.
                    row = list(unicode(cell, 'utf-8') for cell in row)
                rows.append(row)
            return rows
--- a/pytorch_transformers/file_utils.py
+++ b/pytorch_transformers/file_utils.py
@@ -23,6 +23,24 @@ from botocore.exceptions import ClientError
 import requests
 from tqdm import tqdm
 logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
 try:
    import tensorflow as tf
    assert int(tf.__version__[0]) >= 2
    _tf_available = True  # pylint: disable=invalid-name
    logger.info("TensorFlow version {} available.".format(tf.__version__))
 except (ImportError, AssertionError):
    _tf_available = False  # pylint: disable=invalid-name
 try:
    import torch
    _torch_available = True  # pylint: disable=invalid-name
    logger.info("PyTorch version {} available.".format(torch.__version__))
 except ImportError:
    _torch_available = False  # pylint: disable=invalid-name
 try:
    from torch.hub import _get_torch_home
    torch_cache_home = _get_torch_home()
@@ -30,7 +48,7 @@ except ImportError:
    torch_cache_home = os.path.expanduser(
        os.getenv('TORCH_HOME', os.path.join(
            os.getenv('XDG_CACHE_HOME', '~/.cache'), 'torch')))
-default_cache_path = os.path.join(torch_cache_home, 'pytorch_transformers')
+default_cache_path = os.path.join(torch_cache_home, 'transformers')
 try:
    from urllib.parse import urlparse
@@ -47,12 +65,18 @@ except (AttributeError, ImportError):
                                                        default_cache_path))
 PYTORCH_TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE  # Kept for backward compatibility
 TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE  # Kept for backward compatibility
 WEIGHTS_NAME = "pytorch_model.bin"
 TF2_WEIGHTS_NAME = 'tf_model.h5'
 TF_WEIGHTS_NAME = 'model.ckpt'
 CONFIG_NAME = "config.json"
-logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
+def is_torch_available():
    return _torch_available
 def is_tf_available():
    """Return the module-level ``_tf_available`` flag.

    The flag is set once when this module is imported: ``True`` when
    ``import tensorflow`` succeeded and the version check passed, ``False``
    otherwise.
    """
    return _tf_available
 if not six.PY2:
    def add_start_docstrings(*docstr):
@@ -83,6 +107,9 @@ def url_to_filename(url, etag=None):
    Convert `url` into a hashed filename in a repeatable way.
    If `etag` is specified, append its hash to the url's, delimited
    by a period.
    If the url ends with .h5 (Keras HDF5 weights), adds '.h5' to the name
    so that TF 2.0 can identify it as a HDF5 file
    (see https://github.com/tensorflow/tensorflow/blob/00fad90125b18b80fe054de1055770cfb8fe4ba3/tensorflow/python/keras/engine/network.py#L1380)
    """
    url_bytes = url.encode('utf-8')
    url_hash = sha256(url_bytes)
@@ -93,6 +120,9 @@ def url_to_filename(url, etag=None):
        etag_hash = sha256(etag_bytes)
        filename += '.' + etag_hash.hexdigest()
    if url.endswith('.h5'):
        filename += '.h5'
    return filename
@@ -102,7 +132,7 @@ def filename_to_url(filename, cache_dir=None):
    Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist.
    """
    if cache_dir is None:
-        cache_dir = PYTORCH_TRANSFORMERS_CACHE
+        cache_dir = TRANSFORMERS_CACHE
    if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
        cache_dir = str(cache_dir)
@@ -133,7 +163,7 @@ def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=N
        force_download: if True, re-download the file even if it's already cached in the cache dir.
    """
    if cache_dir is None:
-        cache_dir = PYTORCH_TRANSFORMERS_CACHE
+        cache_dir = TRANSFORMERS_CACHE
    if sys.version_info[0] == 3 and isinstance(url_or_filename, Path):
        url_or_filename = str(url_or_filename)
    if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
@@ -222,7 +252,7 @@ def get_from_cache(url, cache_dir=None, force_download=False, proxies=None):
    If it's not there, download it. Then return the path to the cached file.
    """
    if cache_dir is None:
-        cache_dir = PYTORCH_TRANSFORMERS_CACHE
+        cache_dir = TRANSFORMERS_CACHE
    if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
        cache_dir = str(cache_dir)
    if sys.version_info[0] == 2 and not isinstance(cache_dir, str):
--- a/pytorch_transformers/modeling_auto.py
+++ b/pytorch_transformers/modeling_auto.py
@@ -36,7 +36,7 @@ logger = logging.getLogger(__name__)
 class AutoModel(object):
    r"""
-        :class:`~pytorch_transformers.AutoModel` is a generic model class
+        :class:`~transformers.AutoModel` is a generic model class
        that will be instantiated as one of the base model classes of the library
        when created with the `AutoModel.from_pretrained(pretrained_model_name_or_path)`
        class method.
@@ -84,23 +84,23 @@ class AutoModel(object):
            pretrained_model_name_or_path: either:
                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
+                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
            model_args: (`optional`) Sequence of positional arguments:
                All remaining positional arguments will be passed to the underlying model's ``__init__`` method
-            config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`:
+            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
                Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:
                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
-                - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
+                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
                - the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
            state_dict: (`optional`) dict:
                an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file.
                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
-                In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option.
+                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
            cache_dir: (`optional`) string:
                Path to a directory in which a downloaded pre-trained model
@@ -120,7 +120,7 @@ class AutoModel(object):
                Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
-                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
+                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
        Examples::
@@ -157,7 +157,7 @@ class AutoModel(object):
 class AutoModelWithLMHead(object):
    r"""
-        :class:`~pytorch_transformers.AutoModelWithLMHead` is a generic model class
+        :class:`~transformers.AutoModelWithLMHead` is a generic model class
        that will be instantiated as one of the language modeling model classes of the library
        when created with the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)`
        class method.
@@ -208,23 +208,23 @@ class AutoModelWithLMHead(object):
            pretrained_model_name_or_path: either:
                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
+                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
            model_args: (`optional`) Sequence of positional arguments:
                All remaining positional arguments will be passed to the underlying model's ``__init__`` method
-            config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`:
+            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
                Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:
                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
-                - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
+                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
                - the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
            state_dict: (`optional`) dict:
                an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file.
                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
-                In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option.
+                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
            cache_dir: (`optional`) string:
                Path to a directory in which a downloaded pre-trained model
@@ -244,7 +244,7 @@ class AutoModelWithLMHead(object):
                Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
-                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
+                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
        Examples::
@@ -281,7 +281,7 @@ class AutoModelWithLMHead(object):
 class AutoModelForSequenceClassification(object):
    r"""
-        :class:`~pytorch_transformers.AutoModelForSequenceClassification` is a generic model class
+        :class:`~transformers.AutoModelForSequenceClassification` is a generic model class
        that will be instantiated as one of the sequence classification model classes of the library
        when created with the `AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)`
        class method.
@@ -326,23 +326,23 @@ class AutoModelForSequenceClassification(object):
            pretrained_model_name_or_path: either:
                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
+                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
            model_args: (`optional`) Sequence of positional arguments:
                All remaning positional arguments will be passed to the underlying model's ``__init__`` method
-            config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`:
+            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
                Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
-                - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
+                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
                - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
            state_dict: (`optional`) dict:
                an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
-                In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option.
+                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
            cache_dir: (`optional`) string:
                Path to a directory in which a downloaded pre-trained model
@@ -362,7 +362,7 @@ class AutoModelForSequenceClassification(object):
                Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
-                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
+                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
        Examples::
@@ -392,7 +392,7 @@ class AutoModelForSequenceClassification(object):
 class AutoModelForQuestionAnswering(object):
    r"""
-        :class:`~pytorch_transformers.AutoModelForQuestionAnswering` is a generic model class
+        :class:`~transformers.AutoModelForQuestionAnswering` is a generic model class
        that will be instantiated as one of the question answering model classes of the library
        when created with the `AutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)`
        class method.
@@ -435,23 +435,23 @@ class AutoModelForQuestionAnswering(object):
            pretrained_model_name_or_path: either:
                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
+                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
            model_args: (`optional`) Sequence of positional arguments:
                All remaning positional arguments will be passed to the underlying model's ``__init__`` method
-            config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`:
+            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
                Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
-                - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
+                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
                - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
            state_dict: (`optional`) dict:
                an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
-                In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option.
+                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
            cache_dir: (`optional`) string:
                Path to a directory in which a downloaded pre-trained model
@@ -471,7 +471,7 @@ class AutoModelForQuestionAnswering(object):
                Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
-                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
+                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
        Examples::
--- a/pytorch_transformers/modeling_bert.py
+++ b/pytorch_transformers/modeling_bert.py
@@ -118,26 +118,27 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
 def gelu(x):
-    """Implementation of the gelu activation function.
+    """ Original Implementation of the gelu activation function in Google Bert repo when initially created.
        For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
        0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
        Also see https://arxiv.org/abs/1606.08415
    """
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
 def gelu_new(x):
    """ Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT).
        Also see https://arxiv.org/abs/1606.08415
    """
    return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
 def swish(x):
    return x * torch.sigmoid(x)
-ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish}
+ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish, "gelu_new": gelu_new}
-try:
+BertLayerNorm = torch.nn.LayerNorm
    from apex.normalization.fused_layer_norm import FusedLayerNorm as BertLayerNorm
 except (ImportError, AttributeError) as e:
    logger.info("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .")
    BertLayerNorm = torch.nn.LayerNorm
 class BertEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings.
@@ -195,7 +196,7 @@ class BertSelfAttention(nn.Module):
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)
-    def forward(self, hidden_states, attention_mask, head_mask=None):
+    def forward(self, hidden_states, attention_mask=None, head_mask=None):
        mixed_query_layer = self.query(hidden_states)
        mixed_key_layer = self.key(hidden_states)
        mixed_value_layer = self.value(hidden_states)
@@ -207,8 +208,9 @@ class BertSelfAttention(nn.Module):
        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
-        # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
+        if attention_mask is not None:
-        attention_scores = attention_scores + attention_mask
+            # Apply the attention mask (precomputed for all layers in BertModel forward() function)
            attention_scores = attention_scores + attention_mask
        # Normalize the attention scores to probabilities.
        attention_probs = nn.Softmax(dim=-1)(attention_scores)
@@ -275,7 +277,7 @@ class BertAttention(nn.Module):
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)
-    def forward(self, input_tensor, attention_mask, head_mask=None):
+    def forward(self, input_tensor, attention_mask=None, head_mask=None):
        self_outputs = self.self(input_tensor, attention_mask, head_mask)
        attention_output = self.output(self_outputs[0], input_tensor)
        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
@@ -318,7 +320,7 @@ class BertLayer(nn.Module):
        self.intermediate = BertIntermediate(config)
        self.output = BertOutput(config)
-    def forward(self, hidden_states, attention_mask, head_mask=None):
+    def forward(self, hidden_states, attention_mask=None, head_mask=None):
        attention_outputs = self.attention(hidden_states, attention_mask, head_mask)
        attention_output = attention_outputs[0]
        intermediate_output = self.intermediate(attention_output)
@@ -334,7 +336,7 @@ class BertEncoder(nn.Module):
        self.output_hidden_states = config.output_hidden_states
        self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])
-    def forward(self, hidden_states, attention_mask, head_mask=None):
+    def forward(self, hidden_states, attention_mask=None, head_mask=None):
        all_hidden_states = ()
        all_attentions = ()
        for i, layer_module in enumerate(self.layer):
@@ -480,9 +482,9 @@ BERT_START_DOCSTRING = r"""    The BERT model was proposed in
        https://pytorch.org/docs/stable/nn.html#module
    Parameters:
-        config (:class:`~pytorch_transformers.BertConfig`): Model configuration class with all the parameters of the model. 
+        config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model. 
            Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
 BERT_INPUTS_DOCSTRING = r"""
@@ -506,9 +508,9 @@ BERT_INPUTS_DOCSTRING = r"""
            Bert is a model with absolute position embeddings so it's usually advised to pad the inputs on
            the right rather than the left.
-            Indices can be obtained using :class:`pytorch_transformers.BertTokenizer`.
+            Indices can be obtained using :class:`transformers.BertTokenizer`.
-            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
+            See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
--- a/pytorch_transformers/modeling_distilbert.py
+++ b/pytorch_transformers/modeling_distilbert.py
@@ -372,9 +372,9 @@ DISTILBERT_START_DOCSTRING = r"""
        https://medium.com/huggingface/distilbert-8cf3380435b5
    Parameters:
-        config (:class:`~pytorch_transformers.DistilBertConfig`): Model configuration class with all the parameters of the model. 
+        config (:class:`~transformers.DistilBertConfig`): Model configuration class with all the parameters of the model. 
            Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
 DISTILBERT_INPUTS_DOCSTRING = r"""
--- a/pytorch_transformers/modeling_gpt2.py
+++ b/pytorch_transformers/modeling_gpt2.py
@@ -280,9 +280,9 @@ GPT2_START_DOCSTRING = r"""    OpenAI GPT-2 model was proposed in
        https://pytorch.org/docs/stable/nn.html#module
    Parameters:
-        config (:class:`~pytorch_transformers.GPT2Config`): Model configuration class with all the parameters of the model.
+        config (:class:`~transformers.GPT2Config`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
 GPT2_INPUTS_DOCSTRING = r"""    Inputs:
@@ -290,9 +290,9 @@ GPT2_INPUTS_DOCSTRING = r"""    Inputs:
            Indices of input sequence tokens in the vocabulary.
            GPT-2 is a model with absolute position embeddings so it's usually advised to pad the inputs on
            the right rather than the left.
-            Indices can be obtained using :class:`pytorch_transformers.GPT2Tokenizer`.
+            Indices can be obtained using :class:`transformers.GPT2Tokenizer`.
-            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
+            See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
        **past**:
            list of ``torch.FloatTensor`` (one for each layer):
            that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
@@ -367,6 +367,13 @@ class GPT2Model(GPT2PreTrainedModel):
            self.h[layer].attn.prune_heads(heads)
    def forward(self, input_ids, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
        input_shape = input_ids.size()
        input_ids = input_ids.view(-1, input_shape[-1])
        if token_type_ids is not None:
            token_type_ids = token_type_ids.view(-1, input_shape[-1])
        if position_ids is not None:
            position_ids = position_ids.view(-1, input_shape[-1])
        if past is None:
            past_length = 0
            past = [None] * len(self.h)
@@ -378,6 +385,7 @@ class GPT2Model(GPT2PreTrainedModel):
        # Attention mask.
        if attention_mask is not None:
            attention_mask = attention_mask.view(-1, input_shape[-1])
            # We create a 3D attention mask from a 2D tensor mask.
            # Sizes are [batch_size, 1, 1, to_seq_length]
            # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
@@ -407,14 +415,9 @@ class GPT2Model(GPT2PreTrainedModel):
        else:
            head_mask = [None] * self.config.n_layer
        input_shape = input_ids.size()
        input_ids = input_ids.view(-1, input_ids.size(-1))
        position_ids = position_ids.view(-1, position_ids.size(-1))
        inputs_embeds = self.wte(input_ids)
        position_embeds = self.wpe(position_ids)
        if token_type_ids is not None:
            token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1))
            token_type_embeds = self.wte(token_type_ids)
        else:
            token_type_embeds = 0
@@ -490,7 +493,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
    Examples::
        import torch
-        from pytorch_transformers import GPT2Tokenizer, GPT2LMHeadModel
+        from transformers import GPT2Tokenizer, GPT2LMHeadModel
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        model = GPT2LMHeadModel.from_pretrained('gpt2')
@@ -586,7 +589,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
    Examples::
        import torch
-        from pytorch_transformers import GPT2Tokenizer, GPT2DoubleHeadsModel
+        from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        model = GPT2DoubleHeadsModel.from_pretrained('gpt2')
--- a/pytorch_transformers/modeling_openai.py
+++ b/pytorch_transformers/modeling_openai.py
@@ -294,9 +294,9 @@ OPENAI_GPT_START_DOCSTRING = r"""    OpenAI GPT model was proposed in
        https://pytorch.org/docs/stable/nn.html#module
    Parameters:
-        config (:class:`~pytorch_transformers.OpenAIGPTConfig`): Model configuration class with all the parameters of the model.
+        config (:class:`~transformers.OpenAIGPTConfig`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
 OPENAI_GPT_INPUTS_DOCSTRING = r"""    Inputs:
@@ -304,9 +304,9 @@ OPENAI_GPT_INPUTS_DOCSTRING = r"""    Inputs:
            Indices of input sequence tokens in the vocabulary.
            GPT is a model with absolute position embeddings so it's usually advised to pad the inputs on
            the right rather than the left.
-            Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`.
+            Indices can be obtained using :class:`transformers.OpenAIGPTTokenizer`.
-            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
+            See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
--- a/pytorch_transformers/modeling_roberta.py
+++ b/pytorch_transformers/modeling_roberta.py
@@ -43,6 +43,9 @@ class RobertaEmbeddings(BertEmbeddings):
    def __init__(self, config):
        super(RobertaEmbeddings, self).__init__(config)
        self.padding_idx = 1
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=self.padding_idx)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size,
                                                padding_idx=self.padding_idx)
    def forward(self, input_ids, token_type_ids=None, position_ids=None):
        seq_length = input_ids.size(1)
@@ -77,9 +80,9 @@ ROBERTA_START_DOCSTRING = r"""    The RoBERTa model was proposed in
        https://pytorch.org/docs/stable/nn.html#module
    Parameters:
-        config (:class:`~pytorch_transformers.RobertaConfig`): Model configuration class with all the parameters of the 
+        config (:class:`~transformers.RobertaConfig`): Model configuration class with all the parameters of the 
            model. Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
 ROBERTA_INPUTS_DOCSTRING = r"""
@@ -102,8 +105,8 @@ ROBERTA_INPUTS_DOCSTRING = r"""
            RoBERTa is a model with absolute position embeddings so it's usually advised to pad the inputs on
            the right rather than the left.
-            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
+            See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
@@ -361,9 +364,9 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
                ``token_type_ids:   0   0   0   0  0     0   0``
-            Indices can be obtained using :class:`pytorch_transformers.BertTokenizer`.
+            Indices can be obtained using :class:`transformers.BertTokenizer`.
-            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
+            See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
            Segment token indices to indicate first and second portions of the inputs.
            The second dimension of the input (`num_choices`) indicates the number of choices to score.
--- a/transformers/modeling_tf_auto.py
+++ b/transformers/modeling_tf_auto.py
@@ -0,0 +1,501 @@
 # coding=utf-8
 # Copyright 2018 The HuggingFace Inc. team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """ Auto Model class. """
 from __future__ import absolute_import, division, print_function, unicode_literals
 import logging
 from .modeling_tf_bert import TFBertModel, TFBertForMaskedLM, TFBertForSequenceClassification, TFBertForQuestionAnswering
 from .modeling_tf_openai import TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel
 from .modeling_tf_gpt2 import TFGPT2Model, TFGPT2LMHeadModel
 from .modeling_tf_transfo_xl import TFTransfoXLModel, TFTransfoXLLMHeadModel
 from .modeling_tf_xlnet import TFXLNetModel, TFXLNetLMHeadModel, TFXLNetForSequenceClassification, TFXLNetForQuestionAnsweringSimple
 from .modeling_tf_xlm import TFXLMModel, TFXLMWithLMHeadModel, TFXLMForSequenceClassification, TFXLMForQuestionAnsweringSimple
 from .modeling_tf_roberta import TFRobertaModel, TFRobertaForMaskedLM, TFRobertaForSequenceClassification
 from .modeling_tf_distilbert import TFDistilBertModel, TFDistilBertForQuestionAnswering, TFDistilBertForMaskedLM, TFDistilBertForSequenceClassification
 from .file_utils import add_start_docstrings
 logger = logging.getLogger(__name__)
 class TFAutoModel(object):
    r"""
        Generic factory for the base model classes of the library.
        :class:`~transformers.TFAutoModel` is never built directly: calling
        `TFAutoModel.from_pretrained(pretrained_model_name_or_path)` returns an
        instance of one of the concrete base model classes instead.
        The concrete class is chosen by substring matching on the
        `pretrained_model_name_or_path` string. Patterns are tried in the order
        listed below and the first one found is used:
            - contains `distilbert`: TFDistilBertModel (DistilBERT model)
            - contains `roberta`: TFRobertaModel (RoBERTa model)
            - contains `bert`: TFBertModel (Bert model)
            - contains `openai-gpt`: TFOpenAIGPTModel (OpenAI GPT model)
            - contains `gpt2`: TFGPT2Model (OpenAI GPT-2 model)
            - contains `transfo-xl`: TFTransfoXLModel (Transformer-XL model)
            - contains `xlnet`: TFXLNetModel (XLNet model)
            - contains `xlm`: TFXLMModel (XLM model)
        Calling `__init__()` on this class always raises an error.
    """
    def __init__(self):
        # Direct construction is unsupported; point the caller at the factory method.
        raise EnvironmentError(
            "TFAutoModel is designed to be instantiated "
            "using the `TFAutoModel.from_pretrained(pretrained_model_name_or_path)` method.")
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        r""" Build one of the base model classes of the library
        from a pre-trained model configuration.
        The concrete class is picked by substring matching on
        `pretrained_model_name_or_path`; patterns are tested in this order and
        the first hit is used:
            - contains `distilbert`: TFDistilBertModel (DistilBERT model)
            - contains `roberta`: TFRobertaModel (RoBERTa model)
            - contains `bert`: TFBertModel (Bert model)
            - contains `openai-gpt`: TFOpenAIGPTModel (OpenAI GPT model)
            - contains `gpt2`: TFGPT2Model (OpenAI GPT-2 model)
            - contains `transfo-xl`: TFTransfoXLModel (Transformer-XL model)
            - contains `xlnet`: TFXLNetModel (XLNet model)
            - contains `xlm`: TFXLMModel (XLM model)
        All arguments are forwarded unchanged to the ``from_pretrained`` method
        of the selected class. A ``ValueError`` is raised when no pattern matches.
        Params:
            pretrained_model_name_or_path: one of:
                - the `shortcut name` of a pre-trained model to fetch from the cache or download, e.g.: ``bert-base-uncased``.
                - the path of a `directory` holding model weights written by :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                - the path or url of a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). For a PyTorch checkpoint, ``from_pt`` must be set to True and a configuration object must be supplied through the ``config`` argument.
            from_pt: (`optional`) boolean:
                Set to True when the checkpoint is a PyTorch checkpoint.
            model_args: (`optional`) Sequence of positional arguments:
                Every remaining positional argument is handed to the underlying model's ``__init__`` method.
            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
                Configuration to use in place of an automatically loaded one. The configuration can be loaded automatically when:
                - the model is provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
                - the model was saved with :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory, or
                - the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and that directory contains a configuration JSON file named `config.json`.
            state_dict: (`optional`) dict:
                A state dictionary to use in place of the one loaded from the saved weights file.
                Useful for creating a model from a pretrained configuration while loading your own weights.
                Before relying on it, check whether :func:`~transformers.PreTrainedModel.save_pretrained` followed by :func:`~transformers.PreTrainedModel.from_pretrained` would be simpler.
            cache_dir: (`optional`) string:
                Directory in which a downloaded pre-trained model configuration
                is cached when the standard cache should not be used.
            force_download: (`optional`) boolean, default False:
                Force a (re-)download of the model weights and configuration files, overriding any cached versions.
            proxies: (`optional`) dict, default None:
                Proxy servers to use, keyed by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
                The proxies are applied to every request.
            output_loading_info: (`optional`) boolean:
                Set to ``True`` to additionally return a dictionary of missing keys, unexpected keys and error messages.
            kwargs: (`optional`) Remaining dictionary of keyword arguments:
                Used to update the configuration object (once loaded) and to initialise the model (e.g. ``output_attention=True``). The behaviour depends on whether a ``config`` is supplied:
                - With an explicit ``config``, ``**kwargs`` go straight to the underlying model's ``__init__`` method (all configuration updates are assumed to be done already).
                - Without one, ``kwargs`` are first given to the configuration class initialisation function (:func:`~transformers.PretrainedConfig.from_pretrained`). Keys matching a configuration attribute override that attribute; the remaining keys are passed on to the underlying model's ``__init__`` function.
        Examples::
            model = TFAutoModel.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
            model = TFAutoModel.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/bert_model/')`
            model = TFAutoModel.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
            assert model.config.output_attention == True
            # Loading from a PyTorch checkpoint file instead of a TF 2.0 model (slower)
            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
            model = TFAutoModel.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config)
        """
        # Ordered (substring, model class) pairs. `distilbert` and `roberta` come
        # before `bert` because `bert` is itself a substring of both.
        dispatch = (
            ('distilbert', TFDistilBertModel),
            ('roberta', TFRobertaModel),
            ('bert', TFBertModel),
            ('openai-gpt', TFOpenAIGPTModel),
            ('gpt2', TFGPT2Model),
            ('transfo-xl', TFTransfoXLModel),
            ('xlnet', TFXLNetModel),
            ('xlm', TFXLMModel),
        )
        for pattern, model_class in dispatch:
            if pattern in pretrained_model_name_or_path:
                return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        # No known architecture name appears in the identifier.
        raise ValueError("Unrecognized model identifier in {}. Should contains one of "
                         "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
                         "'xlm', 'roberta'".format(pretrained_model_name_or_path))
 class TFAutoModelWithLMHead(object):
    r"""
        Generic factory for the language modeling model classes of the library.
        :class:`~transformers.TFAutoModelWithLMHead` is never built directly: calling
        `TFAutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` returns
        an instance of one of the concrete language modeling classes instead.
        The concrete class is chosen by substring matching on the
        `pretrained_model_name_or_path` string. Patterns are tried in the order
        listed below and the first one found is used:
            - contains `distilbert`: TFDistilBertForMaskedLM (DistilBERT model)
            - contains `roberta`: TFRobertaForMaskedLM (RoBERTa model)
            - contains `bert`: TFBertForMaskedLM (Bert model)
            - contains `openai-gpt`: TFOpenAIGPTLMHeadModel (OpenAI GPT model)
            - contains `gpt2`: TFGPT2LMHeadModel (OpenAI GPT-2 model)
            - contains `transfo-xl`: TFTransfoXLLMHeadModel (Transformer-XL model)
            - contains `xlnet`: TFXLNetLMHeadModel (XLNet model)
            - contains `xlm`: TFXLMWithLMHeadModel (XLM model)
        Calling `__init__()` on this class always raises an error.
    """
    def __init__(self):
        # Direct construction is unsupported; point the caller at the factory method.
        raise EnvironmentError(
            "TFAutoModelWithLMHead is designed to be instantiated "
            "using the `TFAutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` method.")
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        r""" Build one of the language modeling model classes of the library
        from a pre-trained model configuration.
        The concrete class is picked by substring matching on
        `pretrained_model_name_or_path`; patterns are tested in this order and
        the first hit is used:
            - contains `distilbert`: TFDistilBertForMaskedLM (DistilBERT model)
            - contains `roberta`: TFRobertaForMaskedLM (RoBERTa model)
            - contains `bert`: TFBertForMaskedLM (Bert model)
            - contains `openai-gpt`: TFOpenAIGPTLMHeadModel (OpenAI GPT model)
            - contains `gpt2`: TFGPT2LMHeadModel (OpenAI GPT-2 model)
            - contains `transfo-xl`: TFTransfoXLLMHeadModel (Transformer-XL model)
            - contains `xlnet`: TFXLNetLMHeadModel (XLNet model)
            - contains `xlm`: TFXLMWithLMHeadModel (XLM model)
        All arguments are forwarded unchanged to the ``from_pretrained`` method
        of the selected class. A ``ValueError`` is raised when no pattern matches.
        Params:
            pretrained_model_name_or_path: one of:
                - the `shortcut name` of a pre-trained model to fetch from the cache or download, e.g.: ``bert-base-uncased``.
                - the path of a `directory` holding model weights written by :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                - the path or url of a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). For a PyTorch checkpoint, ``from_pt`` must be set to True and a configuration object must be supplied through the ``config`` argument.
            from_pt: (`optional`) boolean:
                Set to True when the checkpoint is a PyTorch checkpoint.
            model_args: (`optional`) Sequence of positional arguments:
                Every remaining positional argument is handed to the underlying model's ``__init__`` method.
            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
                Configuration to use in place of an automatically loaded one. The configuration can be loaded automatically when:
                - the model is provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
                - the model was saved with :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory, or
                - the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and that directory contains a configuration JSON file named `config.json`.
            state_dict: (`optional`) dict:
                A state dictionary to use in place of the one loaded from the saved weights file.
                Useful for creating a model from a pretrained configuration while loading your own weights.
                Before relying on it, check whether :func:`~transformers.PreTrainedModel.save_pretrained` followed by :func:`~transformers.PreTrainedModel.from_pretrained` would be simpler.
            cache_dir: (`optional`) string:
                Directory in which a downloaded pre-trained model configuration
                is cached when the standard cache should not be used.
            force_download: (`optional`) boolean, default False:
                Force a (re-)download of the model weights and configuration files, overriding any cached versions.
            proxies: (`optional`) dict, default None:
                Proxy servers to use, keyed by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
                The proxies are applied to every request.
            output_loading_info: (`optional`) boolean:
                Set to ``True`` to additionally return a dictionary of missing keys, unexpected keys and error messages.
            kwargs: (`optional`) Remaining dictionary of keyword arguments:
                Used to update the configuration object (once loaded) and to initialise the model (e.g. ``output_attention=True``). The behaviour depends on whether a ``config`` is supplied:
                - With an explicit ``config``, ``**kwargs`` go straight to the underlying model's ``__init__`` method (all configuration updates are assumed to be done already).
                - Without one, ``kwargs`` are first given to the configuration class initialisation function (:func:`~transformers.PretrainedConfig.from_pretrained`). Keys matching a configuration attribute override that attribute; the remaining keys are passed on to the underlying model's ``__init__`` function.
        Examples::
            model = TFAutoModelWithLMHead.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
            model = TFAutoModelWithLMHead.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/bert_model/')`
            model = TFAutoModelWithLMHead.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
            assert model.config.output_attention == True
            # Loading from a PyTorch checkpoint file instead of a TF 2.0 model (slower)
            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
            model = TFAutoModelWithLMHead.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config)
        """
        # Ordered (substring, model class) pairs. `distilbert` and `roberta` come
        # before `bert` because `bert` is itself a substring of both.
        dispatch = (
            ('distilbert', TFDistilBertForMaskedLM),
            ('roberta', TFRobertaForMaskedLM),
            ('bert', TFBertForMaskedLM),
            ('openai-gpt', TFOpenAIGPTLMHeadModel),
            ('gpt2', TFGPT2LMHeadModel),
            ('transfo-xl', TFTransfoXLLMHeadModel),
            ('xlnet', TFXLNetLMHeadModel),
            ('xlm', TFXLMWithLMHeadModel),
        )
        for pattern, model_class in dispatch:
            if pattern in pretrained_model_name_or_path:
                return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        # No known architecture name appears in the identifier.
        raise ValueError("Unrecognized model identifier in {}. Should contains one of "
                         "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
                         "'xlm', 'roberta'".format(pretrained_model_name_or_path))
 class TFAutoModelForSequenceClassification(object):
    r"""
        Generic factory for the sequence classification model classes of the library.
        :class:`~transformers.TFAutoModelForSequenceClassification` is never built
        directly: calling
        `TFAutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)`
        returns an instance of one of the concrete sequence classification classes instead.
        The concrete class is chosen by substring matching on the
        `pretrained_model_name_or_path` string. Patterns are tried in the order
        listed below and the first one found is used:
            - contains `distilbert`: TFDistilBertForSequenceClassification (DistilBERT model)
            - contains `roberta`: TFRobertaForSequenceClassification (RoBERTa model)
            - contains `bert`: TFBertForSequenceClassification (Bert model)
            - contains `xlnet`: TFXLNetForSequenceClassification (XLNet model)
            - contains `xlm`: TFXLMForSequenceClassification (XLM model)
        Calling `__init__()` on this class always raises an error.
    """
    def __init__(self):
        # Direct construction is unsupported. The message names this class (not
        # TFAutoModelWithLMHead) so the caller is sent to the right factory method.
        raise EnvironmentError(
            "TFAutoModelForSequenceClassification is designed to be instantiated "
            "using the `TFAutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)` method.")
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        r""" Build one of the sequence classification model classes of the library
        from a pre-trained model configuration.
        The concrete class is picked by substring matching on
        `pretrained_model_name_or_path`; patterns are tested in this order and
        the first hit is used:
            - contains `distilbert`: TFDistilBertForSequenceClassification (DistilBERT model)
            - contains `roberta`: TFRobertaForSequenceClassification (RoBERTa model)
            - contains `bert`: TFBertForSequenceClassification (Bert model)
            - contains `xlnet`: TFXLNetForSequenceClassification (XLNet model)
            - contains `xlm`: TFXLMForSequenceClassification (XLM model)
        All arguments are forwarded unchanged to the ``from_pretrained`` method
        of the selected class. A ``ValueError`` is raised when no pattern matches.
        NOTE(review): `model.eval()` / `model.train()` are PyTorch module methods and
        are not called here; for the TF 2.0 classes dropout is presumably controlled by
        the ``training`` argument passed when calling the model -- confirm against the
        TF model classes.
        Params:
            pretrained_model_name_or_path: one of:
                - the `shortcut name` of a pre-trained model to fetch from the cache or download, e.g.: ``bert-base-uncased``.
                - the path of a `directory` holding model weights written by :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                - the path or url of a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). For a PyTorch checkpoint, ``from_pt`` must be set to True and a configuration object must be supplied through the ``config`` argument.
            from_pt: (`optional`) boolean:
                Set to True when the checkpoint is a PyTorch checkpoint.
            model_args: (`optional`) Sequence of positional arguments:
                Every remaining positional argument is handed to the underlying model's ``__init__`` method.
            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
                Configuration to use in place of an automatically loaded one. The configuration can be loaded automatically when:
                - the model is provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
                - the model was saved with :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory, or
                - the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and that directory contains a configuration JSON file named `config.json`.
            state_dict: (`optional`) dict:
                A state dictionary to use in place of the one loaded from the saved weights file.
                Useful for creating a model from a pretrained configuration while loading your own weights.
                Before relying on it, check whether :func:`~transformers.PreTrainedModel.save_pretrained` followed by :func:`~transformers.PreTrainedModel.from_pretrained` would be simpler.
            cache_dir: (`optional`) string:
                Directory in which a downloaded pre-trained model configuration
                is cached when the standard cache should not be used.
            force_download: (`optional`) boolean, default False:
                Force a (re-)download of the model weights and configuration files, overriding any cached versions.
            proxies: (`optional`) dict, default None:
                Proxy servers to use, keyed by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
                The proxies are applied to every request.
            output_loading_info: (`optional`) boolean:
                Set to ``True`` to additionally return a dictionary of missing keys, unexpected keys and error messages.
            kwargs: (`optional`) Remaining dictionary of keyword arguments:
                Used to update the configuration object (once loaded) and to initialise the model (e.g. ``output_attention=True``). The behaviour depends on whether a ``config`` is supplied:
                - With an explicit ``config``, ``**kwargs`` go straight to the underlying model's ``__init__`` method (all configuration updates are assumed to be done already).
                - Without one, ``kwargs`` are first given to the configuration class initialisation function (:func:`~transformers.PretrainedConfig.from_pretrained`). Keys matching a configuration attribute override that attribute; the remaining keys are passed on to the underlying model's ``__init__`` function.
        Examples::
            model = TFAutoModelForSequenceClassification.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
            model = TFAutoModelForSequenceClassification.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/bert_model/')`
            model = TFAutoModelForSequenceClassification.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
            assert model.config.output_attention == True
            # Loading from a PyTorch checkpoint file instead of a TF 2.0 model (slower)
            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
            model = TFAutoModelForSequenceClassification.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config)
        """
        # Ordered (substring, model class) pairs. `distilbert` and `roberta` come
        # before `bert` because `bert` is itself a substring of both.
        dispatch = (
            ('distilbert', TFDistilBertForSequenceClassification),
            ('roberta', TFRobertaForSequenceClassification),
            ('bert', TFBertForSequenceClassification),
            ('xlnet', TFXLNetForSequenceClassification),
            ('xlm', TFXLMForSequenceClassification),
        )
        for pattern, model_class in dispatch:
            if pattern in pretrained_model_name_or_path:
                return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        # No known architecture name appears in the identifier.
        raise ValueError("Unrecognized model identifier in {}. Should contains one of "
                         "'bert', 'xlnet', 'xlm', 'roberta'".format(pretrained_model_name_or_path))
 class TFAutoModelForQuestionAnswering(object):
    r"""
        :class:`~transformers.TFAutoModelForQuestionAnswering` is a generic model class
        that will be instantiated as one of the question answering model classes of the library
        when created with the `TFAutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)`
        class method.
        The `from_pretrained()` method takes care of returning the correct model class instance
        using pattern matching on the `pretrained_model_name_or_path` string.
        The model class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
            - contains `distilbert`: TFDistilBertForQuestionAnswering (DistilBERT model)
            - contains `bert`: TFBertForQuestionAnswering (Bert model)
            - contains `xlnet`: TFXLNetForQuestionAnswering (XLNet model)
            - contains `xlm`: TFXLMForQuestionAnswering (XLM model)
        This class cannot be instantiated using `__init__()` (throws an error).
    """
    def __init__(self):
        raise EnvironmentError("TFAutoModelWithLMHead is designed to be instantiated "
            "using the `TFAutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` method.")
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        r""" Instantiates one of the question answering model classes of the library
        from a pre-trained model configuration.
        The `from_pretrained()` method takes care of returning the correct model class instance
        using pattern matching on the `pretrained_model_name_or_path` string.
        The model class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
            - contains `distilbert`: TFDistilBertForQuestionAnswering (DistilBERT model)
            - contains `bert`: TFBertForQuestionAnswering (Bert model)
            - contains `xlnet`: TFXLNetForQuestionAnswering (XLNet model)
            - contains `xlm`: TFXLMForQuestionAnswering (XLM model)
        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
        To train the model, you should first set it back in training mode with `model.train()`
        Params:
            pretrained_model_name_or_path: either:
                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument.
            from_pt: (`Optional`) Boolean
                Set to True if the Checkpoint is a PyTorch checkpoint.
            model_args: (`optional`) Sequence of positional arguments:
                All remaning positional arguments will be passed to the underlying model's ``__init__`` method
            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
                Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
                - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
            state_dict: (`optional`) dict:
                an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
            cache_dir: (`optional`) string:
                Path to a directory in which a downloaded pre-trained model
                configuration should be cached if the standard cache should not be used.
            force_download: (`optional`) boolean, default False:
                Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
            proxies: (`optional`) dict, default None:
                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
                The proxies are used on each request.
            output_loading_info: (`optional`) boolean:
                Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages.
            kwargs: (`optional`) Remaining dictionary of keyword arguments:
                Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
        Examples::
            model = TFAutoModelForQuestionAnswering.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
            model = TFAutoModelForQuestionAnswering.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
            model = TFAutoModelForQuestionAnswering.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
            assert model.config.output_attention == True
            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
            model = TFAutoModelForQuestionAnswering.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config)
        """
        if 'distilbert' in pretrained_model_name_or_path:
            return TFDistilBertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'bert' in pretrained_model_name_or_path:
            return TFBertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'xlnet' in pretrained_model_name_or_path:
            return TFXLNetForQuestionAnsweringSimple.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        elif 'xlm' in pretrained_model_name_or_path:
            return TFXLMForQuestionAnsweringSimple.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        raise ValueError("Unrecognized model identifier in {}. Should contains one of "
                         "'bert', 'xlnet', 'xlm'".format(pretrained_model_name_or_path))
--- a/transformers/modeling_tf_bert.py
+++ b/transformers/modeling_tf_bert.py
--- a/transformers/modeling_tf_distilbert.py
+++ b/transformers/modeling_tf_distilbert.py
@@ -0,0 +1,745 @@
 # coding=utf-8
 # Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """ TF 2.0 DistilBERT model
 """
 from __future__ import absolute_import, division, print_function, unicode_literals
 import json
 import logging
 import math
 import copy
 import sys
 from io import open
 import itertools
 import numpy as np
 import tensorflow as tf
 from .configuration_distilbert import DistilBertConfig
 from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, shape_list, get_initializer
 from .file_utils import add_start_docstrings
 from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model
 logger = logging.getLogger(__name__)
 # Maps each pretrained DistilBERT shortcut name to the URL of its TF 2.0 weights file (`*-tf_model.h5`).
 # Exposed to the loading machinery through `TFDistilBertPreTrainedModel.pretrained_model_archive_map`.
 TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
    'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-tf_model.h5",
    'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-tf_model.h5"
 }
 ### UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE ###
 def gelu(x):
    """Exact (erf-based) Gaussian Error Linear Unit.

    Computes ``x * Phi(x)`` where ``Phi`` is the standard normal CDF, written as
    ``0.5 * (1 + erf(x / sqrt(2)))``.
    The tanh-based approximation used by OpenAI GPT gives slightly different
    results; see :func:`gelu_new` and https://arxiv.org/abs/1606.08415.

    Args:
        x: float Tensor to perform activation.
    Returns:
        `x` with the GELU activation applied.
    """
    sqrt_two = tf.math.sqrt(2.0)
    normal_cdf = 0.5 * (1.0 + tf.math.erf(x / sqrt_two))
    return x * normal_cdf
 def gelu_new(x):
    """Tanh approximation of the Gaussian Error Linear Unit.

    Evaluates ``x * 0.5 * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3)))``.
    Original paper: https://arxiv.org/abs/1606.08415

    Args:
        x: float Tensor to perform activation.
    Returns:
        `x` with the GELU activation applied.
    """
    scale = np.sqrt(2 / np.pi)
    tanh_arg = scale * (x + 0.044715 * tf.pow(x, 3))
    approx_cdf = 0.5 * (1.0 + tf.tanh(tanh_arg))
    return x * approx_cdf
 def load_distilbert_pt_weights_in_tf2(tf_model, pytorch_checkpoint_path):
    """Load the weights of a PyTorch DistilBERT checkpoint into a TF 2.0 model.

    Args:
        tf_model: the TF 2.0 DistilBERT model that receives the weights.
        pytorch_checkpoint_path: path of the PyTorch checkpoint to load.
    Returns:
        Whatever ``load_pytorch_checkpoint_in_tf2_model`` returns for this model.
    """
    # Small dummy batch (token ids + attention mask) used only to run the model once.
    inputs_list = tf.constant([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]])
    attns_list = tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
    tf_inputs = [inputs_list, attns_list]
    # Build the network: the forward pass is run for its side effect only, its output is discarded.
    tf_model(tf_inputs, training=False)
    return load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=tf_inputs)
 class TFEmbeddings(tf.keras.layers.Layer):
    """Input embeddings for DistilBERT: word + position embeddings, then LayerNorm and dropout.

    The word-embedding weight is also reused as the output projection when the
    layer is called with ``mode="linear"``.
    """
    def __init__(self, config, **kwargs):
        super(TFEmbeddings, self).__init__(**kwargs)
        self.vocab_size = config.vocab_size
        self.dim = config.dim
        self.initializer_range = config.initializer_range
        # NOTE(review): `self.word_embeddings` is reassigned in `build()` to a raw weight
        # created with `add_weight`; `_embedding` and `_linear` use it as a plain matrix.
        # Confirm whether the `TFSharedEmbeddings` layer created here is still needed.
        self.word_embeddings = TFSharedEmbeddings(config.vocab_size,
                                                  config.dim,
                                                  initializer_range=config.initializer_range,
                                                  name='word_embeddings')  # padding_idx=0)
        self.position_embeddings = tf.keras.layers.Embedding(config.max_position_embeddings,
                                                             config.dim,
                                                             embeddings_initializer=get_initializer(config.initializer_range),
                                                             name='position_embeddings')
        # Sinusoidal position embeddings are not implemented in this layer.
        if config.sinusoidal_pos_embds:
            raise NotImplementedError
        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="LayerNorm")
        self.dropout = tf.keras.layers.Dropout(config.dropout)
    def build(self, input_shape):
        """Create the word-embedding weight of shape ``[vocab_size, dim]`` under the "word_embeddings" name scope."""
        with tf.name_scope("word_embeddings"):
            # Create and initialize weights. The random normal initializer was chosen
            # arbitrarily, and works well.
            self.word_embeddings = self.add_weight(
                "weight",
                shape=[self.vocab_size, self.dim],
                initializer=get_initializer(self.initializer_range))
        super(TFEmbeddings, self).build(input_shape)
    def call(self, inputs, mode="embedding", training=False):
        """Embed token ids, or project hidden states back onto the vocabulary.

        Args:
            inputs: depends on ``mode``:
                - "embedding": either a tensor of ``input_ids`` with shape [batch_size, length],
                  or a tuple/list ``(input_ids, position_ids)``.
                - "linear": a float tensor with shape [batch_size, length, hidden_size].
            mode: string, a valid value is one of "embedding" and "linear".
            training: forwarded to the dropout layer in "embedding" mode; not used in "linear" mode.
        Returns:
            outputs: (1) If mode == "embedding", output embedding tensor with
                shape [batch_size, length, embedding_size]; (2) mode == "linear", output
                linear tensor with shape [batch_size, length, vocab_size].
        Raises:
            ValueError: if mode is not valid.
        Shared weights logic adapted from
            https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
        """
        if mode == "embedding":
            return self._embedding(inputs, training=training)
        elif mode == "linear":
            return self._linear(inputs)
        else:
            raise ValueError("mode {} is not valid.".format(mode))
    def _embedding(self, inputs, training=False):
        """Sum word and position embeddings, then apply LayerNorm and dropout.

        Parameters
        ----------
        inputs: tf.Tensor(bs, max_seq_length) or a tuple/list (input_ids, position_ids)
            The token ids to embed, optionally with explicit position ids.
            When no position ids are given, positions ``0 .. seq_length - 1`` are used.
        training: bool
            Passed to the dropout layer.

        Outputs
        -------
        embeddings: tf.Tensor(bs, max_seq_length, dim)
            The embedded tokens (plus position embeddings, no token_type embeddings)
        """
        if not isinstance(inputs, (tuple, list)):
            input_ids = inputs
            position_ids = None
        else:
            input_ids, position_ids = inputs
        seq_length = tf.shape(input_ids)[1]
        if position_ids is None:
            # Default positions, with a leading axis of size 1 added by `tf.newaxis`.
            position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :]
        # Row lookup in the weight created in `build()`.
        word_embeddings = tf.gather(self.word_embeddings, input_ids)
        position_embeddings = self.position_embeddings(position_ids)  # (bs, max_seq_length, dim)
        embeddings = word_embeddings + position_embeddings            # (bs, max_seq_length, dim)
        embeddings = self.LayerNorm(embeddings)                       # (bs, max_seq_length, dim)
        embeddings = self.dropout(embeddings, training=training)      # (bs, max_seq_length, dim)
        return embeddings
    def _linear(self, inputs):
        """Computes logits by multiplying inputs with the transposed word-embedding weight.
            Args:
                inputs: A float32 tensor with shape [batch_size, length, hidden_size]
            Returns:
                float32 tensor with shape [batch_size, length, vocab_size].
        """
        batch_size = tf.shape(inputs)[0]
        length = tf.shape(inputs)[1]
        # Flatten to 2-D for the matmul, then restore the batch and length axes.
        x = tf.reshape(inputs, [-1, self.dim])
        logits = tf.matmul(x, self.word_embeddings, transpose_b=True)
        return tf.reshape(logits, [batch_size, length, self.vocab_size])
 class TFMultiHeadSelfAttention(tf.keras.layers.Layer):
    """Multi-head scaled dot-product attention used inside each DistilBERT transformer block.

    Query, key and value are projected by separate dense layers, split into
    ``config.n_heads`` heads, combined with a masked softmax and projected
    back to ``config.dim`` by ``out_lin``.
    """
    def __init__(self, config, **kwargs):
        super(TFMultiHeadSelfAttention, self).__init__(**kwargs)
        self.n_heads = config.n_heads
        self.dim = config.dim
        self.dropout = tf.keras.layers.Dropout(config.attention_dropout)
        self.output_attentions = config.output_attentions
        # The model dimension is split evenly across the heads.
        assert self.dim % self.n_heads == 0
        self.q_lin = tf.keras.layers.Dense(config.dim,
                                           kernel_initializer=get_initializer(config.initializer_range),
                                           name="q_lin")
        self.k_lin = tf.keras.layers.Dense(config.dim,
                                           kernel_initializer=get_initializer(config.initializer_range),
                                           name="k_lin")
        self.v_lin = tf.keras.layers.Dense(config.dim,
                                           kernel_initializer=get_initializer(config.initializer_range),
                                           name="v_lin")
        self.out_lin = tf.keras.layers.Dense(config.dim,
                                             kernel_initializer=get_initializer(config.initializer_range),
                                             name="out_lin")
        self.pruned_heads = set()
    def prune_heads(self, heads):
        """Head pruning is not implemented for this layer."""
        raise NotImplementedError
    def call(self, inputs, training=False):
        """Run attention over ``inputs = (query, key, value, mask, head_mask)``.

        Parameters
        ----------
        query: tf.Tensor(bs, seq_length, dim)
        key: tf.Tensor(bs, seq_length, dim)
        value: tf.Tensor(bs, seq_length, dim)
        mask: float tf.Tensor(bs, seq_length)
            1.0 for positions that may be attended to, 0.0 for masked positions.
        head_mask: tensor multiplied into the attention weights, or None
        training: bool
            Passed to the attention dropout layer.

        Outputs
        -------
        context: tf.Tensor(bs, seq_length, dim)
            Contextualized layer.
        weights: tf.Tensor(bs, n_heads, seq_length, seq_length)
            Attention weights. Optional: only if `output_attentions=True`
        """
        query, key, value, mask, head_mask = inputs
        # Unpacking into three names also enforces that `query` is rank 3.
        bs, _, _ = shape_list(query)
        k_length = shape_list(key)[1]
        dim_per_head = self.dim // self.n_heads
        # Check the mask rank through `shape_list`, like the other shape queries in this method:
        # `len()` of the tensor returned by `tf.shape()` is not usable on symbolic (graph-mode) tensors.
        assert 2 <= len(shape_list(mask)) <= 3
        mask_reshape = [bs, 1, 1, k_length]
        def shape(x):
            """ separate heads """
            return tf.transpose(tf.reshape(x, (bs, -1, self.n_heads, dim_per_head)), perm=(0, 2, 1, 3))
        def unshape(x):
            """ group heads """
            return tf.reshape(tf.transpose(x, perm=(0, 2, 1, 3)), (bs, -1, self.n_heads * dim_per_head))
        q = shape(self.q_lin(query))           # (bs, n_heads, q_length, dim_per_head)
        k = shape(self.k_lin(key))             # (bs, n_heads, k_length, dim_per_head)
        v = shape(self.v_lin(value))           # (bs, n_heads, k_length, dim_per_head)
        q = q / math.sqrt(dim_per_head)                     # (bs, n_heads, q_length, dim_per_head)
        scores = tf.matmul(q, k, transpose_b=True)          # (bs, n_heads, q_length, k_length)
        mask = tf.reshape(mask, mask_reshape)               # (bs, 1, 1, k_length), broadcast over heads and queries
        # Additive masking: masked positions (mask == 0) get a very large negative score
        # so that their softmax weight is effectively zero.
        scores = scores - 1e30 * (1.0 - mask)
        weights = tf.nn.softmax(scores, axis=-1)            # (bs, n_heads, q_length, k_length)
        weights = self.dropout(weights, training=training)  # (bs, n_heads, q_length, k_length)
        # Mask heads if we want to
        if head_mask is not None:
            weights = weights * head_mask
        context = tf.matmul(weights, v)        # (bs, n_heads, q_length, dim_per_head)
        context = unshape(context)             # (bs, q_length, dim)
        context = self.out_lin(context)        # (bs, q_length, dim)
        if self.output_attentions:
            return (context, weights)
        else:
            return (context,)
 class TFFFN(tf.keras.layers.Layer):
    """Feed-forward sub-layer of a transformer block.

    Applies ``lin1`` (to ``config.hidden_dim``), the configured activation
    ('relu' or 'gelu'), ``lin2`` (back to ``config.dim``) and finally dropout.
    """
    def __init__(self, config, **kwargs):
        super(TFFFN, self).__init__(**kwargs)
        kernel_init_range = config.initializer_range
        self.dropout = tf.keras.layers.Dropout(config.dropout)
        self.lin1 = tf.keras.layers.Dense(
            config.hidden_dim,
            kernel_initializer=get_initializer(kernel_init_range),
            name="lin1")
        self.lin2 = tf.keras.layers.Dense(
            config.dim,
            kernel_initializer=get_initializer(kernel_init_range),
            name="lin2")
        assert config.activation in ['relu', 'gelu'], "activation ({}) must be in ['relu', 'gelu']".format(config.activation)
        # After the assertion above, anything that is not 'gelu' is 'relu'.
        if config.activation == 'gelu':
            self.activation = tf.keras.layers.Activation(gelu)
        else:
            self.activation = tf.keras.activations.relu
    def call(self, input, training=False):
        """Return ``dropout(lin2(activation(lin1(input))))``; ``training`` is passed to the dropout layer."""
        expanded = self.activation(self.lin1(input))
        projected = self.lin2(expanded)
        return self.dropout(projected, training=training)
 class TFTransformerBlock(tf.keras.layers.Layer):
    """One DistilBERT encoder block.

    Self-attention and a feed-forward network, each followed by a residual
    connection and layer normalization.
    """
    def __init__(self, config, **kwargs):
        super(TFTransformerBlock, self).__init__(**kwargs)
        self.n_heads = config.n_heads
        self.dim = config.dim
        self.hidden_dim = config.hidden_dim
        self.dropout = tf.keras.layers.Dropout(config.dropout)
        self.activation = config.activation
        self.output_attentions = config.output_attentions
        assert config.dim % config.n_heads == 0
        # Attention sub-layer and its normalization.
        self.attention = TFMultiHeadSelfAttention(config, name="attention")
        self.sa_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="sa_layer_norm")
        # Feed-forward sub-layer and its normalization.
        self.ffn = TFFFN(config, name="ffn")
        self.output_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="output_layer_norm")
    def call(self, inputs, training=False):
        """Apply the block to ``inputs = (x, attn_mask, head_mask)``.

        Parameters
        ----------
        x: tf.Tensor(bs, seq_length, dim)
        attn_mask: tf.Tensor(bs, seq_length)
        head_mask: forwarded unchanged to the attention layer

        Outputs
        -------
        sa_weights: tf.Tensor(bs, n_heads, seq_length, seq_length)
            The attention weights. Only present (as first element) if `output_attentions=True`
        ffn_output: tf.Tensor(bs, seq_length, dim)
            The output of the transformer block contextualization (always the last element).
        """
        x, attn_mask, head_mask = inputs
        # Self-attention: query, key and value are all the block input.
        attn_result = self.attention([x, x, x, attn_mask, head_mask], training=training)
        if self.output_attentions:
            context, attn_weights = attn_result    # (bs, seq_length, dim), (bs, n_heads, seq_length, seq_length)
        else:
            # The attention layer always returns a tuple; take its only element.
            context = attn_result[0]
        attended = self.sa_layer_norm(context + x)                  # (bs, seq_length, dim)
        # Feed-forward network with its own residual connection.
        transformed = self.ffn(attended, training=training)         # (bs, seq_length, dim)
        block_output = self.output_layer_norm(transformed + attended)  # (bs, seq_length, dim)
        if self.output_attentions:
            return (attn_weights,) + (block_output,)
        return (block_output,)
 class TFTransformer(tf.keras.layers.Layer):
    """Stack of ``config.n_layers`` transformer blocks applied one after the other."""
    def __init__(self, config, **kwargs):
        super(TFTransformer, self).__init__(**kwargs)
        self.n_layers = config.n_layers
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states
        blocks = [TFTransformerBlock(config, name='layer_._{}'.format(idx))
                  for idx in range(config.n_layers)]
        self.layer = blocks
    def call(self, inputs, training=False):
        """Run every block over ``inputs = (x, attn_mask, head_mask)``.

        Parameters
        ----------
        x: tf.Tensor(bs, seq_length, dim)
            Input sequence embedded.
        attn_mask: tf.Tensor(bs, seq_length)
            Attention mask on the sequence.
        head_mask: indexable with one entry per block
            Entry ``i`` is handed to block ``i``.

        Outputs
        -------
        hidden_state: tf.Tensor(bs, seq_length, dim)
            Sequence of hidden states in the last (top) layer
        all_hidden_states: Tuple[tf.Tensor(bs, seq_length, dim)]
            The input of the first block followed by the output of every block.
            Optional: only if output_hidden_states=True
        all_attentions: Tuple[tf.Tensor(bs, n_heads, seq_length, seq_length)]
            Tuple of length n_layers with the attention weights from each layer
            Optional: only if output_attentions=True
        """
        x, attn_mask, head_mask = inputs
        collected_states = ()
        collected_attentions = ()
        hidden_state = x
        for idx, block in enumerate(self.layer):
            # Record the input of each block.
            if self.output_hidden_states:
                collected_states += (hidden_state,)
            block_outputs = block([hidden_state, attn_mask, head_mask[idx]], training=training)
            # The block output is always the last element of the returned tuple.
            hidden_state = block_outputs[-1]
            if self.output_attentions:
                assert len(block_outputs) == 2
                collected_attentions += (block_outputs[0],)
            else:
                assert len(block_outputs) == 1
        # Record the output of the last block as well.
        if self.output_hidden_states:
            collected_states += (hidden_state,)
        outputs = (hidden_state,)
        if self.output_hidden_states:
            outputs += (collected_states,)
        if self.output_attentions:
            outputs += (collected_attentions,)
        return outputs  # last-layer hidden state, (all hidden states), (all attentions)
 class TFDistilBertMainLayer(tf.keras.layers.Layer):
    """Core DistilBERT network: the embeddings layer followed by the transformer encoder."""
    def __init__(self, config, **kwargs):
        super(TFDistilBertMainLayer, self).__init__(**kwargs)
        self.num_hidden_layers = config.num_hidden_layers
        self.embeddings = TFEmbeddings(config, name="embeddings")   # Embeddings
        self.transformer = TFTransformer(config, name="transformer") # Encoder
    def _resize_token_embeddings(self, new_num_tokens):
        """Resizing the token embeddings is not implemented for this layer."""
        raise NotImplementedError
    def _prune_heads(self, heads_to_prune):
        """Head pruning is not implemented for this layer."""
        raise NotImplementedError
    def call(self, inputs, attention_mask=None, head_mask=None, training=False):
        """Embed the input ids and run them through the transformer.

        Args:
            inputs: one of
                - a tensor of input ids,
                - a tuple/list ``(input_ids[, attention_mask[, head_mask]])``,
                - a dict with the keys ``input_ids`` and optionally
                  ``attention_mask`` / ``head_mask``.
                Values found in ``inputs`` take precedence over the keyword arguments.
            attention_mask: optional mask of shape (bs, seq_length); defaults to all ones.
            head_mask: must be None; any other value raises ``NotImplementedError``.
            training: forwarded to the embeddings and the transformer
                (controls their dropout layers).
        Returns:
            The transformer output tuple:
            last-layer hidden-state, (all hidden_states), (all attentions).
        Raises:
            NotImplementedError: if a head mask is provided.
        """
        if isinstance(inputs, (tuple, list)):
            input_ids = inputs[0]
            attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
            head_mask = inputs[2] if len(inputs) > 2 else head_mask
            assert len(inputs) <= 3, "Too many inputs."
        elif isinstance(inputs, dict):
            input_ids = inputs.get('input_ids')
            attention_mask = inputs.get('attention_mask', attention_mask)
            head_mask = inputs.get('head_mask', head_mask)
            assert len(inputs) <= 3, "Too many inputs."
        else:
            input_ids = inputs
        if attention_mask is None:
            attention_mask = tf.ones(shape_list(input_ids)) # (bs, seq_length)
        # The attention layer does arithmetic on the mask, so it has to be a float tensor.
        attention_mask = tf.cast(attention_mask, dtype=tf.float32)
        # Head masking is not supported yet: only the "no mask" case is handled,
        # with one `None` entry per transformer block.
        if head_mask is not None:
            raise NotImplementedError
        else:
            head_mask = [None] * self.num_hidden_layers
        # Pass `training` explicitly so the embeddings dropout follows the same mode as the transformer.
        embedding_output = self.embeddings(input_ids, training=training)   # (bs, seq_length, dim)
        tfmr_output = self.transformer([embedding_output, attention_mask, head_mask], training=training)
        return tfmr_output # last-layer hidden-state, (all hidden_states), (all attentions)
 ### INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL ###
 class TFDistilBertPreTrainedModel(TFPreTrainedModel):
    """ An abstract class to handle weights initialization and
        a simple interface for downloading and loading pretrained models.

        It only sets the DistilBERT-specific class attributes; the behaviour
        itself is inherited from ``TFPreTrainedModel``.
    """
    # Configuration class associated with DistilBERT models.
    config_class = DistilBertConfig
    # Shortcut name -> URL of the pretrained TF 2.0 weights.
    pretrained_model_archive_map = TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP
    # Function that loads a PyTorch checkpoint into a TF 2.0 DistilBERT model.
    load_pt_weights = load_distilbert_pt_weights_in_tf2
    # Attribute name under which the model classes in this file store the main layer (see `TFDistilBertModel`).
    base_model_prefix = "distilbert"
 # Shared introduction passed to `add_start_docstrings` for the TF DistilBERT model classes:
 # model overview, accepted input formats and constructor parameters.
 DISTILBERT_START_DOCSTRING = r"""
    DistilBERT is a small, fast, cheap and light Transformer model
    trained by distilling Bert base. It has 40% fewer parameters than
    `bert-base-uncased`, runs 60% faster while preserving over 95% of
    Bert's performances as measured on the GLUE language understanding benchmark.
    Here are the differences between the interface of Bert and DistilBert:
    - DistilBert doesn't have `token_type_ids`, you don't need to indicate which token belongs to which segment. Just separate your segments with the separation token `tokenizer.sep_token` (or `[SEP]`)
    - DistilBert doesn't have options to select the input positions (`position_ids` input). This could be added if necessary though, just let us know if you need this option.
    For more information on DistilBERT, please refer to our
    `detailed blog post`_
    This model is a `tf.keras.Model`_ sub-class. Use it as a regular TF 2.0 Keras Model and
    refer to the TF 2.0 documentation for all matters related to general usage and behavior.
    .. _`detailed blog post`:
        https://medium.com/huggingface/distilbert-8cf3380435b5
    .. _`tf.keras.Model`:
        https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model
    Note on the model inputs:
        TF 2.0 models accept two formats as inputs:
            - having all inputs as keyword arguments (like PyTorch models), or
            - having all inputs as a list, tuple or dict in the first positional arguments.
        This second option is useful when using the `tf.keras.Model.fit()` method which currently requires having all the tensors in the first argument of the model call function: `model(inputs)`.
        If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument:
        - a single Tensor with input_ids only and nothing else: `model(input_ids)`
        - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
            `model([input_ids])` or `model([input_ids, attention_mask])`
        - a dictionary with one or several input Tensors associated to the input names given in the docstring:
            `model({'input_ids': input_ids, 'attention_mask': attention_mask})`
    Parameters:
        config (:class:`~transformers.DistilBertConfig`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the configuration.
            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
 # Shared "Inputs" section passed to `add_start_docstrings` for the TF DistilBERT model classes.
 # NOTE(review): `head_mask` is documented here as optional, but `TFDistilBertMainLayer.call`
 # raises NotImplementedError for any non-None value — confirm whether the text should say so.
 DISTILBERT_INPUTS_DOCSTRING = r"""
    Inputs:
        **input_ids** ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
            Indices of input sequence tokens in the vocabulary.
            The input sequences should start with `[CLS]` and end with `[SEP]` tokens.
            For now, ONLY BertTokenizer(`bert-base-uncased`) is supported and you should use this tokenizer when using DistilBERT.
        **attention_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
        **head_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
 """
@add_start_docstrings("The bare DistilBERT encoder/transformer outputing raw hidden-states without any specific head on top.",
                      DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
 class TFDistilBertModel(TFDistilBertPreTrainedModel):
    r"""
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)``
            Sequence of hidden-states at the output of the last layer of the model.
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
    Examples::
        import tensorflow as tf
        from transformers import DistilBertTokenizer, TFDistilBertModel
        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
        outputs = model(input_ids)
        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFDistilBertModel, self).__init__(config, *inputs, **kwargs)
        self.distilbert = TFDistilBertMainLayer(config, name="distilbert")   # Embeddings
    def call(self, inputs, **kwargs):
        outputs = self.distilbert(inputs, **kwargs)
        return outputs
 class TFDistilBertLMHead(tf.keras.layers.Layer):
    """Language-modeling output layer tied to the input embeddings.

    Projects hidden states onto the vocabulary by calling the provided
    embeddings layer in "linear" mode, then adds a per-token bias.
    """
    def __init__(self, config, input_embeddings, **kwargs):
        super(TFDistilBertLMHead, self).__init__(**kwargs)
        self.vocab_size = config.vocab_size
        # Output weights are shared with the input embeddings; only the bias
        # (created in `build`) belongs to this layer.
        self.input_embeddings = input_embeddings
    def build(self, input_shape):
        """Create the zero-initialized, trainable output bias of shape ``(vocab_size,)``."""
        self.bias = self.add_weight(name='bias',
                                    shape=(self.vocab_size,),
                                    initializer='zeros',
                                    trainable=True)
        super(TFDistilBertLMHead, self).build(input_shape)
    def call(self, hidden_states):
        """Return the vocabulary scores for ``hidden_states``."""
        projected = self.input_embeddings(hidden_states, mode="linear")
        return projected + self.bias
@add_start_docstrings("""DistilBert Model with a `masked language modeling` head on top. """,
                      DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
 class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
    r"""
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **prediction_scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
    Examples::
        import tensorflow as tf
        from transformers import DistilBertTokenizer, TFDistilBertForMaskedLM
        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        model = TFDistilBertForMaskedLM.from_pretrained('distilbert-base-uncased')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
        outputs = model(input_ids, masked_lm_labels=input_ids)
        prediction_scores = outputs[0]
    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFDistilBertForMaskedLM, self).__init__(config, *inputs, **kwargs)
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states
        self.vocab_size = config.vocab_size
        self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
        self.vocab_transform = tf.keras.layers.Dense(config.dim,
                                                     kernel_initializer=get_initializer(config.initializer_range),
                                                     name="vocab_transform")
        self.act = tf.keras.layers.Activation(gelu)
        self.vocab_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="vocab_layer_norm")
        self.vocab_projector = TFDistilBertLMHead(config, self.distilbert.embeddings, name="vocab_projector")
    def call(self, inputs, **kwargs):
        distilbert_output = self.distilbert(inputs, **kwargs)
        hidden_states = distilbert_output[0]                               # (bs, seq_length, dim)
        prediction_logits = self.vocab_transform(hidden_states)       # (bs, seq_length, dim)
        prediction_logits = self.act(prediction_logits)               # (bs, seq_length, dim)
        prediction_logits = self.vocab_layer_norm(prediction_logits)  # (bs, seq_length, dim)
        prediction_logits = self.vocab_projector(prediction_logits)
        outputs = (prediction_logits,) + distilbert_output[1:]
        return outputs  # logits, (hidden_states), (attentions)
@add_start_docstrings("""DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of
                         the pooled output) e.g. for GLUE tasks. """,
                      DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
 class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel):
    r"""
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **logits**: ``tf.Tensor`` of shape ``(batch_size, config.num_labels)``
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
    Examples::
        import tensorflow as tf
        from transformers import BertTokenizer, TFDistilBertForSequenceClassification
        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
        outputs = model(input_ids)
        logits = outputs[0]
    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFDistilBertForSequenceClassification, self).__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels
        self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
        self.pre_classifier = tf.keras.layers.Dense(config.dim,
                                                    kernel_initializer=get_initializer(config.initializer_range),
                                                    activation='relu',
                                                    name="pre_classifier")
        self.classifier = tf.keras.layers.Dense(config.num_labels,
                                                kernel_initializer=get_initializer(config.initializer_range),
                                                name="classifier")
        self.dropout = tf.keras.layers.Dropout(config.seq_classif_dropout)
    def call(self, inputs, **kwargs):
        distilbert_output = self.distilbert(inputs, **kwargs)
        hidden_state = distilbert_output[0]                    # (bs, seq_len, dim)
        pooled_output = hidden_state[:, 0]                    # (bs, dim)
        pooled_output = self.pre_classifier(pooled_output)   # (bs, dim)
        pooled_output = self.dropout(pooled_output, training=kwargs.get('training', False))         # (bs, dim)
        logits = self.classifier(pooled_output)              # (bs, dim)
        outputs = (logits,) + distilbert_output[1:]
        return outputs  # logits, (hidden_states), (attentions)
@add_start_docstrings("""DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
                         the hidden-states output to compute `span start logits` and `span end logits`). """,
                      DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
 class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel):
    r"""
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **start_scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length,)``
            Span-start scores (before SoftMax).
        **end_scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length,)``
            Span-end scores (before SoftMax).
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
    Examples::
        import tensorflow as tf
        from transformers import BertTokenizer, TFDistilBertForQuestionAnswering
        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        model = TFDistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
        start_positions = tf.constant([1])
        end_positions = tf.constant([3])
        outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
        start_scores, end_scores = outputs[:2]
    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFDistilBertForQuestionAnswering, self).__init__(config, *inputs, **kwargs)
        self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
        self.qa_outputs = tf.keras.layers.Dense(config.num_labels,
                                                kernel_initializer=get_initializer(config.initializer_range),
                                                name='qa_outputs')
        assert config.num_labels == 2
        self.dropout = tf.keras.layers.Dropout(config.qa_dropout)
    def call(self, inputs, **kwargs):
        distilbert_output = self.distilbert(inputs, **kwargs)
        hidden_states = distilbert_output[0]                                 # (bs, max_query_len, dim)
        hidden_states = self.dropout(hidden_states, training=kwargs.get('training', False))                       # (bs, max_query_len, dim)
        logits = self.qa_outputs(hidden_states)                           # (bs, max_query_len, 2)
        start_logits, end_logits = tf.split(logits, 2, axis=-1)
        start_logits = tf.squeeze(start_logits, axis=-1)
        end_logits = tf.squeeze(end_logits, axis=-1)
        outputs = (start_logits, end_logits,) + distilbert_output[1:]
        return outputs  # start_logits, end_logits, (hidden_states), (attentions)
--- a/transformers/modeling_tf_gpt2.py
+++ b/transformers/modeling_tf_gpt2.py
@@ -0,0 +1,613 @@
 # coding=utf-8
 # Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
 # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """ TF 2.0 OpenAI GPT-2 model. """
 from __future__ import absolute_import, division, print_function, unicode_literals
 import collections
 import json
 import logging
 import math
 import os
 import sys
 from io import open
 import numpy as np
 import tensorflow as tf
 from .modeling_tf_utils import (TFPreTrainedModel, TFConv1D, TFSharedEmbeddings,
                                TFSequenceSummary, shape_list, get_initializer)
 from .configuration_gpt2 import GPT2Config
 from .file_utils import add_start_docstrings
 from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model
 logger = logging.getLogger(__name__)
 # Shortcut model name -> URL of the matching pretrained TF 2.0 weights file.
 TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {
    "gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-tf_model.h5",
    "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-tf_model.h5",
    "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-tf_model.h5",
 }
 def load_gpt2_pt_weights_in_tf2(tf_model, pytorch_checkpoint_path):
    """Load the weights of a PyTorch GPT-2 checkpoint into ``tf_model``.
    Args:
        tf_model: the TF 2.0 model that receives the weights.
        pytorch_checkpoint_path: location of the PyTorch checkpoint.
    Returns:
        Whatever ``load_pytorch_checkpoint_in_tf2_model`` returns.
    """
    # Small dummy batch of token ids: 3 sequences of 5 tokens.
    tf_inputs = tf.constant([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]])
    # Build the network with one forward pass; only the side effect matters,
    # so the outputs are deliberately discarded.
    tf_model(tf_inputs, training=False)
    return load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=tf_inputs)
 def gelu(x):
    """Gaussian Error Linear Unit, computed with the tanh approximation.
    A smoother alternative to the RELU; see https://arxiv.org/abs/1606.08415
    Args:
        x: float Tensor to activate.
    Returns:
        `x` multiplied element-wise by the approximated Gaussian CDF of `x`.
    """
    cubic_term = 0.044715 * tf.pow(x, 3)
    tanh_arg = np.sqrt(2 / np.pi) * (x + cubic_term)
    cdf = 0.5 * (1.0 + tf.tanh(tanh_arg))
    return x * cdf
 class TFAttention(tf.keras.layers.Layer):
    """Causal multi-head self-attention with an optional cache of past keys and values."""
    def __init__(self, nx, n_ctx, config, scale=False, **kwargs):
        super(TFAttention, self).__init__(**kwargs)
        self.output_attentions = config.output_attentions
        # [switch nx => n_state from Block to Attention to keep identical to TF implem]
        n_state = nx  # in Attention: n_state=768 (nx=n_embd)
        assert n_state % config.n_head == 0
        self.n_ctx = n_ctx
        self.n_head = config.n_head
        self.split_size = n_state
        self.scale = scale
        init_range = config.initializer_range
        # c_attn produces query, key and value in one projection (3 * n_state features).
        self.c_attn = TFConv1D(n_state * 3, nx, initializer_range=init_range, name='c_attn')
        self.c_proj = TFConv1D(n_state, nx, initializer_range=init_range, name='c_proj')
        self.attn_dropout = tf.keras.layers.Dropout(config.attn_pdrop)
        self.resid_dropout = tf.keras.layers.Dropout(config.resid_pdrop)
        self.pruned_heads = set()
    def prune_heads(self, heads):
        """Placeholder: head pruning is a no-op in this layer."""
        pass
    @staticmethod
    def causal_attention_mask(nd, ns, dtype):
        """1's in the lower triangle, counting from the lower right corner.
        Same as tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd), but doesn't produce garbage on TPUs.
        """
        rows = tf.range(nd)[:, None]
        cols = tf.range(ns)
        return tf.cast(rows >= cols - ns + nd, dtype)
    def _attn(self, inputs, training=False):
        """Scaled (optional), causally masked dot-product attention.
        ``inputs`` is ``[q, k, v, attention_mask, head_mask]``; q, k, v have shape [batch, heads, sequence, features].
        Returns ``[context]`` or ``[context, attention_probs]`` when ``self.output_attentions`` is set.
        """
        q, k, v, attention_mask, head_mask = inputs
        scores = tf.matmul(q, k, transpose_b=True)
        if self.scale:
            # scale attention_scores by sqrt(features per head)
            dk = tf.cast(tf.shape(k)[-1], tf.float32)
            scores = scores / tf.math.sqrt(dk)
        # scores has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst.
        _, _, nd, ns = shape_list(scores)
        causal = tf.reshape(self.causal_attention_mask(nd, ns, dtype=scores.dtype), [1, 1, nd, ns])
        # Keep allowed positions as they are and push masked ones to -1e4 before the softmax.
        scores = scores * causal - 1e4 * (1 - causal)
        if attention_mask is not None:
            # Apply the (additive) attention mask
            scores = scores + attention_mask
        probs = tf.nn.softmax(scores, axis=-1)
        probs = self.attn_dropout(probs, training=training)
        # Mask heads if we want to
        if head_mask is not None:
            probs = probs * head_mask
        context = tf.matmul(probs, v)
        if self.output_attentions:
            return [context, probs]
        return [context]
    def merge_heads(self, x):
        """Inverse of `split_heads`: swap axes 1 and 2, then fold the last two axes into one."""
        x = tf.transpose(x, [0, 2, 1, 3])
        dims = shape_list(x)
        return tf.reshape(x, dims[:-2] + [dims[-2] * dims[-1]])
    def split_heads(self, x):
        """Split the last axis into ``(n_head, features // n_head)`` and move the head axis to position 1."""
        dims = shape_list(x)
        per_head = tf.reshape(x, dims[:-1] + [self.n_head, dims[-1] // self.n_head])
        return tf.transpose(per_head, (0, 2, 1, 3))  # (batch, head, seq_length, head_features)
    def call(self, inputs, training=False):
        """Attend over ``x`` (and cached keys/values when ``layer_past`` is given).
        ``inputs`` is ``[x, layer_past, attention_mask, head_mask]``.
        Returns ``[a, present, (attentions)]`` where ``present`` stacks key and value on axis 1.
        """
        x, layer_past, attention_mask, head_mask = inputs
        query, key, value = [self.split_heads(part) for part in tf.split(self.c_attn(x), 3, axis=2)]
        if layer_past is not None:
            # Prepend the cached keys/values along the sequence axis.
            past_key, past_value = tf.unstack(layer_past, axis=1)
            key = tf.concat([past_key, key], axis=-2)
            value = tf.concat([past_value, value], axis=-2)
        present = tf.stack([key, value], axis=1)
        attn_outputs = self._attn([query, key, value, attention_mask, head_mask], training=training)
        merged = self.merge_heads(attn_outputs[0])
        projected = self.resid_dropout(self.c_proj(merged), training=training)
        return [projected, present] + attn_outputs[1:]  # a, present, (attentions)
 class TFMLP(tf.keras.layers.Layer):
    """Feed-forward sub-layer: ``c_fc`` -> gelu -> ``c_proj`` -> dropout."""
    def __init__(self, n_state, config, **kwargs):
        super(TFMLP, self).__init__(**kwargs)
        embed_dim = config.n_embd
        init_range = config.initializer_range
        self.c_fc = TFConv1D(n_state, embed_dim, initializer_range=init_range, name='c_fc')
        self.c_proj = TFConv1D(embed_dim, n_state, initializer_range=init_range, name='c_proj')
        self.act = gelu
        self.dropout = tf.keras.layers.Dropout(config.resid_pdrop)
    def call(self, x, training=False):
        """Apply the two projections with the activation in between; dropout follows ``training``."""
        activated = self.act(self.c_fc(x))
        return self.dropout(self.c_proj(activated), training=training)
 class TFBlock(tf.keras.layers.Layer):
    """One transformer block: layer norm + attention and layer norm + MLP, each wrapped in a residual connection."""
    def __init__(self, n_ctx, config, scale=False, **kwargs):
        super(TFBlock, self).__init__(**kwargs)
        embed_dim = config.n_embd
        norm_eps = config.layer_norm_epsilon
        self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=norm_eps, name='ln_1')
        self.attn = TFAttention(embed_dim, n_ctx, config, scale, name='attn')
        self.ln_2 = tf.keras.layers.LayerNormalization(epsilon=norm_eps, name='ln_2')
        # The MLP is built with 4 * n_embd as its `n_state`.
        self.mlp = TFMLP(4 * embed_dim, config, name='mlp')
    def call(self, inputs, training=False):
        """``inputs`` is ``[x, layer_past, attention_mask, head_mask]``; returns ``[x, present, (attentions)]``."""
        x, layer_past, attention_mask, head_mask = inputs
        # attn_results: a, present, (attentions)
        attn_results = self.attn([self.ln_1(x), layer_past, attention_mask, head_mask], training=training)
        x = x + attn_results[0]
        x = x + self.mlp(self.ln_2(x), training=training)
        return [x] + attn_results[1:]  # x, present, (attentions)
 class TFGPT2MainLayer(tf.keras.layers.Layer):
    """GPT-2 transformer as a Keras layer: token and position embeddings, ``config.n_layer`` blocks and a final layer norm."""
    def __init__(self, config, *inputs, **kwargs):
        # `config` is deliberately NOT forwarded to the parent constructor: the first
        # positional parameter of tf.keras.layers.Layer.__init__ is `trainable`,
        # not a model configuration.
        super(TFGPT2MainLayer, self).__init__(*inputs, **kwargs)
        self.output_hidden_states = config.output_hidden_states
        self.output_attentions = config.output_attentions
        self.num_hidden_layers = config.n_layer
        self.vocab_size = config.vocab_size
        self.n_embd = config.n_embd
        self.wte = TFSharedEmbeddings(config.vocab_size,
                                      config.hidden_size,
                                      initializer_range=config.initializer_range,
                                      name='wte')
        self.wpe = tf.keras.layers.Embedding(config.n_positions,
                                             config.n_embd,
                                             embeddings_initializer=get_initializer(config.initializer_range),
                                             name='wpe')
        self.drop = tf.keras.layers.Dropout(config.embd_pdrop)
        self.h = [TFBlock(config.n_ctx,
                          config,
                          scale=True,
                          name='h_._{}'.format(i)) for i in range(config.n_layer)]
        self.ln_f = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_f')
    def _resize_token_embeddings(self, new_num_tokens):
        """Not implemented for this layer."""
        raise NotImplementedError
    def _prune_heads(self, heads_to_prune):
        """ Prunes heads of the model.
            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
            Not implemented for this layer.
        """
        raise NotImplementedError
    def call(self, inputs, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, training=False):
        """Run the transformer.
        ``inputs`` may be a single tensor of input ids, a list/tuple in the order
        ``[input_ids, past, attention_mask, token_type_ids, position_ids, head_mask]``,
        or a dict keyed by those names. Values found in ``inputs`` take precedence over the keyword arguments.
        Returns:
            tuple ``(last hidden state, presents, (all hidden_states), (attentions))``; the optional
            entries are present only when the corresponding config flag is set.
        Raises:
            NotImplementedError: if a ``head_mask`` is supplied.
        """
        if isinstance(inputs, (tuple, list)):
            input_ids = inputs[0]
            past = inputs[1] if len(inputs) > 1 else past
            attention_mask = inputs[2] if len(inputs) > 2 else attention_mask
            token_type_ids = inputs[3] if len(inputs) > 3 else token_type_ids
            position_ids = inputs[4] if len(inputs) > 4 else position_ids
            head_mask = inputs[5] if len(inputs) > 5 else head_mask
            assert len(inputs) <= 6, "Too many inputs."
        elif isinstance(inputs, dict):
            input_ids = inputs.get('input_ids')
            past = inputs.get('past', past)
            attention_mask = inputs.get('attention_mask', attention_mask)
            token_type_ids = inputs.get('token_type_ids', token_type_ids)
            position_ids = inputs.get('position_ids', position_ids)
            head_mask = inputs.get('head_mask', head_mask)
            assert len(inputs) <= 6, "Too many inputs."
        else:
            input_ids = inputs
        if past is None:
            past_length = 0
            past = [None] * len(self.h)
        else:
            # Number of positions already cached, read from the first layer's past.
            past_length = shape_list(past[0][0])[-2]
        if position_ids is None:
            # Default positions continue right after the cached ones.
            position_ids = tf.range(past_length, shape_list(input_ids)[-1] + past_length, dtype=tf.int32)[tf.newaxis, :]
        if attention_mask is not None:
            # We create a 3D attention mask from a 2D tensor mask.
            # Sizes are [batch_size, 1, 1, to_seq_length]
            # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
            # this attention mask is more simple than the triangular masking of causal attention
            # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
            attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :]
            # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
            # masked positions, this operation will create a tensor which is 0.0 for
            # positions we want to attend and -10000.0 for masked positions.
            # Since we are adding it to the raw scores before the softmax, this is
            # effectively the same as removing these entirely.
            attention_mask = tf.cast(attention_mask, tf.float32)
            attention_mask = (1.0 - attention_mask) * -10000.0
        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        # attention_probs has shape bsz x n_heads x N x N
        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        if head_mask is not None:
            raise NotImplementedError
        else:
            head_mask = [None] * self.num_hidden_layers
            # head_mask = tf.constant([0] * self.num_hidden_layers)
        input_shape = shape_list(input_ids)
        # Flatten any leading dimensions so the blocks see [-1, sequence_length].
        input_ids = tf.reshape(input_ids, [-1, input_shape[-1]])
        position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]])
        inputs_embeds = self.wte(input_ids, mode='embedding')
        position_embeds = self.wpe(position_ids)
        if token_type_ids is not None:
            token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]])
            # Token types are embedded with the same table as the input tokens.
            token_type_embeds = self.wte(token_type_ids, mode='embedding')
        else:
            token_type_embeds = 0
        hidden_states = inputs_embeds + position_embeds + token_type_embeds
        hidden_states = self.drop(hidden_states, training=training)
        # Shape used to restore the original leading dimensions on the outputs.
        output_shape = input_shape + [shape_list(hidden_states)[-1]]
        presents = ()
        all_attentions = []
        all_hidden_states = ()
        for i, (block, layer_past) in enumerate(zip(self.h, past)):
            if self.output_hidden_states:
                all_hidden_states = all_hidden_states + (tf.reshape(hidden_states, output_shape),)
            outputs = block([hidden_states, layer_past, attention_mask, head_mask[i]], training=training)
            hidden_states, present = outputs[:2]
            presents = presents + (present,)
            if self.output_attentions:
                all_attentions.append(outputs[2])
        hidden_states = self.ln_f(hidden_states)
        hidden_states = tf.reshape(hidden_states, output_shape)
        # Add last hidden state
        if self.output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)
        outputs = (hidden_states, presents)
        if self.output_hidden_states:
            outputs = outputs + (all_hidden_states,)
        if self.output_attentions:
            # let the number of heads free (-1) so we can extract attention even after head pruning
            attention_output_shape = input_shape[:-1] + [-1] + shape_list(all_attentions[0])[-2:]
            all_attentions = tuple(tf.reshape(t, attention_output_shape) for t in all_attentions)
            outputs = outputs + (all_attentions,)
        return outputs  # last hidden state, presents, (all hidden_states), (attentions)
 class TFGPT2PreTrainedModel(TFPreTrainedModel):
    """ An abstract class to handle weights initialization and
        a simple interface for downloading and loading pretrained models.
    """
    # Configuration class used by the GPT-2 models.
    config_class = GPT2Config
    # Shortcut model name -> URL of the pretrained TF 2.0 weights.
    pretrained_model_archive_map = TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP
    # Function that loads a PyTorch checkpoint into a TF 2.0 model.
    load_pt_weights = load_gpt2_pt_weights_in_tf2
    # Presumably the attribute name holding the main layer (TFGPT2Model stores it as `self.transformer`) - confirm in TFPreTrainedModel.
    base_model_prefix = "transformer"
 GPT2_START_DOCSTRING = r"""    OpenAI GPT-2 model was proposed in
    `Language Models are Unsupervised Multitask Learners`_
    by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
    It's a causal (unidirectional) transformer pre-trained using  language modeling on a very large
    corpus of ~40 GB of text data.
    This model is a `tf.keras.Model`_ sub-class. Use it as a regular TF 2.0 Keras Model and
    refer to the TF 2.0 documentation for all matters related to general usage and behavior.
    .. _`Language Models are Unsupervised Multitask Learners`:
        https://openai.com/blog/better-language-models/
    .. _`tf.keras.Model`:
        https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model
    Note on the model inputs:
        TF 2.0 models accept two formats as inputs:
            - having all inputs as keyword arguments (like PyTorch models), or
            - having all inputs as a list, tuple or dict in the first positional arguments.
        This second option is useful when using `tf.keras.Model.fit()` method which currently requires having all the tensors in the first argument of the model call function: `model(inputs)`.
        If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument :
        - a single Tensor with input_ids only and nothing else: `model(input_ids)`
        - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
            `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
        - a dictionary with one or several input Tensors associated to the input names given in the docstring:
            `model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
    Parameters:
        config (:class:`~transformers.GPT2Config`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the configuration.
            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
 GPT2_INPUTS_DOCSTRING = r"""    Inputs:
        **input_ids**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
            Indices of input sequence tokens in the vocabulary.
            GPT-2 is a model with absolute position embeddings so it's usually advised to pad the inputs on
            the right rather than the left.
            Indices can be obtained using :class:`transformers.GPT2Tokenizer`.
            See :func:`transformers.PreTrainedTokenizer.encode` and
            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
        **past**: (`optional`)
            list of ``Numpy array`` or ``tf.Tensor`` (one for each layer):
            that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
            (see `past` output below). Can be used to speed up sequential decoding.
        **attention_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
        **token_type_ids**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
            A parallel sequence of tokens (can be used to indicate various portions of the inputs).
            The embeddings from these tokens will be summed with the respective token embeddings.
            Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
        **position_ids**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
            Indices of positions of each input sequence tokens in the position embeddings.
            Selected in the range ``[0, config.max_position_embeddings - 1]``.
        **head_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
 """
@add_start_docstrings("The bare GPT2 Model transformer outputing raw hidden-states without any specific head on top.",
                      GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING)
 class TFGPT2Model(TFGPT2PreTrainedModel):
    r"""
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)``
            Sequence of hidden-states at the last layer of the model.
        **past**:
            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            that contains pre-computed hidden-states (key and values in the attention blocks).
            Can be used (see `past` input) to speed up sequential decoding.
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
    Examples::
        import tensorflow as tf
        from transformers import GPT2Tokenizer, TFGPT2Model
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        model = TFGPT2Model.from_pretrained('gpt2')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
        outputs = model(input_ids)
        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFGPT2Model, self).__init__(config, *inputs, **kwargs)
        self.transformer = TFGPT2MainLayer(config, name='transformer')
    def call(self, inputs, **kwargs):
        outputs = self.transformer(inputs, **kwargs)
        return outputs
@add_start_docstrings("""The GPT2 Model transformer with a language modeling head on top
 (linear layer with weights tied to the input embeddings). """, GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING)
 class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):
    r"""
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **prediction_scores**: `tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        **past**:
            list of `tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            that contains pre-computed hidden-states (key and values in the attention blocks).
            Can be used (see `past` input) to speed up sequential decoding.
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of `tf.Tensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of `tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
    Examples::
        import tensorflow as tf
        from transformers import GPT2Tokenizer, TFGPT2LMHeadModel
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        model = TFGPT2LMHeadModel.from_pretrained('gpt2')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
        outputs = model(input_ids)
        logits = outputs[0]
    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFGPT2LMHeadModel, self).__init__(config, *inputs, **kwargs)
        self.transformer = TFGPT2MainLayer(config, name='transformer')
    def call(self, inputs, **kwargs):
        transformer_outputs = self.transformer(inputs, **kwargs)
        hidden_states = transformer_outputs[0]
        lm_logits = self.transformer.wte(hidden_states, mode="linear")
        outputs = (lm_logits,) + transformer_outputs[1:]
        return outputs  # lm_logits, presents, (all hidden_states), (attentions)
@add_start_docstrings("""The GPT2 Model transformer with a language modeling and a multiple-choice classification
 head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
 The language modeling head has its weights tied to the input embeddings,
 the classification head takes as input the input of a specified classification token index in the input sequence).
 """, GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING)
 class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
    r"""
        **mc_token_ids**: (`optional`, default to index of the last token of the input) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, num_choices)``:
            Index of the classification token in each input sequence.
            Selected in the range ``[0, input_ids.size(-1) - 1[``.
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **lm_prediction_scores**: `tf.Tensor`` of shape ``(batch_size, num_choices, sequence_length, config.vocab_size)``
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        **mc_prediction_scores**: `tf.Tensor`` of shape ``(batch_size, num_choices)``
            Prediction scores of the multiplechoice classification head (scores for each choice before SoftMax).
        **past**:
            list of `tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            that contains pre-computed hidden-states (key and values in the attention blocks).
            Can be used (see `past` input) to speed up sequential decoding.
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of `tf.Tensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of `tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
    Examples::
        import tensorflow as tf
        from transformers import GPT2Tokenizer, TFGPT2DoubleHeadsModel
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        model = TFGPT2DoubleHeadsModel.from_pretrained('gpt2')
        # Add a [CLS] to the vocabulary (we should train it also!)
        # This option is currently not implemented in TF 2.0
        raise NotImplementedError
        tokenizer.add_special_tokens({'cls_token': '[CLS]'})
        model.resize_token_embeddings(len(tokenizer))  # Update the model embeddings with the new vocabulary size
        print(tokenizer.cls_token_id, len(tokenizer))  # The newly token the last token of the vocabulary
        choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
        encoded_choices = [tokenizer.encode(s) for s in choices]
        cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]
        input_ids = tf.constant(encoded_choices)[None, :]  # Batch size: 1, number of choices: 2
        mc_token_ids = tf.constant([cls_token_location])  # Batch size: 1
        outputs = model(input_ids, mc_token_ids=mc_token_ids)
        lm_prediction_scores, mc_prediction_scores = outputs[:2]
    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFGPT2DoubleHeadsModel, self).__init__(config, *inputs, **kwargs)
        self.transformer = TFGPT2MainLayer(config, name='transformer')
        self.multiple_choice_head = TFSequenceSummary(config, initializer_range=config.initializer_range, name='multiple_choice_head')
    def call(self, inputs, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, mc_token_ids=None, training=False):
        if isinstance(inputs, (tuple, list)):
            input_ids = inputs[0]
            past = inputs[1] if len(inputs) > 1 else past
            attention_mask = inputs[2] if len(inputs) > 2 else attention_mask
            token_type_ids = inputs[3] if len(inputs) > 3 else token_type_ids
            position_ids = inputs[4] if len(inputs) > 4 else position_ids
            head_mask = inputs[5] if len(inputs) > 5 else head_mask
            mc_token_ids = inputs[6] if len(inputs) > 6 else mc_token_ids
            assert len(inputs) <= 7, "Too many inputs."
        elif isinstance(inputs, dict):
            input_ids = inputs.get('input_ids')
            past = inputs.get('past', past)
            attention_mask = inputs.get('attention_mask', attention_mask)
            token_type_ids = inputs.get('token_type_ids', token_type_ids)
            position_ids = inputs.get('position_ids', position_ids)
            head_mask = inputs.get('head_mask', head_mask)
            mc_token_ids = inputs.get('mc_token_ids', mc_token_ids)
            assert len(inputs) <= 7, "Too many inputs."
        else:
            input_ids = inputs
        input_shapes = shape_list(input_ids)
        seq_length = input_shapes[-1]
        flat_input_ids = tf.reshape(input_ids, (-1, seq_length))
        flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
        flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
        flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None
        flat_inputs = [flat_input_ids, past, flat_attention_mask, flat_token_type_ids, flat_position_ids, head_mask]
        transformer_outputs = self.transformer(flat_inputs, training=training)
        hidden_states = transformer_outputs[0]
        hidden_states = tf.reshape(hidden_states, input_shapes + shape_list(hidden_states)[-1:])
        lm_logits = self.transformer.wte(hidden_states, mode="linear")
        mc_logits = self.multiple_choice_head([hidden_states, mc_token_ids], training=training)
        mc_logits = tf.squeeze(mc_logits, axis=-1)
        outputs = (lm_logits, mc_logits) + transformer_outputs[1:]
        return outputs  # lm logits, mc logits, presents, (all hidden_states), (attentions)
--- a/transformers/modeling_tf_openai.py
+++ b/transformers/modeling_tf_openai.py
@@ -0,0 +1,576 @@
 # coding=utf-8
 # Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
 # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """ TF 2.0 OpenAI GPT model."""
 from __future__ import absolute_import, division, print_function, unicode_literals
 import collections
 import json
 import logging
 import math
 import os
 import sys
 from io import open
 import numpy as np
 import tensorflow as tf
 from .modeling_tf_utils import (TFPreTrainedModel, TFConv1D, TFSharedEmbeddings,
                                TFSequenceSummary, shape_list, get_initializer)
 from .configuration_openai import OpenAIGPTConfig
 from .file_utils import add_start_docstrings
 from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model
 # Module-level logger, named after this module.
 logger = logging.getLogger(__name__)
 # Maps each pretrained shortcut name to the URL of its weights file.
 TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP = {
    "openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-tf_model.h5",
 }
 def load_openai_gpt_pt_weights_in_tf2(tf_model, pytorch_checkpoint_path):
    """Load the PyTorch checkpoint at `pytorch_checkpoint_path` into `tf_model`.
    The model is first called once on a small constant batch, then the same batch is handed to
    ``load_pytorch_checkpoint_in_tf2_model`` as `tf_inputs`; that function's result is returned.
    """
    dummy_batch = tf.constant([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]])
    # build the network
    tf_model(dummy_batch, training=False)
    return load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=dummy_batch)
 def gelu(x):
    """Apply the Gaussian Error Linear Unit (GELU), a smoother version of the RELU.
    This implementation uses the tanh-based formula below.
    Original paper: https://arxiv.org/abs/1606.08415
    Args:
        x: float Tensor to perform activation.
    Returns:
        `x` with the GELU activation applied.
    """
    inner = np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))
    cdf = 0.5 * (1.0 + tf.tanh(inner))
    return x * cdf
 def swish(x):
    """Swish activation: `x` multiplied by its own sigmoid."""
    gate = tf.math.sigmoid(x)
    return x * gate
 # Activation lookup by name; "gelu" and "swish" are wrapped in Keras Activation layers,
 # "relu" is the plain Keras activation function.
 ACT_FNS = {}
 ACT_FNS["gelu"] = tf.keras.layers.Activation(gelu)
 ACT_FNS["relu"] = tf.keras.activations.relu
 ACT_FNS["swish"] = tf.keras.layers.Activation(swish)
 class TFAttention(tf.keras.layers.Layer):
    """Multi-head self-attention layer with a causal mask, as used by ``TFBlock``."""
    def __init__(self, nx, n_ctx, config, scale=False, **kwargs):
        super(TFAttention, self).__init__(**kwargs)
        self.output_attentions = config.output_attentions
        # in Attention: n_state=768 (nx=n_embd)
        # [switch nx => n_state from Block to Attention to keep identical to TF implem]
        n_state = nx
        assert n_state % config.n_head == 0
        self.n_ctx = n_ctx
        self.n_head = config.n_head
        self.split_size = n_state
        self.scale = scale
        init_range = config.initializer_range
        # c_attn produces query, key and value in one projection (3 * n_state features).
        self.c_attn = TFConv1D(n_state * 3, nx, initializer_range=init_range, name='c_attn')
        self.c_proj = TFConv1D(n_state, nx, initializer_range=init_range, name='c_proj')
        self.attn_dropout = tf.keras.layers.Dropout(config.attn_pdrop)
        self.resid_dropout = tf.keras.layers.Dropout(config.resid_pdrop)
        self.pruned_heads = set()
    def prune_heads(self, heads):
        """Placeholder: currently does nothing."""
        pass
    @staticmethod
    def causal_attention_mask(nd, ns, dtype):
        """1's in the lower triangle, counting from the lower right corner.
        Same as tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd), but doesn't produce garbage on TPUs.
        """
        rows = tf.range(nd)[:, None]
        cols = tf.range(ns)
        return tf.cast(rows >= cols - ns + nd, dtype)
    def _attn(self, inputs, training=False):
        """Compute attention from `[q, k, v, attention_mask, head_mask]`.
        Returns a list holding the attended values, followed by the attention weights
        when ``self.output_attentions`` is set.
        """
        q, k, v, attention_mask, head_mask = inputs
        # q, k, v have shape [batch, heads, sequence, features]
        scores = tf.matmul(q, k, transpose_b=True)
        if self.scale:
            # scale attention_scores by the square root of the last dimension of k
            scores = scores / tf.math.sqrt(tf.cast(tf.shape(k)[-1], tf.float32))
        # scores has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst.
        _, _, nd, ns = shape_list(scores)
        causal = tf.reshape(self.causal_attention_mask(nd, ns, dtype=scores.dtype), [1, 1, nd, ns])
        # Keep scores where the mask is 1 and replace them by -1e4 where it is 0.
        scores = scores * causal - 1e4 * (1 - causal)
        if attention_mask is not None:
            # Apply the attention mask
            scores = scores + attention_mask
        probs = tf.nn.softmax(scores, axis=-1)
        probs = self.attn_dropout(probs, training=training)
        # Mask heads if we want to
        if head_mask is not None:
            probs = probs * head_mask
        attended = tf.matmul(probs, v)
        return [attended, probs] if self.output_attentions else [attended]
    def merge_heads(self, x):
        """Swap axes 1 and 2 of `x`, then fold its last two axes into one."""
        swapped = tf.transpose(x, [0, 2, 1, 3])
        dims = shape_list(swapped)
        return tf.reshape(swapped, dims[:-2] + [dims[-2] * dims[-1]])
    def split_heads(self, x):
        """Split the last axis of `x` into ``self.n_head`` heads and move the head axis to position 1."""
        dims = shape_list(x)
        per_head = tf.reshape(x, dims[:-1] + [self.n_head, dims[-1] // self.n_head])
        return tf.transpose(per_head, (0, 2, 1, 3))  # (batch, head, seq_length, head_features)
    def call(self, inputs, training=False):
        """Apply self-attention to `[x, attention_mask, head_mask]`; returns `[a]` or `[a, attentions]`."""
        x, attention_mask, head_mask = inputs
        projected = self.c_attn(x)
        query, key, value = [self.split_heads(part) for part in tf.split(projected, 3, axis=2)]
        attn_outputs = self._attn([query, key, value, attention_mask, head_mask], training=training)
        a = self.c_proj(self.merge_heads(attn_outputs[0]))
        a = self.resid_dropout(a, training=training)
        return [a] + attn_outputs[1:]  # a, (attentions)
 class TFMLP(tf.keras.layers.Layer):
    """Feed-forward sub-layer: ``c_fc`` projection, GELU, ``c_proj`` projection, then dropout."""
    def __init__(self, n_state, config, **kwargs):
        super(TFMLP, self).__init__(**kwargs)
        nx = config.n_embd
        init_range = config.initializer_range
        self.c_fc = TFConv1D(n_state, nx, initializer_range=init_range, name='c_fc')
        self.c_proj = TFConv1D(nx, n_state, initializer_range=init_range, name='c_proj')
        self.act = gelu
        self.dropout = tf.keras.layers.Dropout(config.resid_pdrop)
    def call(self, x, training=False):
        """Return ``dropout(c_proj(act(c_fc(x))))``; dropout honours the `training` flag."""
        projected = self.c_proj(self.act(self.c_fc(x)))
        return self.dropout(projected, training=training)
 class TFBlock(tf.keras.layers.Layer):
    """One transformer block: attention and MLP, each followed by a residual add and layer normalization."""
    def __init__(self, n_ctx, config, scale=False, **kwargs):
        super(TFBlock, self).__init__(**kwargs)
        nx = config.n_embd
        eps = config.layer_norm_epsilon
        self.attn = TFAttention(nx, n_ctx, config, scale, name='attn')
        self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=eps, name='ln_1')
        # The MLP is built with an inner size of 4 * n_embd.
        self.mlp = TFMLP(4 * nx, config, name='mlp')
        self.ln_2 = tf.keras.layers.LayerNormalization(epsilon=eps, name='ln_2')
    def call(self, inputs, training=False):
        """Process `[x, attention_mask, head_mask]`; returns `[h]` or `[h, attentions]`."""
        x, attention_mask, head_mask = inputs
        # attn_outputs: a, (attentions)
        attn_outputs = self.attn([x, attention_mask, head_mask], training=training)
        normed = self.ln_1(x + attn_outputs[0])
        h = self.ln_2(normed + self.mlp(normed, training=training))
        return [h] + attn_outputs[1:]  # x, (attentions)
 class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
    """Core OpenAI GPT transformer: token and position embeddings followed by a stack of ``TFBlock`` layers."""
    def __init__(self, config, *inputs, **kwargs):
        # `config` must not be forwarded to the Keras base constructor: its first positional
        # parameter is `trainable`, so the config object would silently be used as that flag.
        super(TFOpenAIGPTMainLayer, self).__init__(*inputs, **kwargs)
        self.output_hidden_states = config.output_hidden_states
        self.output_attentions = config.output_attentions
        self.num_hidden_layers = config.n_layer
        self.vocab_size = config.vocab_size
        self.n_embd = config.n_embd
        self.tokens_embed = TFSharedEmbeddings(config.vocab_size,
                                               config.n_embd,
                                               initializer_range=config.initializer_range,
                                               name='tokens_embed')
        self.positions_embed = tf.keras.layers.Embedding(config.n_positions,
                                                         config.n_embd,
                                                         embeddings_initializer=get_initializer(config.initializer_range),
                                                         name='positions_embed')
        self.drop = tf.keras.layers.Dropout(config.embd_pdrop)
        self.h = [TFBlock(config.n_ctx,
                          config,
                          scale=True,
                          name='h_._{}'.format(i)) for i in range(config.n_layer)]
    def _resize_token_embeddings(self, new_num_tokens):
        """Not implemented for this layer."""
        raise NotImplementedError
    def _prune_heads(self, heads_to_prune):
        """ Prunes heads of the model.
            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
            Not implemented for this layer.
        """
        raise NotImplementedError
    def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, training=False):
        """Run the transformer stack.
        `inputs` may be the input ids alone, a list/tuple ordered as
        (input_ids, attention_mask, token_type_ids, position_ids, head_mask), or a dict keyed by
        those names; values found there take precedence over the keyword arguments.
        Returns a tuple: last hidden state, (all hidden_states), (attentions).
        Raises NotImplementedError when a `head_mask` is supplied.
        """
        if isinstance(inputs, (tuple, list)):
            input_ids = inputs[0]
            attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
            token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids
            position_ids = inputs[3] if len(inputs) > 3 else position_ids
            head_mask = inputs[4] if len(inputs) > 4 else head_mask
            assert len(inputs) <= 5, "Too many inputs."
        elif isinstance(inputs, dict):
            input_ids = inputs.get('input_ids')
            attention_mask = inputs.get('attention_mask', attention_mask)
            token_type_ids = inputs.get('token_type_ids', token_type_ids)
            position_ids = inputs.get('position_ids', position_ids)
            head_mask = inputs.get('head_mask', head_mask)
            assert len(inputs) <= 5, "Too many inputs."
        else:
            input_ids = inputs
        if position_ids is None:
            # Default positions: 0 .. sequence_length - 1, with a leading axis of size 1.
            position_ids = tf.range(shape_list(input_ids)[-1], dtype=tf.int32)[tf.newaxis, :]
        if attention_mask is not None:
            # We create a 3D attention mask from a 2D tensor mask.
            # Sizes are [batch_size, 1, 1, to_seq_length]
            # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
            # this attention mask is more simple than the triangular masking of causal attention
            # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
            attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :]
            # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
            # masked positions, this operation will create a tensor which is 0.0 for
            # positions we want to attend and -10000.0 for masked positions.
            # Since we are adding it to the raw scores before the softmax, this is
            # effectively the same as removing these entirely.
            attention_mask = tf.cast(attention_mask, tf.float32)
            attention_mask = (1.0 - attention_mask) * -10000.0
        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        # attention_probs has shape bsz x n_heads x N x N
        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        if head_mask is not None:
            raise NotImplementedError
        else:
            head_mask = [None] * self.num_hidden_layers
            # head_mask = tf.constant([0] * self.num_hidden_layers)
        input_shape = shape_list(input_ids)
        # Flatten ids to 2D for the embedding lookups; the original shape is restored on the outputs.
        input_ids = tf.reshape(input_ids, [-1, input_shape[-1]])
        position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]])
        inputs_embeds = self.tokens_embed(input_ids, mode='embedding')
        position_embeds = self.positions_embed(position_ids)
        if token_type_ids is not None:
            token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]])
            # Token types are embedded with the same table as the input tokens.
            token_type_embeds = self.tokens_embed(token_type_ids, mode='embedding')
        else:
            token_type_embeds = 0
        hidden_states = inputs_embeds + position_embeds + token_type_embeds
        hidden_states = self.drop(hidden_states, training=training)
        output_shape = input_shape + [shape_list(hidden_states)[-1]]
        all_attentions = []
        all_hidden_states = ()
        for i, block in enumerate(self.h):
            if self.output_hidden_states:
                all_hidden_states = all_hidden_states + (tf.reshape(hidden_states, output_shape),)
            outputs = block([hidden_states, attention_mask, head_mask[i]], training=training)
            hidden_states = outputs[0]
            if self.output_attentions:
                all_attentions.append(outputs[1])
        hidden_states = tf.reshape(hidden_states, output_shape)
        # Add last hidden state
        if self.output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)
        outputs = (hidden_states,)
        if self.output_hidden_states:
            outputs = outputs + (all_hidden_states,)
        if self.output_attentions:
            # let the number of heads free (-1) so we can extract attention even after head pruning
            attention_output_shape = input_shape[:-1] + [-1] + shape_list(all_attentions[0])[-2:]
            all_attentions = tuple(tf.reshape(t, attention_output_shape) for t in all_attentions)
            outputs = outputs + (all_attentions,)
        return outputs  # last hidden state, (all hidden_states), (attentions)
 class TFOpenAIGPTPreTrainedModel(TFPreTrainedModel):
    """ An abstract class to handle weights initialization and
        a simple interface for downloading and loading pretrained models.
    """
    # Attribute name under which the concrete models in this file store their main layer.
    base_model_prefix = "transformer"
    config_class = OpenAIGPTConfig
    # Function used to load PyTorch checkpoint weights into the TF 2.0 model.
    load_pt_weights = load_openai_gpt_pt_weights_in_tf2
    pretrained_model_archive_map = TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP
 # Shared introduction prepended to the docstring of every OpenAI GPT TF model via `add_start_docstrings`.
 OPENAI_GPT_START_DOCSTRING = r"""    OpenAI GPT model was proposed in
    `Improving Language Understanding by Generative Pre-Training`_
    by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
    It's a causal (unidirectional) transformer pre-trained using language modeling on a large
    corpus with long range dependencies, the Toronto Book Corpus.
    This model is a `tf.keras.Model`_ sub-class. Use it as a regular TF 2.0 Keras Model and
    refer to the TF 2.0 documentation for all matters related to general usage and behavior.
    .. _`Improving Language Understanding by Generative Pre-Training`:
        https://openai.com/blog/language-unsupervised/
    .. _`tf.keras.Model`:
        https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model
    Note on the model inputs:
        TF 2.0 models accept two formats as inputs:
            - having all inputs as keyword arguments (like PyTorch models), or
            - having all inputs as a list, tuple or dict in the first positional arguments.
        This second option is useful when using `tf.keras.Model.fit()` method which currently requires having all the tensors in the first argument of the model call function: `model(inputs)`.
        If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument :
        - a single Tensor with input_ids only and nothing else: `model(input_ids)`
        - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
            `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
        - a dictionary with one or several input Tensors associated to the input names given in the docstring:
            `model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
    Parameters:
        config (:class:`~transformers.OpenAIGPTConfig`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the configuration.
            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
 # Shared description of the model inputs, appended to each model docstring via `add_start_docstrings`.
 OPENAI_GPT_INPUTS_DOCSTRING = r"""    Inputs:
        **input_ids**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
            Indices of input sequence tokens in the vocabulary.
            GPT is a model with absolute position embeddings so it's usually advised to pad the inputs on
            the right rather than the left.
            Indices can be obtained using :class:`transformers.OpenAIGPTTokenizer`.
            See :func:`transformers.PreTrainedTokenizer.encode` and
            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
        **attention_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
        **token_type_ids**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
            A parallel sequence of tokens (can be used to indicate various portions of the inputs).
            The embeddings from these tokens will be summed with the respective token embeddings.
            Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices)
        **position_ids**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
            Indices of positions of each input sequence tokens in the position embeddings.
            Selected in the range ``[0, config.max_position_embeddings - 1]``.
        **head_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
 """
@add_start_docstrings("The bare OpenAI GPT transformer model outputing raw hidden-states without any specific head on top.",
                      OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING)
 class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel):
    r"""
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
            Sequence of hidden-states at the last layer of the model.
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
    Examples::
        import tensorflow as tf
        from transformers import OpenAIGPTTokenizer, TFOpenAIGPTModel
        tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
        model = TFOpenAIGPTModel.from_pretrained('openai-gpt')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
        outputs = model(input_ids)
        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFOpenAIGPTModel, self).__init__(config, *inputs, **kwargs)
        self.transformer = TFOpenAIGPTMainLayer(config, name='transformer')
    def call(self, inputs, **kwargs):
        outputs = self.transformer(inputs, **kwargs)
        return outputs
@add_start_docstrings("""OpenAI GPT Model transformer with a language modeling head on top
 (linear layer with weights tied to the input embeddings). """, OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING)
 class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel):
    r"""
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
    Examples::
        import tensorflow as tf
        from transformers import OpenAIGPTTokenizer, TFOpenAIGPTLMHeadModel
        tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
        model = TFOpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
        outputs = model(input_ids)
        logits = outputs[0]
    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFOpenAIGPTLMHeadModel, self).__init__(config, *inputs, **kwargs)
        self.transformer = TFOpenAIGPTMainLayer(config, name='transformer')
    def call(self, inputs, **kwargs):
        transformer_outputs = self.transformer(inputs, **kwargs)
        hidden_states = transformer_outputs[0]
        lm_logits = self.transformer.tokens_embed(hidden_states, mode="linear")
        outputs = (lm_logits,) + transformer_outputs[1:]
        return outputs  # lm_logits, (all hidden_states), (attentions)
@add_start_docstrings("""OpenAI GPT Model transformer with a language modeling and a multiple-choice classification
 head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
 The language modeling head has its weights tied to the input embeddings,
 the classification head takes as input the input of a specified classification token index in the input sequence).
 """, OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING)
 class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
    r"""
        **mc_token_ids**: (`optional`, default to index of the last token of the input) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, num_choices)``:
            Index of the classification token in each input sequence.
            Selected in the range ``[0, input_ids.size(-1) - 1[``.
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **lm_prediction_scores**: ``tf.Tensor`` of shape ``(batch_size, num_choices, sequence_length, config.vocab_size)``
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        **mc_prediction_scores**: ``tf.Tensor`` of shape ``(batch_size, num_choices)``
            Prediction scores of the multiple-choice classification head (scores for each choice before SoftMax).
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
    Examples::
        import tensorflow as tf
        from transformers import OpenAIGPTTokenizer, TFOpenAIGPTDoubleHeadsModel
        tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
        model = TFOpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')
        # Add a [CLS] to the vocabulary (we should train it also!)
        # This option is currently not implemented in TF 2.0
        raise NotImplementedError
        tokenizer.add_special_tokens({'cls_token': '[CLS]'})
        model.resize_token_embeddings(len(tokenizer))  # Update the model embeddings with the new vocabulary size
        print(tokenizer.cls_token_id, len(tokenizer))  # The new token is the last token of the vocabulary
        choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
        input_ids = tf.constant([tokenizer.encode(s) for s in choices])[None, :]  # Batch size 1, 2 choices
        mc_token_ids = tf.constant([input_ids.size(-1), input_ids.size(-1)])[None, :]  # Batch size 1
        outputs = model(input_ids, mc_token_ids=mc_token_ids)
        lm_prediction_scores, mc_prediction_scores = outputs[:2]
    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFOpenAIGPTDoubleHeadsModel, self).__init__(config, *inputs, **kwargs)
        # Main GPT layer; its `tokens_embed` sub-layer is reused in `call` to produce the LM logits.
        self.transformer = TFOpenAIGPTMainLayer(config, name='transformer')
        # Head producing the multiple-choice scores from the hidden states and `mc_token_ids`.
        self.multiple_choice_head = TFSequenceSummary(config, initializer_range=config.initializer_range, name='multiple_choice_head')
    def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, mc_token_ids=None, training=False):
        # `inputs` may be a list/tuple (read positionally in the order below), a dict keyed by
        # argument name, or anything else, in which case it is used directly as input_ids.
        # Values found inside `inputs` take precedence over the keyword arguments.
        if isinstance(inputs, (tuple, list)):
            input_ids = inputs[0]
            attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
            token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids
            position_ids = inputs[3] if len(inputs) > 3 else position_ids
            head_mask = inputs[4] if len(inputs) > 4 else head_mask
            mc_token_ids = inputs[5] if len(inputs) > 5 else mc_token_ids
            assert len(inputs) <= 6, "Too many inputs."
        elif isinstance(inputs, dict):
            input_ids = inputs.get('input_ids')
            attention_mask = inputs.get('attention_mask', attention_mask)
            token_type_ids = inputs.get('token_type_ids', token_type_ids)
            position_ids = inputs.get('position_ids', position_ids)
            head_mask = inputs.get('head_mask', head_mask)
            mc_token_ids = inputs.get('mc_token_ids', mc_token_ids)
            assert len(inputs) <= 6, "Too many inputs."
        else:
            input_ids = inputs
        # Collapse every leading dimension so the transformer receives inputs of shape (-1, seq_length).
        input_shapes = shape_list(input_ids)
        seq_length = input_shapes[-1]
        flat_input_ids = tf.reshape(input_ids, (-1, seq_length))
        flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
        flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
        flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None
        # head_mask is forwarded as given (it is not reshaped).
        flat_inputs = [flat_input_ids, flat_attention_mask, flat_token_type_ids, flat_position_ids, head_mask]
        transformer_outputs = self.transformer(flat_inputs, training=training)
        hidden_states = transformer_outputs[0]
        # Restore the original leading dimensions of input_ids, keeping the last dimension of the hidden states.
        hidden_states = tf.reshape(hidden_states, input_shapes + shape_list(hidden_states)[-1:])
        # Language-modeling logits come from the token embedding layer called in "linear" mode.
        lm_logits = self.transformer.tokens_embed(hidden_states, mode="linear")
        # The multiple-choice head receives the hidden states together with mc_token_ids.
        mc_logits = self.multiple_choice_head([hidden_states, mc_token_ids], training=training)
        # Drop the trailing axis of the multiple-choice logits.
        mc_logits = tf.squeeze(mc_logits, axis=-1)
        # Remaining transformer outputs are appended exactly as returned by the transformer (not reshaped here).
        outputs = (lm_logits, mc_logits) + transformer_outputs[1:]
        return outputs  # lm logits, mc logits, (all hidden_states), (attentions)
--- a/transformers/modeling_tf_pytorch_utils.py
+++ b/transformers/modeling_tf_pytorch_utils.py
@@ -0,0 +1,291 @@
 # coding=utf-8
 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
 # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """ PyTorch - TF 2.0 general utilities."""
 from __future__ import (absolute_import, division, print_function,
                        unicode_literals)
 import logging
 import os
 import re
 import numpy
 # Module-level logger named after this module.
 logger = logging.getLogger(__name__)
 # Small batch of integer ids (3 rows of 5) used to call a TF 2.0 model, so that it gets built,
 # whenever the caller does not supply `tf_inputs`.
 DUMMY_INPUTS = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
 def convert_tf_weight_name_to_pt_weight_name(tf_name, start_prefix_to_remove=''):
    """ Map the name of a TF 2.0 model variable onto the matching PyTorch weight name.

        Conventions applied to the TF 2.0 scopes:
            - a scope written '$1___$2' is reduced to '$2' (lets TF 2.0 duplicate or remove layers relative to PyTorch)
            - '_._' opens a new level (lets a TF 2.0 list stand for a PyTorch nn.ModuleList)

        Args:
            tf_name: name of the TF 2.0 variable, with '/'-separated scopes.
            start_prefix_to_remove: if non-empty, its first occurrence is removed from the resulting name.

        Returns a tuple with:
            - the PyTorch weight name ('.'-separated)
            - transpose: boolean telling whether the TF 2.0 and PyTorch weight matrices are transposed
              with respect to each other
    """
    cleaned = tf_name.replace(':0', '')                        # remove every ':0' occurrence
    cleaned = re.sub(r'/[^/]*___([^/]*)/', r'/\1/', cleaned)   # '/$1___$2/' -> '/$2/'
    cleaned = cleaned.replace('_._', '/')                      # '_._' -> level separation
    cleaned = re.sub(r'//+', '/', cleaned)                     # collapse runs of '/' (empty levels)
    levels = cleaned.split('/')[1:]                            # split into levels and drop level zero
    last_level = levels[-1]
    # Kernels and the 'emb_projs' / 'out_projs' levels are the weights that need transposing
    transpose = last_level == 'kernel' or 'emb_projs' in levels or 'out_projs' in levels
    # Standard TF 2.0 leaf names and their PyTorch counterparts; any other leaf is kept as is
    pt_leaf_names = {'kernel': 'weight', 'embeddings': 'weight', 'gamma': 'weight', 'beta': 'bias'}
    levels[-1] = pt_leaf_names.get(last_level, last_level)
    pt_name = '.'.join(levels)
    if start_prefix_to_remove:
        pt_name = pt_name.replace(start_prefix_to_remove, '', 1)
    return pt_name, transpose
 #####################
 ### PyTorch => TF 2.0
 def load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=None, allow_missing_keys=False):
    """ Read a PyTorch checkpoint file and load its weights in a TF 2.0 model.

        The file at `pytorch_checkpoint_path` is deserialized on CPU with `torch.load`, and the
        resulting state dict is handed, together with `tf_inputs` and `allow_missing_keys`,
        to `load_pytorch_weights_in_tf2_model`, whose result is returned.

        Raises ImportError (after logging an error) when PyTorch or TensorFlow cannot be imported.
    """
    # Both frameworks are required; fail early with installation pointers otherwise
    try:
        import tensorflow as tf
        import torch
    except ImportError as e:
        logger.error("Loading a PyTorch model in TensorFlow, requires both PyTorch and TensorFlow to be installed. Please see "
            "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions.")
        raise e
    checkpoint_file = os.path.abspath(pytorch_checkpoint_path)
    logger.info("Loading PyTorch weights from {}".format(checkpoint_file))
    # NOTE: torch.load unpickles the file; only load checkpoints coming from a trusted source
    return load_pytorch_weights_in_tf2_model(
        tf_model,
        torch.load(checkpoint_file, map_location='cpu'),
        tf_inputs=tf_inputs,
        allow_missing_keys=allow_missing_keys)
 def load_pytorch_model_in_tf2_model(tf_model, pt_model, tf_inputs=None, allow_missing_keys=False):
    """ Load the weights of an instantiated PyTorch model in a TF 2.0 model.

        The state dict of `pt_model` is passed on, with `tf_inputs` and `allow_missing_keys`,
        to `load_pytorch_weights_in_tf2_model`, whose result is returned.
    """
    return load_pytorch_weights_in_tf2_model(
        tf_model, pt_model.state_dict(), tf_inputs=tf_inputs, allow_missing_keys=allow_missing_keys)
 def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, allow_missing_keys=False):
    """ Load pytorch state_dict in a TF 2.0 model.

        Args:
            tf_model: TF 2.0 model to fill in. It is called once before and once after the weights are assigned.
            pt_state_dict: mapping from PyTorch weight names to tensors. Legacy keys containing 'gamma' / 'beta'
                are renamed in place to 'weight' / 'bias', so the dict given by the caller is modified.
            tf_inputs: inputs used to call the model; defaults to `tf.constant(DUMMY_INPUTS)` when None.
            allow_missing_keys: when True, a TF 2.0 weight with no counterpart in `pt_state_dict` is skipped
                and keeps its current value. When False (default), a missing weight raises.

        Returns:
            `tf_model`, with its weights set from `pt_state_dict`.

        Raises:
            ImportError: if PyTorch or TensorFlow cannot be imported.
            AssertionError: if a weight is missing (and `allow_missing_keys` is False) or if a
                PyTorch array does not have the shape of the TF 2.0 weight it should initialize.
    """
    try:
        import torch
        import tensorflow as tf
        from tensorflow.python.keras import backend as K
    except ImportError as e:
        logger.error("Loading a PyTorch model in TensorFlow, requires both PyTorch and TensorFlow to be installed. Please see "
            "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions.")
        raise e
    if tf_inputs is None:
        tf_inputs = tf.constant(DUMMY_INPUTS)
    tf_model(tf_inputs, training=False)  # Make sure model is built
    # Adapt state dict - TODO remove this and update the AWS weights files instead
    # Convert old format to new format if needed from a PyTorch state_dict
    # (keys are collected first so the dict is not modified while it is being iterated)
    old_keys = []
    new_keys = []
    for key in pt_state_dict.keys():
        new_key = None
        if 'gamma' in key:
            new_key = key.replace('gamma', 'weight')
        if 'beta' in key:
            new_key = key.replace('beta', 'bias')
        if new_key:
            old_keys.append(key)
            new_keys.append(new_key)
    for old_key, new_key in zip(old_keys, new_keys):
        pt_state_dict[new_key] = pt_state_dict.pop(old_key)
    # Make sure we are able to load PyTorch base models as well as derived models (with heads)
    # TF models always have a prefix, some of PyTorch models (base ones) don't
    start_prefix_to_remove = ''
    if not any(s.startswith(tf_model.base_model_prefix) for s in pt_state_dict.keys()):
        start_prefix_to_remove = tf_model.base_model_prefix + '.'
    symbolic_weights = tf_model.trainable_weights + tf_model.non_trainable_weights
    weight_value_tuples = []
    # PyTorch entries still unused; whatever remains at the end is reported in the final log message
    all_pytorch_weights = set(list(pt_state_dict.keys()))
    for symbolic_weight in symbolic_weights:
        sw_name = symbolic_weight.name
        name, transpose = convert_tf_weight_name_to_pt_weight_name(sw_name, start_prefix_to_remove=start_prefix_to_remove)
        # Find associated numpy array in pytorch model state dict
        if name not in pt_state_dict:
            if allow_missing_keys:
                # Honor the flag: leave this TF weight untouched instead of failing
                logger.info("TF weight {} not found in PyTorch model, skipping".format(sw_name))
                continue
            raise AssertionError("{} not found in PyTorch model".format(name))
        array = pt_state_dict[name].numpy()
        if transpose:
            array = numpy.transpose(array)
        # Reconcile a difference in rank between the two frameworks before comparing shapes
        if len(symbolic_weight.shape) < len(array.shape):
            array = numpy.squeeze(array)
        elif len(symbolic_weight.shape) > len(array.shape):
            array = numpy.expand_dims(array, axis=0)
        try:
            assert list(symbolic_weight.shape) == list(array.shape)
        except AssertionError as e:
            # Attach both shapes to the error to ease debugging
            e.args += (symbolic_weight.shape, array.shape)
            raise e
        logger.info("Initialize TF weight {}".format(symbolic_weight.name))
        weight_value_tuples.append((symbolic_weight, array))
        all_pytorch_weights.discard(name)
    # Assign all collected values in a single call
    K.batch_set_value(weight_value_tuples)
    tf_model(tf_inputs, training=False)  # Make sure restore ops are run
    logger.info("Weights or buffers not loaded from PyTorch model: {}".format(all_pytorch_weights))
    return tf_model
 #####################
 ### TF 2.0 => PyTorch
 def load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path, tf_inputs=None, allow_missing_keys=False):
    """ Load TF 2.0 HDF5 checkpoint in a PyTorch model
        We use HDF5 to easily do transfer learning
        (see https://github.com/tensorflow/tensorflow/blob/ee16fcac960ae660e0e4496658a366e2f745e1f0/tensorflow/python/keras/engine/network.py#L1352-L1357).

        Args:
            pt_model: PyTorch model to fill in. The TF 2.0 class named "TF" + its class name is looked up
                in the `transformers` package and instantiated with `pt_model.config`.
            tf_checkpoint_path: path of the TF 2.0 checkpoint; it is resolved to an absolute path.
            tf_inputs: inputs used to call (build) the TF 2.0 model; defaults to `tf.constant(DUMMY_INPUTS)` when None.
            allow_missing_keys: forwarded to `load_tf2_model_in_pytorch_model`.

        Returns:
            The value returned by `load_tf2_model_in_pytorch_model`.

        Raises:
            ImportError: if PyTorch or TensorFlow cannot be imported.
    """
    try:
        import tensorflow as tf
        import torch
    except ImportError as e:
        logger.error("Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see "
            "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions.")
        raise e
    import transformers
    # Resolve the path once and use it for both logging and loading, as the PyTorch => TF 2.0 loader does
    tf_path = os.path.abspath(tf_checkpoint_path)
    logger.info("Loading TensorFlow weights from {}".format(tf_path))
    # Instantiate and load the associated TF 2.0 model
    tf_model_class_name = "TF" + pt_model.__class__.__name__  # Add "TF" at the beginning
    tf_model_class = getattr(transformers, tf_model_class_name)
    tf_model = tf_model_class(pt_model.config)
    if tf_inputs is None:
        tf_inputs = tf.constant(DUMMY_INPUTS)
    tf_model(tf_inputs, training=False)  # Make sure model is built
    tf_model.load_weights(tf_path, by_name=True)
    return load_tf2_model_in_pytorch_model(pt_model, tf_model, allow_missing_keys=allow_missing_keys)
 def load_tf2_model_in_pytorch_model(pt_model, tf_model, allow_missing_keys=False):
    """ Copy the weights of a TF 2.0 model into a PyTorch model.

        The variables listed in `tf_model.weights` are handed, with `allow_missing_keys`,
        to `load_tf2_weights_in_pytorch_model`, whose result is returned.
    """
    return load_tf2_weights_in_pytorch_model(pt_model, tf_model.weights, allow_missing_keys=allow_missing_keys)
 def load_tf2_weights_in_pytorch_model(pt_model, tf_weights, allow_missing_keys=False):
    """ Load TF2.0 symbolic weights in a PyTorch model

        Args:
            pt_model: PyTorch model to fill in; it must expose `named_parameters()`, `base_model_prefix`
                and `load_state_dict()`.
            tf_weights: iterable of TF 2.0 variables; each one is read through its `name` and `numpy()`.
            allow_missing_keys: when True, a PyTorch parameter with no counterpart among `tf_weights` is
                left out of the loaded state dict (it keeps its current value and shows up in the logged
                missing keys). When False (default), a missing parameter raises.

        Returns:
            `pt_model`, after `load_state_dict(..., strict=False)` has been applied.

        Raises:
            ImportError: if PyTorch or TensorFlow cannot be imported.
            ValueError: if a PyTorch parameter is missing from the TF 2.0 weights and `allow_missing_keys` is False.
            AssertionError: if a TF 2.0 array does not have the shape of the PyTorch parameter it should initialize.
    """
    try:
        import tensorflow as tf
        import torch
    except ImportError as e:
        logger.error("Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see "
            "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions.")
        raise e
    new_pt_params_dict = {}
    current_pt_params_dict = dict(pt_model.named_parameters())
    # Make sure we are able to load PyTorch base models as well as derived models (with heads)
    # TF models always have a prefix, some of PyTorch models (base ones) don't
    start_prefix_to_remove = ''
    if not any(s.startswith(pt_model.base_model_prefix) for s in current_pt_params_dict.keys()):
        start_prefix_to_remove = pt_model.base_model_prefix + '.'
    # Build a map from potential PyTorch weight names to TF 2.0 Variables
    tf_weights_map = {}
    for tf_weight in tf_weights:
        pt_name, transpose = convert_tf_weight_name_to_pt_weight_name(tf_weight.name, start_prefix_to_remove=start_prefix_to_remove)
        tf_weights_map[pt_name] = (tf_weight.numpy(), transpose)
    # TF 2.0 entries still unused; whatever remains at the end is reported in the final log message
    all_tf_weights = set(list(tf_weights_map.keys()))
    # Tensors already created, keyed by the data pointer of the PyTorch parameter they replace
    loaded_pt_weights_data_ptr = {}
    for pt_weight_name, pt_weight in current_pt_params_dict.items():
        # Handle PyTorch shared weights (not duplicated in TF 2.0): reuse the tensor already loaded
        if pt_weight.data_ptr() in loaded_pt_weights_data_ptr:
            new_pt_params_dict[pt_weight_name] = loaded_pt_weights_data_ptr[pt_weight.data_ptr()]
            continue
        # Find associated numpy array in the TF 2.0 weights map
        if pt_weight_name not in tf_weights_map:
            if allow_missing_keys:
                # Honor the flag: leave the parameter out; load_state_dict reports it as missing below
                continue
            raise ValueError("{} not found in TF 2.0 model".format(pt_weight_name))
        array, transpose = tf_weights_map[pt_weight_name]
        if transpose:
            array = numpy.transpose(array)
        # Reconcile a difference in rank between the two frameworks before comparing shapes
        if len(pt_weight.shape) < len(array.shape):
            array = numpy.squeeze(array)
        elif len(pt_weight.shape) > len(array.shape):
            array = numpy.expand_dims(array, axis=0)
        try:
            assert list(pt_weight.shape) == list(array.shape)
        except AssertionError as e:
            # Attach both shapes to the error to ease debugging
            e.args += (pt_weight.shape, array.shape)
            raise e
        logger.info("Initialize PyTorch weight {}".format(pt_weight_name))
        # Build the tensor once and share it between the state dict and the shared-weight cache
        pt_tensor = torch.from_numpy(array)
        new_pt_params_dict[pt_weight_name] = pt_tensor
        loaded_pt_weights_data_ptr[pt_weight.data_ptr()] = pt_tensor
        all_tf_weights.discard(pt_weight_name)
    missing_keys, unexpected_keys = pt_model.load_state_dict(new_pt_params_dict, strict=False)
    if len(missing_keys) > 0:
        logger.info("Weights of {} not initialized from TF 2.0 model: {}".format(
            pt_model.__class__.__name__, missing_keys))
    if len(unexpected_keys) > 0:
        logger.info("Weights from TF 2.0 model not used in {}: {}".format(
            pt_model.__class__.__name__, unexpected_keys))
    logger.info("Weights or buffers not loaded from TF 2.0 model: {}".format(all_tf_weights))
    return pt_model
--- a/transformers/modeling_tf_roberta.py
+++ b/transformers/modeling_tf_roberta.py
@@ -0,0 +1,382 @@
 # coding=utf-8
 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
 # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """ TF 2.0 RoBERTa model. """
 from __future__ import (absolute_import, division, print_function,
                        unicode_literals)
 import logging
 import numpy as np
 import tensorflow as tf
 from .configuration_roberta import RobertaConfig
 from .modeling_tf_utils import TFPreTrainedModel, get_initializer
 from .file_utils import add_start_docstrings
 from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model
 from .modeling_tf_bert import TFBertEmbeddings, TFBertMainLayer, gelu, gelu_new
 # Module-level logger named after this module.
 logger = logging.getLogger(__name__)
 # Maps each pretrained RoBERTa shortcut name to the URL of its TF 2.0 weights file (.h5).
 TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = {
    'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-tf_model.h5",
    'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-tf_model.h5",
    'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-tf_model.h5",
 }
 def load_roberta_pt_weights_in_tf2(tf_model, pytorch_checkpoint_path):
    """ Build the TF 2.0 model on a small dummy batch, then load a PyTorch checkpoint in it.

        Returns the value returned by `load_pytorch_checkpoint_in_tf2_model`.
    """
    # A first call on the dummy batch builds the network
    tf_inputs = tf.constant([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]])
    tf_model(tf_inputs, training=False)
    return load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=tf_inputs)
 class TFRobertaEmbeddings(TFBertEmbeddings):
    """
    TFBertEmbeddings variant whose default position ids start right after the padding index.
    """
    def __init__(self, config, **kwargs):
        super(TFRobertaEmbeddings, self).__init__(config, **kwargs)
        # Default position ids are offset by this index (see `_embedding`)
        self.padding_idx = 1
    def _embedding(self, inputs, training=False):
        """Applies embedding based on inputs tensor, creating default position ids when none are given."""
        input_ids, position_ids, token_type_ids = inputs
        seq_length = tf.shape(input_ids)[1]
        if position_ids is None:
            # Positions run from padding_idx + 1 up to padding_idx + seq_length, with a leading axis of size 1
            first_position = self.padding_idx + 1
            default_positions = tf.range(first_position, seq_length + first_position, dtype=tf.int32)
            position_ids = default_positions[tf.newaxis, :]
        embedding_inputs = [input_ids, position_ids, token_type_ids]
        return super(TFRobertaEmbeddings, self)._embedding(embedding_inputs, training=training)
 class TFRobertaMainLayer(TFBertMainLayer):
    """
    TFBertMainLayer whose embeddings are replaced by TFRobertaEmbeddings.
    """
    def __init__(self, config, **kwargs):
        super(TFRobertaMainLayer, self).__init__(config, **kwargs)
        # Replace the embeddings set up by the parent constructor
        self.embeddings = TFRobertaEmbeddings(config, name='embeddings')
    def call(self, inputs, **kwargs):
        """Warn when the first token ids do not sum to zero, then delegate to TFBertMainLayer.call."""
        # The input ids are stored under 'input_ids' in a dict, come first in a list/tuple,
        # or are the inputs themselves
        if isinstance(inputs, dict):
            input_ids = inputs.get('input_ids')
        elif isinstance(inputs, (tuple, list)):
            input_ids = inputs[0]
        else:
            input_ids = inputs
        # Check that input_ids starts with control token
        first_token_sum = tf.reduce_sum(input_ids[:, 0])
        if tf.not_equal(first_token_sum, 0):
            logger.warning("A sequence with no special tokens has been passed to the RoBERTa model. "
                           "This model requires special tokens in order to work. "
                           "Please specify add_special_tokens=True in your encoding.")
        return super(TFRobertaMainLayer, self).call(inputs, **kwargs)
 class TFRobertaPreTrainedModel(TFPreTrainedModel):
    """ An abstract class to handle weights initialization and
        a simple interface for downloading and loading pretrained models.
    """
    # Configuration class associated with the RoBERTa models.
    config_class = RobertaConfig
    # Shortcut name -> URL of the pretrained TF 2.0 weights.
    pretrained_model_archive_map = TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
    # Function loading a PyTorch checkpoint in a TF 2.0 RoBERTa model.
    load_pt_weights = load_roberta_pt_weights_in_tf2
    # Prefix of the base model's weight names; same string as the name given to the main layer in TFRobertaModel.
    base_model_prefix = "roberta"
 # Shared introduction (model description, note on input formats, Parameters section) passed to
 # add_start_docstrings when decorating the RoBERTa TF 2.0 model classes.
 ROBERTA_START_DOCSTRING = r"""    The RoBERTa model was proposed in
    `RoBERTa: A Robustly Optimized BERT Pretraining Approach`_
    by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer,
    Veselin Stoyanov. It is based on Google's BERT model released in 2018.
    It builds on BERT and modifies key hyperparameters, removing the next-sentence pretraining
    objective and training with much larger mini-batches and learning rates.
    This implementation is the same as BertModel with a tiny embeddings tweak as well as a setup for Roberta pretrained 
    models.
    This model is a tf.keras.Model `tf.keras.Model`_ sub-class. Use it as a regular TF 2.0 Keras Model and
    refer to the TF 2.0 documentation for all matter related to general usage and behavior.
    .. _`RoBERTa: A Robustly Optimized BERT Pretraining Approach`:
        https://arxiv.org/abs/1907.11692
    .. _`tf.keras.Model`:
        https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model
    Note on the model inputs:
        TF 2.0 models accepts two formats as inputs:
            - having all inputs as keyword arguments (like PyTorch models), or
            - having all inputs as a list, tuple or dict in the first positional arguments.
        This second option is usefull when using `tf.keras.Model.fit()` method which currently requires having all the tensors in the first argument of the model call function: `model(inputs)`.
        If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument :
        - a single Tensor with input_ids only and nothing else: `model(inputs_ids)
        - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
            `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
        - a dictionary with one or several input Tensors associaed to the input names given in the docstring:
            `model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
    Parameters:
        config (:class:`~transformers.RobertaConfig`): Model configuration class with all the parameters of the 
            model. Initializing with a config file does not load the weights associated with the model, only the configuration.
            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
 # Shared description of the model inputs (input_ids, attention_mask, token_type_ids, position_ids,
 # head_mask) passed to add_start_docstrings when decorating the RoBERTa TF 2.0 model classes.
 ROBERTA_INPUTS_DOCSTRING = r"""
    Inputs:
        **input_ids**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
            Indices of input sequence tokens in the vocabulary.
            To match pre-training, RoBERTa input sequence should be formatted with <s> and </s> tokens as follows:
            (a) For sequence pairs:
                ``tokens:         <s> Is this Jacksonville ? </s> </s> No it is not . </s>``
            (b) For single sequences:
                ``tokens:         <s> the dog is hairy . </s>``
            Fully encoded sequences or sequence pairs can be obtained using the RobertaTokenizer.encode function with 
            the ``add_special_tokens`` parameter set to ``True``.
            RoBERTa is a model with absolute position embeddings so it's usually advised to pad the inputs on
            the right rather than the left.
            See :func:`transformers.PreTrainedTokenizer.encode` and
            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
        **attention_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
        **token_type_ids**: (`optional` need to be trained) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
            Optional segment token indices to indicate first and second portions of the inputs.
            This embedding matrice is not trained (not pretrained during RoBERTa pretraining), you will have to train it
            during finetuning.
            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
            corresponds to a `sentence B` token
            (see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details).
        **position_ids**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
            Indices of positions of each input sequence tokens in the position embeddings.
            Selected in the range ``[0, config.max_position_embeddings - 1[``.
        **head_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
 """
@add_start_docstrings("The bare RoBERTa Model transformer outputing raw hidden-states without any specific head on top.",
                      ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
 class TFRobertaModel(TFRobertaPreTrainedModel):
    r"""
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)``
            Sequence of hidden-states at the output of the last layer of the model.
        **pooler_output**: ``tf.Tensor`` of shape ``(batch_size, hidden_size)``
            Last layer hidden-state of the first token of the sequence (classification token)
            further processed by a Linear layer and a Tanh activation function. The Linear
            layer weights are trained from the next sentence prediction (classification)
            objective during Bert pretraining. This output is usually *not* a good summary
            of the semantic content of the input, you're often better with averaging or pooling
            the sequence of hidden-states for the whole input sequence.
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
    Examples::
        import tensorflow as tf
        from transformers import RobertaTokenizer, TFRobertaModel
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        model = TFRobertaModel.from_pretrained('roberta-base')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
        outputs = model(input_ids)
        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFRobertaModel, self).__init__(config, *inputs, **kwargs)
        # Main layer, named 'roberta' (the same string as base_model_prefix on TFRobertaPreTrainedModel).
        self.roberta = TFRobertaMainLayer(config, name='roberta')
    def call(self, inputs, **kwargs):
        # Thin wrapper: the outputs of the main layer are returned unchanged.
        outputs = self.roberta(inputs, **kwargs)
        return outputs
 class TFRobertaLMHead(tf.keras.layers.Layer):
    """Masked language modeling head for RoBERTa: dense -> gelu -> layer norm -> tied decoder plus bias."""
    def __init__(self, config, input_embeddings, **kwargs):
        super(TFRobertaLMHead, self).__init__(**kwargs)
        self.vocab_size = config.vocab_size
        dense_initializer = get_initializer(config.initializer_range)
        self.dense = tf.keras.layers.Dense(config.hidden_size, kernel_initializer=dense_initializer, name='dense')
        self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='layer_norm')
        self.act = tf.keras.layers.Activation(gelu)
        # The decoder is the input embeddings layer itself (shared weights);
        # only the per-token output bias created in `build` belongs to this head.
        self.decoder = input_embeddings
    def build(self, input_shape):
        # One bias value per vocabulary entry, initialized to zero
        self.bias = self.add_weight(shape=(self.vocab_size,), initializer='zeros', trainable=True, name='bias')
        super(TFRobertaLMHead, self).build(input_shape)
    def call(self, features):
        """Transform `features` and project them back to the size of the vocabulary."""
        transformed = self.layer_norm(self.act(self.dense(features)))
        # project back to size of vocabulary with bias
        return self.decoder(transformed, mode="linear") + self.bias
@add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """,
    ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
 class TFRobertaForMaskedLM(TFRobertaPreTrainedModel):
    r"""
        **masked_lm_labels**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
            Labels for computing the masked language modeling loss.
            Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
            Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
            in ``[0, ..., config.vocab_size]``
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **loss**: (`optional`, returned when ``masked_lm_labels`` is provided) ``tf.Tensor`` of shape ``(1,)``:
            Masked language modeling loss.
        **prediction_scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
    Examples::
        import tensorflow as tf
        from transformers import RobertaTokenizer, TFRobertaForMaskedLM
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        model = TFRobertaForMaskedLM.from_pretrained('roberta-base')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
        outputs = model(input_ids, masked_lm_labels=input_ids)
        prediction_scores = outputs[0]
    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFRobertaForMaskedLM, self).__init__(config, *inputs, **kwargs)
        self.roberta = TFRobertaMainLayer(config, name="roberta")
        self.lm_head = TFRobertaLMHead(config, self.roberta.embeddings, name="lm_head")
    def call(self, inputs, **kwargs):
        outputs = self.roberta(inputs, **kwargs)
        sequence_output = outputs[0]
        prediction_scores = self.lm_head(sequence_output)
        outputs = (prediction_scores,) + outputs[2:]  # Add hidden states and attention if they are here
        return outputs  # prediction_scores, (hidden_states), (attentions)
 class TFRobertaClassificationHead(tf.keras.layers.Layer):
    """Head for sentence-level classification tasks.

    Takes the features of the first token, then applies
    dropout -> dense(tanh) -> dropout -> output projection to ``config.num_labels``.
    """
    def __init__(self, config, **kwargs):
        """Create the sub-layers.

        Args:
            config: model configuration; ``hidden_size``, ``initializer_range``,
                ``hidden_dropout_prob`` and ``num_labels`` are read from it.
            **kwargs: forwarded to ``tf.keras.layers.Layer`` (e.g. ``name``).
        """
        # Only keyword arguments go to the Keras base class: passing `config`
        # positionally would bind it to the base constructor's first
        # parameter (`trainable`).
        super(TFRobertaClassificationHead, self).__init__(**kwargs)
        self.dense = tf.keras.layers.Dense(config.hidden_size,
                                           kernel_initializer=get_initializer(config.initializer_range),
                                           activation='tanh',
                                           name="dense")
        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
        self.out_proj = tf.keras.layers.Dense(config.num_labels,
                                              kernel_initializer=get_initializer(config.initializer_range),
                                              name="out_proj")
    def call(self, features, training=False):
        """Return classification scores computed from the first token of ``features``.

        Args:
            features: tensor indexed as ``[batch, position, hidden]``.
            training: whether dropout is active.
        """
        x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
        x = self.dropout(x, training=training)
        x = self.dense(x)
        x = self.dropout(x, training=training)
        x = self.out_proj(x)
        return x
@add_start_docstrings("""RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer 
    on top of the pooled output) e.g. for GLUE tasks. """,
    ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
 class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel):
    r"""
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **logits**: ``tf.Tensor`` of shape ``(batch_size, config.num_labels)``
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
    Examples::
        import tensorflow as tf
        from transformers import RobertaTokenizer, TFRobertaForSequenceClassification
        tokenizer = RoertaTokenizer.from_pretrained('roberta-base')
        model = TFRobertaForSequenceClassification.from_pretrained('roberta-base')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
        labels = tf.constant([1])[None, :]  # Batch size 1
        outputs = model(input_ids)
        logits = outputs[0]
    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFRobertaForSequenceClassification, self).__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels
        self.roberta = TFRobertaMainLayer(config, name="roberta")
        self.classifier = TFRobertaClassificationHead(config, name="classifier")
    def call(self, inputs, **kwargs):
        outputs = self.roberta(inputs, **kwargs)
        sequence_output = outputs[0]
        logits = self.classifier(sequence_output, training=kwargs.get('training', False))
        outputs = (logits,) + outputs[2:]
        return outputs  # logits, (hidden_states), (attentions)
--- a/transformers/modeling_tf_transfo_xl.py
+++ b/transformers/modeling_tf_transfo_xl.py
@@ -0,0 +1,763 @@
 # coding=utf-8
 # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
 # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """ TF 2.0 Transformer XL model.
 """
 from __future__ import absolute_import, division, print_function, unicode_literals
 import os
 import json
 import math
 import logging
 import collections
 import sys
 from io import open
 import numpy as np
 import tensorflow as tf
 from .configuration_transfo_xl import TransfoXLConfig
 from .modeling_tf_utils import TFPreTrainedModel, TFConv1D, TFSequenceSummary, shape_list, get_initializer
 from .modeling_tf_transfo_xl_utilities import TFAdaptiveSoftmaxMask
 from .file_utils import add_start_docstrings
 from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model
 # Module-level logger, named after this module.
 logger = logging.getLogger(__name__)
 # Maps each pretrained model shortcut name to the URL of its TF weights file (.h5).
 TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP = {
    'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-tf_model.h5",
 }
 def load_transfo_xl_pt_weights_in_tf2(tf_model, pytorch_checkpoint_path):
    """Load a PyTorch Transformer-XL checkpoint into ``tf_model``.

    The model is first called once on a small constant batch of token ids so
    that the network is built, then the same batch is handed to
    ``load_pytorch_checkpoint_in_tf2_model`` whose result is returned.
    """
    dummy_ids = tf.constant([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]])
    # Forward pass only to build the network; its output is not needed.
    tf_model(dummy_ids, training=False)
    return load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=dummy_ids)
 class TFPositionalEmbedding(tf.keras.layers.Layer):
    """Sinusoidal embedding of a 1-D sequence of positions.

    For every position the sines and cosines of ``position * inv_freq`` are
    concatenated along the last axis.
    """
    def __init__(self, demb, **kwargs):
        """Precompute the inverse frequencies ``1 / 10000 ** (k / demb)`` for ``k = 0, 2, 4, ...``."""
        super(TFPositionalEmbedding, self).__init__(**kwargs)
        self.inv_freq = 1 / (10000 ** (tf.range(0, demb, 2.0) / demb))
    def call(self, pos_seq, bsz=None):
        """Embed ``pos_seq``.

        Returns a tensor with a singleton middle axis inserted; when ``bsz``
        is given, that axis is tiled ``bsz`` times.
        """
        angles = tf.einsum('i,j->ij', pos_seq, self.inv_freq)
        pos_emb = tf.concat([tf.sin(angles), tf.cos(angles)], -1)[:, None, :]
        if bsz is None:
            return pos_emb
        return tf.tile(pos_emb, [1, bsz, 1])
 class TFPositionwiseFF(tf.keras.layers.Layer):
    """Position-wise feed-forward block with a residual connection and layer norm.

    The feed-forward net is dense(ReLU) -> dropout -> dense -> dropout.
    With ``pre_lnorm`` the layer norm is applied to the input of the net,
    otherwise to the sum of the input and the net's output.
    """
    def __init__(self, d_model, d_inner, dropout, pre_lnorm=False, layer_norm_epsilon=1e-5, init_std=0.02, **kwargs):
        """Create the two dense layers, their dropouts and the layer norm."""
        super(TFPositionwiseFF, self).__init__(**kwargs)
        self.d_model, self.d_inner, self.dropout = d_model, d_inner, dropout
        self.layer_1 = tf.keras.layers.Dense(
            d_inner, activation=tf.nn.relu, kernel_initializer=get_initializer(init_std), name='CoreNet_._0')
        self.drop_1 = tf.keras.layers.Dropout(dropout)
        self.layer_2 = tf.keras.layers.Dense(
            d_model, kernel_initializer=get_initializer(init_std), name='CoreNet_._3')
        self.drop_2 = tf.keras.layers.Dropout(dropout)
        self.layer_norm = tf.keras.layers.LayerNormalization(name='layer_norm', epsilon=layer_norm_epsilon)
        self.pre_lnorm = pre_lnorm
    def call(self, inp, training=False):
        """Apply the feed-forward block to ``inp``; ``training`` toggles dropout."""
        # Pre-norm variant normalises the input of the feed-forward net.
        ff_in = self.layer_norm(inp) if self.pre_lnorm else inp
        hidden = self.drop_1(self.layer_1(ff_in), training=training)
        hidden = self.drop_2(self.layer_2(hidden), training=training)
        if self.pre_lnorm:
            # Residual connection only.
            return hidden + inp
        # Residual connection followed by layer normalization.
        return self.layer_norm(inp + hidden)
 class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer):
    """Multi-head attention whose score is the sum of a content term and a relative-position term.

    The two bias tensors ``r_w_bias`` / ``r_r_bias`` are either both supplied
    by the caller (shared) or created by this layer in ``build``.
    """
    def __init__(self, n_head, d_model, d_head, dropout, dropatt=0,
                 tgt_len=None, ext_len=None, mem_len=None, pre_lnorm=False,
                 r_r_bias=None, r_w_bias=None, output_attentions=False,
                 layer_norm_epsilon=1e-5, init_std=0.02, **kwargs):
        """Create the projections, dropouts and layer norm.

        ``tgt_len``, ``ext_len`` and ``mem_len`` are accepted but not used here.
        """
        super(TFRelPartialLearnableMultiHeadAttn, self).__init__(**kwargs)
        self.output_attentions = output_attentions
        self.n_head, self.d_model, self.d_head = n_head, d_model, d_head
        self.dropout = dropout

        def bias_free_dense(units, name):
            # All projections of this layer are linear maps without bias.
            return tf.keras.layers.Dense(units,
                                         use_bias=False,
                                         kernel_initializer=get_initializer(init_std),
                                         name=name)

        self.qkv_net = bias_free_dense(3 * n_head * d_head, 'qkv_net')
        self.drop = tf.keras.layers.Dropout(dropout)
        self.dropatt = tf.keras.layers.Dropout(dropatt)
        self.o_net = bias_free_dense(d_model, 'o_net')
        self.layer_norm = tf.keras.layers.LayerNormalization(name='layer_norm', epsilon=layer_norm_epsilon)
        self.scale = 1 / (d_head ** 0.5)
        self.pre_lnorm = pre_lnorm
        # Biases count as shared only when both are supplied by the caller.
        biases_shared = r_r_bias is not None and r_w_bias is not None
        self.r_r_bias = r_r_bias if biases_shared else None
        self.r_w_bias = r_w_bias if biases_shared else None
        self.r_net = bias_free_dense(self.n_head * self.d_head, 'r_net')
    def build(self, input_shape):
        """Create zero-initialised ``(n_head, d_head)`` biases when none were shared."""
        if self.r_r_bias is None or self.r_w_bias is None: # Biases are not shared
            bias_shape = (self.n_head, self.d_head)
            self.r_r_bias = self.add_weight(name='r_r_bias',
                                            shape=bias_shape,
                                            initializer='zeros',
                                            trainable=True)
            self.r_w_bias = self.add_weight(name='r_w_bias',
                                            shape=bias_shape,
                                            initializer='zeros',
                                            trainable=True)
        super(TFRelPartialLearnableMultiHeadAttn, self).build(input_shape)
    def _rel_shift(self, x):
        """Shift a 4-D score tensor via pad / reshape / slice / reshape; the shape is unchanged."""
        orig_shape = shape_list(x)
        padded = tf.pad(x, [[0, 0], [1, 0], [0, 0], [0, 0]])
        swapped = tf.reshape(padded, [orig_shape[1] + 1, orig_shape[0], orig_shape[2], orig_shape[3]])
        trimmed = tf.slice(swapped, [1, 0, 0, 0], [-1, -1, -1, -1])
        return tf.reshape(trimmed, orig_shape)
    def call(self, inputs, training=False):
        """Compute attention.

        Args:
            inputs: the 5-element sequence ``(w, r, attn_mask, mems, head_mask)``;
                ``attn_mask``, ``mems`` and ``head_mask`` may each be ``None``.
            training: whether dropout is active.

        Returns:
            A list holding the attention output and, when
            ``output_attentions`` is set, the attention probabilities.
        """
        w, r, attn_mask, mems, head_mask = inputs
        w_shape = shape_list(w)
        qlen, bsz = w_shape[0], w_shape[1]
        rlen = shape_list(r)[0]
        has_mems = mems is not None
        # With memory, keys/values are computed over [mems; w].
        qkv_input = tf.concat([mems, w], 0) if has_mems else w
        if self.pre_lnorm:
            qkv_input = self.layer_norm(qkv_input)
        w_heads = self.qkv_net(qkv_input)
        r_head_k = self.r_net(r)
        w_head_q, w_head_k, w_head_v = tf.split(w_heads, 3, axis=-1)
        if has_mems:
            # Queries are only needed for the last `qlen` steps.
            w_head_q = w_head_q[-qlen:]
        klen = shape_list(w_head_k)[0]
        w_head_q = tf.reshape(w_head_q, (qlen, bsz, self.n_head, self.d_head))  # qlen x bsz x n_head x d_head
        w_head_k = tf.reshape(w_head_k, (klen, bsz, self.n_head, self.d_head))  # klen x bsz x n_head x d_head
        w_head_v = tf.reshape(w_head_v, (klen, bsz, self.n_head, self.d_head))  # klen x bsz x n_head x d_head
        r_head_k = tf.reshape(r_head_k, (rlen, self.n_head, self.d_head))       # rlen x n_head x d_head
        #### attention score: content term + shifted position term
        content_score = tf.einsum('ibnd,jbnd->ijbn', w_head_q + self.r_w_bias, w_head_k)
        position_score = tf.einsum('ibnd,jnd->ijbn', w_head_q + self.r_r_bias, r_head_k)
        position_score = self._rel_shift(position_score)
        # [qlen x klen x bsz x n_head]
        attn_score = (content_score + position_score) * self.scale
        #### attention probability
        if attn_mask is not None:
            # Masked positions (mask value 1) are pushed to a very negative score.
            attn_mask_t = attn_mask[:, :, None, None]
            attn_score = attn_score * (1 - attn_mask_t) - 1e30 * attn_mask_t
        # softmax over the key axis
        attn_prob = tf.nn.softmax(attn_score, axis=1)
        attn_prob = self.dropatt(attn_prob, training=training)
        # Mask heads if we want to
        if head_mask is not None:
            attn_prob = attn_prob * head_mask
        #### attention vector: [qlen x bsz x n_head x d_head]
        attn_vec = tf.einsum('ijbn,jbnd->ibnd', attn_prob, w_head_v)
        vec_shape = shape_list(attn_vec)
        # Merge the head axes before the output projection.
        attn_vec = tf.reshape(attn_vec, (vec_shape[0], vec_shape[1], self.n_head * self.d_head))
        ##### linear projection
        attn_out = self.drop(self.o_net(attn_vec), training=training)
        residual = w + attn_out
        # Post-norm variant normalises after the residual connection.
        outputs = [residual if self.pre_lnorm else self.layer_norm(residual)]
        if self.output_attentions:
            outputs.append(attn_prob)
        return outputs
 class TFRelPartialLearnableDecoderLayer(tf.keras.layers.Layer):
    """One decoder layer: relative multi-head attention followed by a position-wise feed-forward block."""
    def __init__(self, n_head, d_model, d_head, d_inner, dropout,
                 tgt_len=None, ext_len=None, mem_len=None,
                 dropatt=0., pre_lnorm=False,
                 r_w_bias=None,
                 r_r_bias=None,
                 output_attentions=False,
                 layer_norm_epsilon=1e-5,
                 init_std=0.02,
                 **kwargs):
        """Create the attention sub-layer (``dec_attn``) and the feed-forward sub-layer (``pos_ff``)."""
        super(TFRelPartialLearnableDecoderLayer, self).__init__(**kwargs)
        self.dec_attn = TFRelPartialLearnableMultiHeadAttn(
            n_head, d_model, d_head, dropout,
            tgt_len=tgt_len, ext_len=ext_len, mem_len=mem_len,
            dropatt=dropatt, pre_lnorm=pre_lnorm,
            r_w_bias=r_w_bias, r_r_bias=r_r_bias,
            init_std=init_std,
            output_attentions=output_attentions,
            layer_norm_epsilon=layer_norm_epsilon,
            name='dec_attn')
        self.pos_ff = TFPositionwiseFF(
            d_model, d_inner, dropout,
            pre_lnorm=pre_lnorm,
            init_std=init_std,
            layer_norm_epsilon=layer_norm_epsilon,
            name='pos_ff')
    def call(self, inputs, training=False):
        """Run attention then feed-forward.

        ``inputs`` is the 5-element sequence
        ``(dec_inp, r, dec_attn_mask, mems, head_mask)``. Returns a list whose
        first item is the feed-forward output, followed by any extra items the
        attention sub-layer returned.
        """
        dec_inp, r, dec_attn_mask, mems, head_mask = inputs
        attn_inputs = [dec_inp, r, dec_attn_mask, mems, head_mask]
        attn_outputs = self.dec_attn(attn_inputs, training=training)
        ff_output = self.pos_ff(attn_outputs[0], training=training)
        return [ff_output] + attn_outputs[1:]
 class TFAdaptiveEmbedding(tf.keras.layers.Layer):
    """Adaptive input embedding: the vocabulary is split into clusters at ``cutoffs``.

    Cluster ``i`` gets its own embedding table of width
    ``d_embed // div_val ** i`` and a projection to ``d_proj``. Only
    ``div_val != 1`` is implemented.
    """
    def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, init_std=0.02,
                 sample_softmax=False, **kwargs):
        """Create one embedding table per cluster (``sample_softmax`` is accepted but unused)."""
        super(TFAdaptiveEmbedding, self).__init__(**kwargs)
        self.n_token = n_token
        self.d_embed = d_embed
        self.init_std = init_std
        self.cutoffs = cutoffs + [n_token]
        self.div_val = div_val
        self.d_proj = d_proj
        self.emb_scale = d_proj ** 0.5
        self.cutoff_ends = [0] + self.cutoffs
        self.emb_layers = []
        self.emb_projs = []
        if div_val == 1:
            raise NotImplementedError  # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint
        # Consecutive pairs of cutoff_ends delimit the clusters.
        cluster_bounds = zip(self.cutoff_ends[:-1], self.cutoff_ends[1:])
        for i, (l_idx, r_idx) in enumerate(cluster_bounds):
            table = tf.keras.layers.Embedding(r_idx - l_idx,
                                              d_embed // (div_val ** i),
                                              embeddings_initializer=get_initializer(init_std),
                                              name='emb_layers_._{}'.format(i))
            self.emb_layers.append(table)
    def build(self, input_shape):
        """Create one ``(cluster_width, d_proj)`` projection weight per cluster."""
        for i in range(len(self.cutoffs)):
            proj = self.add_weight(name='emb_projs_._{}'.format(i),
                                   shape=(self.d_embed // (self.div_val ** i), self.d_proj),
                                   initializer=get_initializer(self.init_std),
                                   trainable=True)
            self.emb_projs.append(proj)
        super(TFAdaptiveEmbedding, self).build(input_shape)
    def call(self, inp):
        """Embed the token ids ``inp``; the result has shape ``shape(inp) + [d_proj]`` and is scaled by ``d_proj ** 0.5``."""
        if self.div_val == 1:
            raise NotImplementedError  # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint
        inp_flat = tf.reshape(inp, (-1,))
        emb_flat = tf.zeros([shape_list(inp_flat)[0], self.d_proj])
        for i in range(len(self.cutoffs)):
            l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
            # Select the ids falling in this cluster and re-base them to the cluster's table.
            in_cluster = (inp_flat >= l_idx) & (inp_flat < r_idx)
            local_ids = tf.boolean_mask(inp_flat, in_cluster) - l_idx
            cluster_emb = tf.einsum('id,de->ie', self.emb_layers[i](local_ids), self.emb_projs[i])
            # Scatter the projected rows back to their positions in the flat output.
            positions = tf.cast(tf.where(in_cluster), dtype=tf.int64)
            emb_flat = emb_flat + tf.scatter_nd(positions, cluster_emb, tf.cast(tf.shape(emb_flat), dtype=tf.int64))
        embed = tf.reshape(emb_flat, shape_list(inp) + [self.d_proj])
        return embed * self.emb_scale
 class TFTransfoXLMainLayer(tf.keras.layers.Layer):
    """Core Transformer-XL network.

    An adaptive word embedding feeds a stack of ``config.n_layer``
    relative-attention decoder layers. The inputs of each layer are cached as
    memory (``mems``) so that a later call can attend to them. Only
    ``config.attn_type == 0`` is implemented.
    """
    def __init__(self, config, **kwargs):
        """Create the embedding, the decoder layers and the positional embedding from ``config``."""
        super(TFTransfoXLMainLayer, self).__init__(**kwargs)
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states
        self.n_token = config.n_token
        self.d_embed = config.d_embed
        self.d_model = config.d_model
        self.n_head = config.n_head
        self.d_head = config.d_head
        self.untie_r = config.untie_r
        self.word_emb = TFAdaptiveEmbedding(config.n_token, config.d_embed, config.d_model, config.cutoffs,
                                            div_val=config.div_val, init_std=config.init_std, name='word_emb')
        self.drop = tf.keras.layers.Dropout(config.dropout)
        self.n_layer = config.n_layer
        self.tgt_len = config.tgt_len
        self.mem_len = config.mem_len
        self.ext_len = config.ext_len
        self.max_klen = config.tgt_len + config.ext_len + config.mem_len
        self.attn_type = config.attn_type
        self.layers = []
        if config.attn_type == 0: # the default attention
            for i in range(config.n_layer):
                # NOTE(review): when `untie_r` is False this reads
                # self.r_w_bias / self.r_r_bias, which are only assigned in
                # `build()` below - confirm this path works before relying on
                # shared biases.
                self.layers.append(
                    TFRelPartialLearnableDecoderLayer(
                        config.n_head, config.d_model, config.d_head, config.d_inner, config.dropout,
                        tgt_len=config.tgt_len, ext_len=config.ext_len, mem_len=config.mem_len,
                        dropatt=config.dropatt, pre_lnorm=config.pre_lnorm,
                        r_w_bias=None if self.untie_r else self.r_w_bias,
                        r_r_bias=None if self.untie_r else self.r_r_bias,
                        output_attentions=self.output_attentions,
                        layer_norm_epsilon=config.layer_norm_epsilon,
                        init_std=config.init_std,
                        name='layers_._{}'.format(i))
                )
        else: # learnable embeddings and absolute embeddings
            raise NotImplementedError  # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint
        self.same_length = config.same_length
        self.clamp_len = config.clamp_len
        if self.attn_type == 0: # default attention
            self.pos_emb = TFPositionalEmbedding(self.d_model, name='pos_emb')
        else: # learnable embeddings and absolute embeddings
            raise NotImplementedError  # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint
    def build(self, input_shape):
        """Create the shared ``(n_head, d_head)`` attention biases when ``untie_r`` is False."""
        if not self.untie_r:
            self.r_w_bias = self.add_weight(shape=(self.n_head, self.d_head),
                                            initializer='zeros',
                                            trainable=True,
                                            name='r_w_bias')
            self.r_r_bias = self.add_weight(shape=(self.n_head, self.d_head),
                                            initializer='zeros',
                                            trainable=True,
                                            name='r_r_bias')
        super(TFTransfoXLMainLayer, self).build(input_shape)
    def _resize_token_embeddings(self, new_num_tokens):
        """Return the word embedding layer unchanged (``new_num_tokens`` is ignored)."""
        return self.word_emb
    def backward_compatible(self):
        """Set ``sample_softmax`` to ``-1``."""
        self.sample_softmax = -1
    def reset_length(self, tgt_len, ext_len, mem_len):
        """Overwrite the target, extended-context and memory lengths."""
        self.tgt_len = tgt_len
        self.mem_len = mem_len
        self.ext_len = ext_len
    def _prune_heads(self, heads):
        """Head pruning is not implemented."""
        raise NotImplementedError
    def init_mems(self, data):
        """Return one zero memory of shape ``[mem_len, shape(data)[1], d_model]`` per layer, or ``None`` when ``mem_len`` is 0."""
        if self.mem_len > 0:
            mems = []
            for i in range(self.n_layer):
                empty = tf.zeros([self.mem_len, shape_list(data)[1], self.d_model])
                mems.append(empty)
            return mems
        else:
            return None
    def _update_mems(self, hids, mems, qlen, mlen):
        """Build the memory for the next call from the current memory and hidden states.

        Args:
            hids: list of per-layer hidden states of the current segment.
            mems: list of per-layer memories, or ``None``.
            qlen: length of the current segment.
            mlen: length of the current memory.

        Returns:
            ``None`` when ``mems`` is ``None``; otherwise a list with, for each
            layer, the slice ``[beg_idx:end_idx]`` of ``concat(mems, hids)``,
            detached from the gradient computation.
        """
        # does not deal with None
        if mems is None: return None
        # mems is not None
        assert len(hids) == len(mems), 'len(hids) != len(mems)'
        # There are `mlen + qlen` steps that can be cached into mems
        # For the next step, the last `ext_len` of the `qlen` tokens
        # will be used as the extended context. Hence, we only cache
        # the tokens from `mlen + qlen - self.ext_len - self.mem_len`
        # to `mlen + qlen - self.ext_len`.
        new_mems = []
        end_idx = mlen + max(0, qlen - 0 - self.ext_len)
        beg_idx = max(0, end_idx - self.mem_len)
        for i in range(len(hids)):
            cat = tf.concat([mems[i], hids[i]], axis=0)
            # tf.stop_gradient returns a new tensor, so its result must be
            # kept for the cached memory to be cut off from backpropagation.
            cat = tf.stop_gradient(cat)
            new_mems.append(cat[beg_idx:end_idx])
        return new_mems
    def call(self, inputs, mems=None, head_mask=None, training=False):
        """Run the network.

        Args:
            inputs: either the ``input_ids`` tensor of shape ``[bsz, len]``, a
                list/tuple ``[input_ids, mems, head_mask]`` (trailing items
                optional), or a dict with the keys ``'input_ids'``, ``'mems'``
                and ``'head_mask'``.
            mems: optional list of per-layer memories; initialised with
                ``init_mems`` when ``None``.
            head_mask: must be ``None``.
            training: whether dropout is active.

        Returns:
            A list: last hidden state ``[bsz, len, hidden_dim]``, the new
            memories, then (optionally) all hidden states and all attentions.

        Raises:
            NotImplementedError: if ``head_mask`` is given or ``attn_type != 0``.
        """
        if isinstance(inputs, (tuple, list)):
            input_ids = inputs[0]
            mems = inputs[1] if len(inputs) > 1 else mems
            head_mask = inputs[2] if len(inputs) > 2 else head_mask
            assert len(inputs) <= 3, "Too many inputs."
        elif isinstance(inputs, dict):
            input_ids = inputs.get('input_ids')
            mems = inputs.get('mems', mems)
            head_mask = inputs.get('head_mask', head_mask)
            assert len(inputs) <= 3, "Too many inputs."
        else:
            input_ids = inputs
        # the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library
        # so we transpose here from shape [bsz, len] to shape [len, bsz]
        input_ids = tf.transpose(input_ids, perm=(1, 0))
        if mems is None:
            mems = self.init_mems(input_ids)
        qlen, bsz = shape_list(input_ids)
        # Head masking is not supported by this implementation: every layer
        # receives `None` as its head mask.
        if head_mask is not None:
            raise NotImplementedError
        else:
            head_mask = [None] * self.n_layer
        word_emb = self.word_emb(input_ids)
        mlen = shape_list(mems[0])[0] if mems is not None else 0
        klen = mlen + qlen
        # Build the [qlen, klen] mask: 1 marks a key position that must not be
        # attended to (strictly-upper triangle of the current segment); the
        # memory columns are left at 0.
        attn_mask = tf.ones([qlen, qlen])
        mask_u = tf.linalg.band_part(attn_mask, 0, -1)
        mask_dia = tf.linalg.band_part(attn_mask, 0, 0)
        attn_mask_pad = tf.zeros([qlen, mlen])
        dec_attn_mask = tf.concat([attn_mask_pad, mask_u - mask_dia], 1)
        if self.same_length:
            mask_l = tf.linalg.band_part(attn_mask, -1, 0)
            dec_attn_mask = tf.concat([dec_attn_mask[:, :qlen] + mask_l - mask_dia,
                                       dec_attn_mask[:, qlen:]], 1)
        # ::: PyTorch masking code for reference :::
        # if self.same_length:
        #     all_ones = word_emb.new_ones((qlen, klen), dtype=torch.uint8)
        #     mask_len = klen - self.mem_len
        #     if mask_len > 0:
        #         mask_shift_len = qlen - mask_len
        #     else:
        #         mask_shift_len = qlen
        #     dec_attn_mask = (torch.triu(all_ones, 1+mlen)
        #             + torch.tril(all_ones, -mask_shift_len))[:, :, None] # -1
        # else:
        #     dec_attn_mask = torch.triu(
        #         word_emb.new_ones((qlen, klen), dtype=torch.uint8), diagonal=1+mlen)[:,:,None]
        hids = []
        attentions = []
        if self.attn_type == 0: # default
            # Relative positions klen-1, ..., 0, optionally clamped.
            pos_seq = tf.range(klen-1, -1, -1.0)
            if self.clamp_len > 0:
                pos_seq = tf.minimum(pos_seq, self.clamp_len)
            pos_emb = self.pos_emb(pos_seq)
            core_out = self.drop(word_emb, training=training)
            pos_emb = self.drop(pos_emb, training=training)
            for i, layer in enumerate(self.layers):
                # The input of each layer is what gets cached as memory.
                hids.append(core_out)
                mems_i = None if mems is None else mems[i]
                layer_outputs = layer([core_out, pos_emb, dec_attn_mask,
                                       mems_i, head_mask[i]], training=training)
                core_out = layer_outputs[0]
                if self.output_attentions:
                    attentions.append(layer_outputs[1])
        else: # learnable embeddings and absolute embeddings
            raise NotImplementedError  # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint
        core_out = self.drop(core_out, training=training)
        # Keyword arguments keep qlen/mlen matched to the _update_mems signature.
        new_mems = self._update_mems(hids, mems, qlen=qlen, mlen=mlen)
        # We transpose back here to shape [bsz, len, hidden_dim]
        outputs = [tf.transpose(core_out, perm=(1, 0, 2)), new_mems]
        if self.output_hidden_states:
            # Add last layer and transpose to library standard shape [bsz, len, hidden_dim]
            hids.append(core_out)
            hids = list(tf.transpose(t, perm=(1, 0, 2)) for t in hids)
            outputs.append(hids)
        if self.output_attentions:
            # Transpose to library standard shape [bsz, n_heads, query_seq_len, key_seq_len]
            attentions = list(tf.transpose(t, perm=(2, 3, 0, 1)) for t in attentions)
            outputs.append(attentions)
        return outputs  # last hidden state, new_mems, (all hidden states), (all attentions)
 class TFTransfoXLPreTrainedModel(TFPreTrainedModel):
    """ An abstract class to handle weights initialization and
        a simple interface for downloading and loading pretrained models.
    """
    # Configuration class associated with this model family.
    config_class = TransfoXLConfig
    # Shortcut name -> URL of the pretrained TF weights.
    pretrained_model_archive_map = TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
    # Function used to load a PyTorch checkpoint into the TF 2.0 model.
    load_pt_weights = load_transfo_xl_pt_weights_in_tf2
    # Prefix naming the base model.
    base_model_prefix = "transformer"
 # Shared introduction prepended to the docstrings of the Transformer-XL TF models.
 TRANSFO_XL_START_DOCSTRING = r"""    The Transformer-XL model was proposed in
    `Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context`_
    by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
    It's a causal (uni-directional) transformer with relative positioning (sinusoidal) embeddings which can reuse
    previously computed hidden-states to attend to longer context (memory).
    This model also uses adaptive softmax inputs and outputs (tied).
    This model is a `tf.keras.Model`_ sub-class. Use it as a regular TF 2.0 Keras Model and
    refer to the TF 2.0 documentation for all matters related to general usage and behavior.
    .. _`Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context`:
        https://arxiv.org/abs/1901.02860
    .. _`tf.keras.Model`:
        https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model
    Note on the model inputs:
        TF 2.0 models accept two formats as inputs:
            - having all inputs as keyword arguments (like PyTorch models), or
            - having all inputs as a list, tuple or dict in the first positional arguments.
        This second option is useful when using `tf.keras.Model.fit()` method which currently requires having all the tensors in the first argument of the model call function: `model(inputs)`.
        If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument :
        - a single Tensor with input_ids only and nothing else: `model(input_ids)`
        - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
            `model([input_ids])` or `model([input_ids, mems])`
        - a dictionary with one or several input Tensors associated to the input names given in the docstring:
            `model({'input_ids': input_ids, 'mems': mems})`
    Parameters:
        config (:class:`~transformers.TransfoXLConfig`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the configuration.
            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
 # Shared description of the model inputs (input_ids, mems, head_mask); passed to
 # `add_start_docstrings` together with TRANSFO_XL_START_DOCSTRING.
 TRANSFO_XL_INPUTS_DOCSTRING = r"""
    Inputs:
        **input_ids**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
            Indices of input sequence tokens in the vocabulary.
            Transformer-XL is a model with relative position embeddings so you can either pad the inputs on
            the right or on the left.
            Indices can be obtained using :class:`transformers.TransfoXLTokenizer`.
            See :func:`transformers.PreTrainedTokenizer.encode` and
            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
        **mems**: (`optional`)
            list of ``Numpy array`` or ``tf.Tensor`` (one for each layer):
            that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
            (see `mems` output below). Can be used to speed up sequential decoding and attend to longer context.
        **head_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
 """
@add_start_docstrings("The bare Transformer-XL Model transformer outputting raw hidden-states without any specific head on top.",
                      TRANSFO_XL_START_DOCSTRING, TRANSFO_XL_INPUTS_DOCSTRING)
class TFTransfoXLModel(TFTransfoXLPreTrainedModel):
    r"""
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)``
            Sequence of hidden-states at the last layer of the model.
        **mems**:
            list of ``tf.Tensor`` (one for each layer):
            that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
            (see `mems` input above). Can be used to speed up sequential decoding and attend to longer context.
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import TransfoXLTokenizer, TFTransfoXLModel

        tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
        model = TFTransfoXLModel.from_pretrained('transfo-xl-wt103')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
        outputs = model(input_ids)
        last_hidden_states, mems = outputs[:2]

    """

    def __init__(self, config, *inputs, **kwargs):
        """Build the model from ``config``; extra positional/keyword arguments go to the base class."""
        super(TFTransfoXLModel, self).__init__(config, *inputs, **kwargs)
        # The whole forward computation lives in the main layer; this class only wraps it.
        self.transformer = TFTransfoXLMainLayer(config, name='transformer')

    def call(self, inputs, **kwargs):
        """Run the wrapped main layer on ``inputs`` and return its outputs unchanged.

        ``inputs`` and ``kwargs`` are forwarded as-is to ``self.transformer``.
        """
        return self.transformer(inputs, **kwargs)
@add_start_docstrings("""The Transformer-XL Model with a language modeling head on top
    (adaptive softmax with weights tied to the adaptive input embeddings)""",
    TRANSFO_XL_START_DOCSTRING, TRANSFO_XL_INPUTS_DOCSTRING)
class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
    r"""
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **prediction_scores**: ``None`` if ``labels`` is provided else ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
            We don't output them when the loss is computed to speedup adaptive softmax decoding.
        **mems**:
            list of ``tf.Tensor`` (one for each layer):
            that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
            (see `mems` input above). Can be used to speed up sequential decoding and attend to longer context.
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        import tensorflow as tf
        from transformers import TransfoXLTokenizer, TFTransfoXLLMHeadModel

        tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
        model = TFTransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
        outputs = model(input_ids)
        prediction_scores, mems = outputs[:2]

    """

    def __init__(self, config, *inputs, **kwargs):
        """Build the main layer and the adaptive-softmax head from ``config``.

        Extra positional/keyword arguments are forwarded to the base class,
        mirroring ``TFTransfoXLModel.__init__``.

        Raises:
            NotImplementedError: if ``config.sample_softmax > 0`` (sampled softmax
                is not implemented here).
        """
        super(TFTransfoXLLMHeadModel, self).__init__(config, *inputs, **kwargs)
        self.transformer = TFTransfoXLMainLayer(config, name='transformer')
        self.sample_softmax = config.sample_softmax
        # use sampled softmax
        if config.sample_softmax > 0:
            raise NotImplementedError
        # use adaptive softmax (including standard softmax)
        else:
            self.crit = TFAdaptiveSoftmaxMask(config.n_token, config.d_embed, config.d_model,
                                              config.cutoffs, div_val=config.div_val, name='crit')

    def reset_length(self, tgt_len, ext_len, mem_len):
        """Forward the target/extended/memory lengths to the main layer."""
        self.transformer.reset_length(tgt_len, ext_len, mem_len)

    def init_mems(self, data):
        """Return the memories produced by the main layer's ``init_mems`` for ``data``."""
        return self.transformer.init_mems(data)

    def call(self, inputs, mems=None, head_mask=None, labels=None, training=False):
        """Run the transformer and apply the adaptive-softmax head.

        ``inputs`` may be:
            - a single tensor, taken as ``input_ids``;
            - a list/tuple ordered as ``[input_ids, mems, head_mask, labels]``
              (trailing items optional, missing ones fall back to the keyword arguments);
            - a dict with keys ``'input_ids'``, ``'mems'``, ``'head_mask'``, ``'labels'``.

        Returns:
            A list: the output of the softmax head followed by every output of
            the main layer except its first one (the last hidden state).
        """
        if isinstance(inputs, (tuple, list)):
            input_ids = inputs[0]
            mems = inputs[1] if len(inputs) > 1 else mems
            head_mask = inputs[2] if len(inputs) > 2 else head_mask
            labels = inputs[3] if len(inputs) > 3 else labels
            assert len(inputs) <= 4, "Too many inputs."
        elif isinstance(inputs, dict):
            input_ids = inputs.get('input_ids')
            mems = inputs.get('mems', mems)
            head_mask = inputs.get('head_mask', head_mask)
            labels = inputs.get('labels', labels)
            assert len(inputs) <= 4, "Too many inputs."
        else:
            input_ids = inputs

        # Only the sequence length is needed below; it selects the trailing positions to score.
        tgt_len = shape_list(input_ids)[1]

        transformer_outputs = self.transformer([input_ids, mems, head_mask], training=training)

        last_hidden = transformer_outputs[0]
        pred_hid = last_hidden[:, -tgt_len:]

        if self.sample_softmax > 0 and training:
            # Defensive guard: __init__ already rejects sample_softmax > 0.
            raise NotImplementedError

        softmax_output = self.crit([pred_hid, labels], training=training)

        # list(...) keeps the concatenation valid whether the main layer returns a list or a tuple.
        outputs = [softmax_output] + list(transformer_outputs[1:])

        return outputs  # logits, new_mems, (all hidden states), (all attentions)
--- a/Show More
+++ b/Show More
		`@@ -0,0 +1,3 @@`
							`from .utils import InputExample, InputFeatures, DataProcessor`
							`from .glue import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features`