diff --git a/.circleci/config.yml b/.circleci/config.yml
index 820e8951e5..1bb3fd0877 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -1,7 +1,7 @@
 version: 2
 jobs:
     build_py3_torch_and_tf:
-        working_directory: ~/pytorch-transformers
+        working_directory: ~/transformers
         docker:
             - image: circleci/python:3.5
         resource_class: xlarge
@@ -13,10 +13,10 @@ jobs:
             - run: sudo pip install --progress-bar off .
             - run: sudo pip install pytest codecov pytest-cov
             - run: sudo pip install tensorboardX scikit-learn
-            - run: python -m pytest -sv ./pytorch_transformers/tests/ --cov
+            - run: python -m pytest -sv ./transformers/tests/ --cov
             - run: codecov
     build_py3_torch:
-        working_directory: ~/pytorch-transformers
+        working_directory: ~/transformers
         docker:
             - image: circleci/python:3.5
         resource_class: xlarge
@@ -27,11 +27,11 @@ jobs:
             - run: sudo pip install --progress-bar off .
             - run: sudo pip install pytest codecov pytest-cov
             - run: sudo pip install tensorboardX scikit-learn
-            - run: python -m pytest -sv ./pytorch_transformers/tests/ --cov
+            - run: python -m pytest -sv ./transformers/tests/ --cov
             - run: python -m pytest -sv ./examples/
             - run: codecov
     build_py3_tf:
-        working_directory: ~/pytorch-transformers
+        working_directory: ~/transformers
         docker:
             - image: circleci/python:3.5
         resource_class: xlarge
@@ -42,10 +42,10 @@ jobs:
             - run: sudo pip install --progress-bar off .
             - run: sudo pip install pytest codecov pytest-cov
             - run: sudo pip install tensorboardX scikit-learn
-            - run: python -m pytest -sv ./pytorch_transformers/tests/ --cov
+            - run: python -m pytest -sv ./transformers/tests/ --cov
             - run: codecov
     build_py2_torch:
-        working_directory: ~/pytorch-transformers
+        working_directory: ~/transformers
         resource_class: large
         parallelism: 1
         docker:
@@ -55,10 +55,10 @@ jobs:
             - run: sudo pip install torch
             - run: sudo pip install --progress-bar off .
             - run: sudo pip install pytest codecov pytest-cov
-            - run: python -m pytest -sv ./pytorch_transformers/tests/ --cov
+            - run: python -m pytest -sv ./transformers/tests/ --cov
             - run: codecov
     build_py2_tf:
-        working_directory: ~/pytorch-transformers
+        working_directory: ~/transformers
         resource_class: large
         parallelism: 1
         docker:
@@ -68,10 +68,10 @@ jobs:
             - run: sudo pip install tensorflow==2.0.0-rc0
             - run: sudo pip install --progress-bar off .
             - run: sudo pip install pytest codecov pytest-cov
-            - run: python -m pytest -sv ./pytorch_transformers/tests/ --cov
+            - run: python -m pytest -sv ./transformers/tests/ --cov
             - run: codecov
     deploy_doc:
-        working_directory: ~/pytorch-transformers
+        working_directory: ~/transformers
         docker:
             - image: circleci/python:3.5
         steps:
diff --git a/.coveragerc b/.coveragerc
index fa6c165a8a..9a1103b8af 100644
--- a/.coveragerc
+++ b/.coveragerc
@@ -1,5 +1,5 @@
 [run]
-source=pytorch_transformers
+source=transformers
 omit =
     # skip convertion scripts from testing for now
     */convert_*
diff --git a/.github/ISSUE_TEMPLATE/migration.md b/.github/ISSUE_TEMPLATE/migration.md
index cf0c9a4757..8ce1bc8fdd 100644
--- a/.github/ISSUE_TEMPLATE/migration.md
+++ b/.github/ISSUE_TEMPLATE/migration.md
@@ -1,6 +1,6 @@
 ---
 name: "\U0001F4DA Migration from PyTorch-pretrained-Bert"
-about: Report a problem when migrating from PyTorch-pretrained-Bert to PyTorch-Transformers
+about: Report a problem when migrating from PyTorch-pretrained-Bert to Transformers
 ---
 
 ## 📚 Migration
diff --git a/README.md b/README.md
index 9187250c19..8acdc8eb09 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,8 @@
-# 👾 PyTorch-Transformers
+# 🤗 Transformers
 
-[![CircleCI](https://circleci.com/gh/huggingface/pytorch-transformers.svg?style=svg)](https://circleci.com/gh/huggingface/pytorch-transformers)
+[![CircleCI](https://circleci.com/gh/huggingface/transformers.svg?style=svg)](https://circleci.com/gh/huggingface/transformers)
 
-PyTorch-Transformers (formerly known as `pytorch-pretrained-bert`) is a library of state-of-the-art pre-trained models for Natural Language Processing (NLP).
+Transformers (formerly known as `pytorch-pretrained-bert`) is a library of state-of-the-art pre-trained models for Natural Language Processing (NLP).
 
 The library currently contains PyTorch implementations, pre-trained model weights, usage scripts and conversion utilities for the following models:
 
@@ -13,10 +13,10 @@ The library currently contains PyTorch implementations, pre-trained model weight
 5. **[XLNet](https://github.com/zihangdai/xlnet/)** (from Google/CMU) released with the paper [​XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
 6. **[XLM](https://github.com/facebookresearch/XLM/)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
 7. **[RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/roberta)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
-8. **[DistilBERT](https://github.com/huggingface/pytorch-transformers/tree/master/examples/distillation)** (from HuggingFace), released together with the blogpost [Smaller, faster, cheaper, lighter: Introducing DistilBERT, a distilled version of BERT](https://medium.com/huggingface/distilbert-8cf3380435b5
+8. **[DistilBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation)** (from HuggingFace), released together with the blogpost [Smaller, faster, cheaper, lighter: Introducing DistilBERT, a distilled version of BERT](https://medium.com/huggingface/distilbert-8cf3380435b5
 ) by Victor Sanh, Lysandre Debut and Thomas Wolf.
 
-These implementations have been tested on several datasets (see the example scripts) and should match the performances of the original implementations (e.g. ~93 F1 on SQuAD for BERT Whole-Word-Masking, ~88 F1 on RocStories for OpenAI GPT, ~18.3 perplexity on WikiText 103 for Transformer-XL, ~0.916 Peason R coefficient on STS-B for XLNet). You can find more details on the performances in the Examples section of the [documentation](https://huggingface.co/pytorch-transformers/examples.html).
+These implementations have been tested on several datasets (see the example scripts) and should match the performances of the original implementations (e.g. ~93 F1 on SQuAD for BERT Whole-Word-Masking, ~88 F1 on RocStories for OpenAI GPT, ~18.3 perplexity on WikiText 103 for Transformer-XL, ~0.916 Peason R coefficient on STS-B for XLNet). You can find more details on the performances in the Examples section of the [documentation](https://huggingface.co/transformers/examples.html).
 
 | Section | Description |
 |-|-|
@@ -24,8 +24,8 @@ These implementations have been tested on several datasets (see the example scri
 | [Online demo](#online-demo) | Experimenting with this repo’s text generation capabilities |
 | [Quick tour: Usage](#quick-tour) | Tokenizers & models usage: Bert and GPT-2 |
 | [Quick tour: Fine-tuning/usage scripts](#quick-tour-of-the-fine-tuningusage-scripts) | Using provided scripts: GLUE, SQuAD and Text generation |
-| [Migrating from pytorch-pretrained-bert to pytorch-transformers](#Migrating-from-pytorch-pretrained-bert-to-pytorch-transformers) | Migrating your code from pytorch-pretrained-bert to pytorch-transformers |
-| [Documentation](https://huggingface.co/pytorch-transformers/) | Full API documentation and more |
+| [Migrating from pytorch-pretrained-bert to transformers](#Migrating-from-pytorch-pretrained-bert-to-transformers) | Migrating your code from pytorch-pretrained-bert to transformers |
+| [Documentation](https://huggingface.co/transformers/) | Full API documentation and more |
 
 ## Installation
 
@@ -33,10 +33,10 @@ This repo is tested on Python 2.7 and 3.5+ (examples are tested only on python 3
 
 ### With pip
 
-PyTorch-Transformers can be installed by pip as follows:
+Transformers can be installed by pip as follows:
 
 ```bash
-pip install pytorch-transformers
+pip install transformers
 ```
 
 ### From source
@@ -49,14 +49,14 @@ pip install [--editable] .
 
 ### Tests
 
-A series of tests is included for the library and the example scripts. Library tests can be found in the [tests folder](https://github.com/huggingface/pytorch-transformers/tree/master/pytorch_transformers/tests) and examples tests in the [examples folder](https://github.com/huggingface/pytorch-transformers/tree/master/examples).
+A series of tests is included for the library and the example scripts. Library tests can be found in the [tests folder](https://github.com/huggingface/transformers/tree/master/transformers/tests) and examples tests in the [examples folder](https://github.com/huggingface/transformers/tree/master/examples).
 
 These tests can be run using `pytest` (install pytest if needed with `pip install pytest`).
 
 You can run the tests from the root of the cloned repository with the commands:
 
 ```bash
-python -m pytest -sv ./pytorch_transformers/tests/
+python -m pytest -sv ./transformers/tests/
 python -m pytest -sv ./examples/
 ```
 
@@ -80,13 +80,13 @@ You can use it to experiment with completions generated by `GPT2Model`, `Transfo
 
 ## Quick tour
 
-Let's do a very quick overview of PyTorch-Transformers. Detailed examples for each model architecture (Bert, GPT, GPT-2, Transformer-XL, XLNet and XLM) can be found in the [full documentation](https://huggingface.co/pytorch-transformers/).
+Let's do a very quick overview of Transformers. Detailed examples for each model architecture (Bert, GPT, GPT-2, Transformer-XL, XLNet and XLM) can be found in the [full documentation](https://huggingface.co/transformers/).
 
 ```python
 import torch
-from pytorch_transformers import *
+from transformers import *
 
-# PyTorch-Transformers has a unified API
+# Transformers has a unified API
 # for 7 transformer architectures and 30 pretrained weights.
 #          Model          | Tokenizer          | Pretrained weights shortcut
 MODELS = [(BertModel,       BertTokenizer,      'bert-base-uncased'),
@@ -299,19 +299,19 @@ python ./examples/run_generation.py \
     --model_name_or_path=gpt2 \
 ```
 
-## Migrating from pytorch-pretrained-bert to pytorch-transformers
+## Migrating from pytorch-pretrained-bert to transformers
 
-Here is a quick summary of what you should take care of when migrating from `pytorch-pretrained-bert` to `pytorch-transformers`
+Here is a quick summary of what you should take care of when migrating from `pytorch-pretrained-bert` to `transformers`
 
 ### Models always output `tuples`
 
-The main breaking change when migrating from `pytorch-pretrained-bert` to `pytorch-transformers` is that the models forward method always outputs a `tuple` with various elements depending on the model and the configuration parameters.
+The main breaking change when migrating from `pytorch-pretrained-bert` to `transformers` is that the models forward method always outputs a `tuple` with various elements depending on the model and the configuration parameters.
 
-The exact content of the tuples for each model are detailed in the models' docstrings and the [documentation](https://huggingface.co/pytorch-transformers/).
+The exact content of the tuples for each model are detailed in the models' docstrings and the [documentation](https://huggingface.co/transformers/).
 
 In pretty much every case, you will be fine by taking the first element of the output as the output you previously used in `pytorch-pretrained-bert`.
 
-Here is a `pytorch-pretrained-bert` to `pytorch-transformers` conversion example for a `BertForSequenceClassification` classification model:
+Here is a `pytorch-pretrained-bert` to `transformers` conversion example for a `BertForSequenceClassification` classification model:
 
 ```python
 # Let's load our model
@@ -320,11 +320,11 @@ model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
 # If you used to have this line in pytorch-pretrained-bert:
 loss = model(input_ids, labels=labels)
 
-# Now just use this line in pytorch-transformers to extract the loss from the output tuple:
+# Now just use this line in transformers to extract the loss from the output tuple:
 outputs = model(input_ids, labels=labels)
 loss = outputs[0]
 
-# In pytorch-transformers you can also have access to the logits:
+# In transformers you can also have access to the logits:
 loss, logits = outputs[:2]
 
 # And even the attention weights if you configure the model to output them (and other outputs too, see the docstrings and documentation)
@@ -339,7 +339,7 @@ Breaking change in the `from_pretrained()`method:
 
 1. Models are now set in evaluation mode by default when instantiated with the `from_pretrained()` method. To train them don't forget to set them back in training mode (`model.train()`) to activate the dropout modules.
 
-2. The additional `*input` and `**kwargs` arguments supplied to the `from_pretrained()` method used to be directly passed to the underlying model's class `__init__()` method. They are now used to update the model configuration attribute instead which can break derived model classes build based on the previous `BertForSequenceClassification` examples. We are working on a way to mitigate this breaking change in [#866](https://github.com/huggingface/pytorch-transformers/pull/866) by forwarding the the model `__init__()` method (i) the provided positional arguments and (ii) the keyword arguments which do not match any configuration class attributes.
+2. The additional `*input` and `**kwargs` arguments supplied to the `from_pretrained()` method used to be directly passed to the underlying model's class `__init__()` method. They are now used to update the model configuration attribute instead which can break derived model classes build based on the previous `BertForSequenceClassification` examples. We are working on a way to mitigate this breaking change in [#866](https://github.com/huggingface/transformers/pull/866) by forwarding the the model `__init__()` method (i) the provided positional arguments and (ii) the keyword arguments which do not match any configuration class attributes.
 
 Also, while not a breaking change, the serialization methods have been standardized and you probably should switch to the new method `save_pretrained(save_directory)` if you were using any other serialization method before.
 
@@ -396,7 +396,7 @@ for batch in train_data:
     loss.backward()
     optimizer.step()
 
-### In PyTorch-Transformers, optimizer and schedules are splitted and instantiated like this:
+### In Transformers, optimizer and schedules are splitted and instantiated like this:
 optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False)  # To reproduce BertAdam specific behavior set correct_bias=False
 scheduler = WarmupLinearSchedule(optimizer, warmup_steps=num_warmup_steps, t_total=num_total_steps)  # PyTorch scheduler
 ### and used like this:
@@ -411,4 +411,4 @@ for batch in train_data:
 
 ## Citation
 
-At the moment, there is no paper associated to PyTorch-Transformers but we are working on preparing one. In the meantime, please include a mention of the library and a link to the present repository if you use this work in a published or open-source project.
+At the moment, there is no paper associated to Transformers but we are working on preparing one. In the meantime, please include a mention of the library and a link to the present repository if you use this work in a published or open-source project.
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 1a6c6f06f9..fed834ff88 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -2,6 +2,6 @@ FROM pytorch/pytorch:latest
 
 RUN git clone https://github.com/NVIDIA/apex.git && cd apex && python setup.py install --cuda_ext --cpp_ext
 
-RUN pip install pytorch_transformers
+RUN pip install transformers
 
 WORKDIR /workspace
\ No newline at end of file
diff --git a/docs/source/_static/js/custom.js b/docs/source/_static/js/custom.js
index 4adf2a4672..1d5827c0fa 100644
--- a/docs/source/_static/js/custom.js
+++ b/docs/source/_static/js/custom.js
@@ -16,7 +16,7 @@ function addIcon() {
 function addCustomFooter() {
     const customFooter = document.createElement("div");
     const questionOrIssue = document.createElement("div");
-    questionOrIssue.innerHTML = "Stuck? Read our <a href='https://medium.com/huggingface'>Blog posts</a> or <a href='https://github.com/huggingface/pytorch_transformers'>Create an issue</a>";
+    questionOrIssue.innerHTML = "Stuck? Read our <a href='https://medium.com/huggingface'>Blog posts</a> or <a href='https://github.com/huggingface/transformers'>Create an issue</a>";
     customFooter.appendChild(questionOrIssue);
     customFooter.classList.add("footer");
 
diff --git a/docs/source/bertology.rst b/docs/source/bertology.rst
index 0ea5b53d80..c3d1b2f8b8 100644
--- a/docs/source/bertology.rst
+++ b/docs/source/bertology.rst
@@ -15,4 +15,4 @@ In order to help this new field develop, we have included a few additional featu
 * accessing all the attention weights for each head of BERT/GPT/GPT-2,
 * retrieving heads output values and gradients to be able to compute head importance score and prune head as explained in https://arxiv.org/abs/1905.10650.
 
-To help you understand and use these features, we have added a specific example script: `bertology.py <https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_bertology.py>`_ while extract information and prune a model pre-trained on GLUE.
+To help you understand and use these features, we have added a specific example script: `bertology.py <https://github.com/huggingface/transformers/blob/master/examples/run_bertology.py>`_ while extract information and prune a model pre-trained on GLUE.
diff --git a/docs/source/conf.py b/docs/source/conf.py
index c847dee806..ae9cc67e2c 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -19,7 +19,7 @@ sys.path.insert(0, os.path.abspath('../..'))
 
 # -- Project information -----------------------------------------------------
 
-project = u'pytorch-transformers'
+project = u'transformers'
 copyright = u'2019, huggingface'
 author = u'huggingface'
 
@@ -109,7 +109,7 @@ html_static_path = ['_static']
 # -- Options for HTMLHelp output ---------------------------------------------
 
 # Output file base name for HTML help builder.
-htmlhelp_basename = 'pytorch-transformersdoc'
+htmlhelp_basename = 'transformersdoc'
 
 
 # -- Options for LaTeX output ------------------------------------------------
@@ -136,7 +136,7 @@ latex_elements = {
 # (source start file, target name, title,
 #  author, documentclass [howto, manual, or own class]).
 latex_documents = [
-    (master_doc, 'pytorch-transformers.tex', u'pytorch-transformers Documentation',
+    (master_doc, 'transformers.tex', u'transformers Documentation',
      u'huggingface', 'manual'),
 ]
 
@@ -146,7 +146,7 @@ latex_documents = [
 # One entry per manual page. List of tuples
 # (source start file, name, description, authors, manual section).
 man_pages = [
-    (master_doc, 'pytorch-transformers', u'pytorch-transformers Documentation',
+    (master_doc, 'transformers', u'transformers Documentation',
      [author], 1)
 ]
 
@@ -157,8 +157,8 @@ man_pages = [
 # (source start file, target name, title, author,
 #  dir menu entry, description, category)
 texinfo_documents = [
-    (master_doc, 'pytorch-transformers', u'pytorch-transformers Documentation',
-     author, 'pytorch-transformers', 'One line description of project.',
+    (master_doc, 'transformers', u'transformers Documentation',
+     author, 'transformers', 'One line description of project.',
      'Miscellaneous'),
 ]
 
diff --git a/docs/source/converting_tensorflow_models.rst b/docs/source/converting_tensorflow_models.rst
index 8441c9b1f7..8d805b0ed1 100644
--- a/docs/source/converting_tensorflow_models.rst
+++ b/docs/source/converting_tensorflow_models.rst
@@ -6,7 +6,7 @@ A command-line interface is provided to convert original Bert/GPT/GPT-2/Transfor
 BERT
 ^^^^
 
-You can convert any TensorFlow checkpoint for BERT (in particular `the pre-trained models released by Google <https://github.com/google-research/bert#pre-trained-models>`_\ ) in a PyTorch save file by using the `convert_tf_checkpoint_to_pytorch.py <https://github.com/huggingface/pytorch-transformers/blob/master/pytorch_transformers/convert_tf_checkpoint_to_pytorch.py>`_ script.
+You can convert any TensorFlow checkpoint for BERT (in particular `the pre-trained models released by Google <https://github.com/google-research/bert#pre-trained-models>`_\ ) in a PyTorch save file by using the `convert_tf_checkpoint_to_pytorch.py <https://github.com/huggingface/transformers/blob/master/transformers/convert_tf_checkpoint_to_pytorch.py>`_ script.
 
 This CLI takes as input a TensorFlow checkpoint (three files starting with ``bert_model.ckpt``\ ) and the associated configuration file (\ ``bert_config.json``\ ), and creates a PyTorch model for this configuration, loads the weights from the TensorFlow checkpoint in the PyTorch model and saves the resulting model in a standard PyTorch save file that can be imported using ``torch.load()`` (see examples in `run_bert_extract_features.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_extract_features.py>`_\ , `run_bert_classifier.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_classifier.py>`_ and `run_bert_squad.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_squad.py>`_\ ).
 
@@ -20,7 +20,7 @@ Here is an example of the conversion process for a pre-trained ``BERT-Base Uncas
 
    export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12
 
-   pytorch_transformers bert \
+   transformers bert \
      $BERT_BASE_DIR/bert_model.ckpt \
      $BERT_BASE_DIR/bert_config.json \
      $BERT_BASE_DIR/pytorch_model.bin
@@ -36,7 +36,7 @@ Here is an example of the conversion process for a pre-trained OpenAI GPT model,
 
    export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights
 
-   pytorch_transformers gpt \
+   transformers gpt \
      $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \
      $PYTORCH_DUMP_OUTPUT \
      [OPENAI_GPT_CONFIG]
@@ -50,7 +50,7 @@ Here is an example of the conversion process for a pre-trained OpenAI GPT-2 mode
 
    export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/gpt2/pretrained/weights
 
-   pytorch_transformers gpt2 \
+   transformers gpt2 \
      $OPENAI_GPT2_CHECKPOINT_PATH \
      $PYTORCH_DUMP_OUTPUT \
      [OPENAI_GPT2_CONFIG]
@@ -64,7 +64,7 @@ Here is an example of the conversion process for a pre-trained Transformer-XL mo
 
    export TRANSFO_XL_CHECKPOINT_FOLDER_PATH=/path/to/transfo/xl/checkpoint
 
-   pytorch_transformers transfo_xl \
+   transformers transfo_xl \
      $TRANSFO_XL_CHECKPOINT_FOLDER_PATH \
      $PYTORCH_DUMP_OUTPUT \
      [TRANSFO_XL_CONFIG]
@@ -80,7 +80,7 @@ Here is an example of the conversion process for a pre-trained XLNet model, fine
    export TRANSFO_XL_CHECKPOINT_PATH=/path/to/xlnet/checkpoint
    export TRANSFO_XL_CONFIG_PATH=/path/to/xlnet/config
 
-   pytorch_transformers xlnet \
+   transformers xlnet \
      $TRANSFO_XL_CHECKPOINT_PATH \
      $TRANSFO_XL_CONFIG_PATH \
      $PYTORCH_DUMP_OUTPUT \
@@ -96,6 +96,6 @@ Here is an example of the conversion process for a pre-trained XLM model:
 
    export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint
 
-   pytorch_transformers xlm \
+   transformers xlm \
      $XLM_CHECKPOINT_PATH \
      $PYTORCH_DUMP_OUTPUT \
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 5b451707d6..a205b0b314 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -1,7 +1,7 @@
-Pytorch-Transformers
+Transformers
 ================================================================================================================================================
 
-PyTorch-Transformers is a library of state-of-the-art pre-trained models for Natural Language Processing (NLP).
+Transformers is a library of state-of-the-art pre-trained models for Natural Language Processing (NLP).
 
 The library currently contains PyTorch implementations, pre-trained model weights, usage scripts and conversion utilities for the following models:
 
@@ -12,7 +12,7 @@ The library currently contains PyTorch implementations, pre-trained model weight
 5. `XLNet <https://github.com/zihangdai/xlnet>`_ (from Google/CMU) released with the paper `​XLNet: Generalized Autoregressive Pretraining for Language Understanding <https://arxiv.org/abs/1906.08237>`_ by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
 6. `XLM <https://github.com/facebookresearch/XLM>`_ (from Facebook) released together with the paper `Cross-lingual Language Model Pretraining <https://arxiv.org/abs/1901.07291>`_ by Guillaume Lample and Alexis Conneau.
 7. `RoBERTa <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`_ (from Facebook), released together with the paper a `Robustly Optimized BERT Pretraining Approach <https://arxiv.org/abs/1907.11692>`_ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
-8. `DistilBERT <https://huggingface.co/pytorch-transformers/model_doc/distilbert.html>`_ (from HuggingFace) released together with the blog post `Smaller, faster, cheaper, lighter: Introducing DistilBERT, a distilled version of BERT <https://medium.com/huggingface/distilbert-8cf3380435b5>`_ by Victor Sanh, Lysandre Debut and Thomas Wolf.
+8. `DistilBERT <https://huggingface.co/transformers/model_doc/distilbert.html>`_ (from HuggingFace) released together with the blog post `Smaller, faster, cheaper, lighter: Introducing DistilBERT, a distilled version of BERT <https://medium.com/huggingface/distilbert-8cf3380435b5>`_ by Victor Sanh, Lysandre Debut and Thomas Wolf.
 
 .. toctree::
     :maxdepth: 2
diff --git a/docs/source/installation.rst b/docs/source/installation.rst
index 6512a0cef3..51f7eb520d 100644
--- a/docs/source/installation.rst
+++ b/docs/source/installation.rst
@@ -1,7 +1,7 @@
 Installation
 ================================================
 
-PyTorch-Transformers is tested on Python 2.7 and 3.5+ (examples are tested only on python 3.5+) and PyTorch 1.1.0
+Transformers is tested on Python 2.7 and 3.5+ (examples are tested only on python 3.5+) and PyTorch 1.1.0
 
 With pip
 ^^^^^^^^
@@ -10,7 +10,7 @@ PyTorch Transformers can be installed using pip as follows:
 
 .. code-block:: bash
 
-   pip install pytorch-transformers
+   pip install transformers
 
 From source
 ^^^^^^^^^^^
@@ -19,15 +19,15 @@ To install from source, clone the repository and install with:
 
 .. code-block:: bash
 
-    git clone https://github.com/huggingface/pytorch-transformers.git
-    cd pytorch-transformers
+    git clone https://github.com/huggingface/transformers.git
+    cd transformers
     pip install [--editable] .
 
 
 Tests
 ^^^^^
 
-An extensive test suite is included to test the library behavior and several examples. Library tests can be found in the `tests folder <https://github.com/huggingface/pytorch-transformers/tree/master/pytorch_transformers/tests>`_ and examples tests in the `examples folder <https://github.com/huggingface/pytorch-transformers/tree/master/examples>`_.
+An extensive test suite is included to test the library behavior and several examples. Library tests can be found in the `tests folder <https://github.com/huggingface/transformers/tree/master/transformers/tests>`_ and examples tests in the `examples folder <https://github.com/huggingface/transformers/tree/master/examples>`_.
 
 Tests can be run using `pytest` (install pytest if needed with `pip install pytest`).
 
@@ -35,7 +35,7 @@ Run all the tests from the root of the cloned repository with the commands:
 
 .. code-block:: bash
 
-    python -m pytest -sv ./pytorch_transformers/tests/
+    python -m pytest -sv ./transformers/tests/
     python -m pytest -sv ./examples/
 
 
diff --git a/docs/source/main_classes/configuration.rst b/docs/source/main_classes/configuration.rst
index 5181874c1a..2131433759 100644
--- a/docs/source/main_classes/configuration.rst
+++ b/docs/source/main_classes/configuration.rst
@@ -6,5 +6,5 @@ The base class ``PretrainedConfig`` implements the common methods for loading/sa
 ``PretrainedConfig``
 ~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_transformers.PretrainedConfig
+.. autoclass:: transformers.PretrainedConfig
     :members:
diff --git a/docs/source/main_classes/model.rst b/docs/source/main_classes/model.rst
index ba61afadf0..d22467f907 100644
--- a/docs/source/main_classes/model.rst
+++ b/docs/source/main_classes/model.rst
@@ -11,5 +11,5 @@ The base class ``PreTrainedModel`` implements the common methods for loading/sav
 ``PreTrainedModel``
 ~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_transformers.PreTrainedModel
+.. autoclass:: transformers.PreTrainedModel
     :members:
diff --git a/docs/source/main_classes/optimizer_schedules.rst b/docs/source/main_classes/optimizer_schedules.rst
index 70fefb7c6d..ff0c9e6929 100644
--- a/docs/source/main_classes/optimizer_schedules.rst
+++ b/docs/source/main_classes/optimizer_schedules.rst
@@ -9,7 +9,7 @@ The ``.optimization`` module provides:
 ``AdamW``
 ~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_transformers.AdamW
+.. autoclass:: transformers.AdamW
     :members:
 
 Schedules
@@ -18,11 +18,11 @@ Schedules
 Learning Rate Schedules
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-.. autoclass:: pytorch_transformers.ConstantLRSchedule
+.. autoclass:: transformers.ConstantLRSchedule
     :members:
 
 
-.. autoclass:: pytorch_transformers.WarmupConstantSchedule
+.. autoclass:: transformers.WarmupConstantSchedule
     :members:
 
 .. image:: /imgs/warmup_constant_schedule.png
@@ -30,7 +30,7 @@ Learning Rate Schedules
     :alt:
 
 
-.. autoclass:: pytorch_transformers.WarmupCosineSchedule
+.. autoclass:: transformers.WarmupCosineSchedule
     :members:
 
 .. image:: /imgs/warmup_cosine_schedule.png
@@ -38,7 +38,7 @@ Learning Rate Schedules
     :alt:
 
 
-.. autoclass:: pytorch_transformers.WarmupCosineWithHardRestartsSchedule
+.. autoclass:: transformers.WarmupCosineWithHardRestartsSchedule
     :members:
 
 .. image:: /imgs/warmup_cosine_hard_restarts_schedule.png
@@ -47,7 +47,7 @@ Learning Rate Schedules
 
 
 
-.. autoclass:: pytorch_transformers.WarmupLinearSchedule
+.. autoclass:: transformers.WarmupLinearSchedule
     :members:
 
 .. image:: /imgs/warmup_linear_schedule.png
diff --git a/docs/source/main_classes/tokenizer.rst b/docs/source/main_classes/tokenizer.rst
index 12ca5522de..c33eb45829 100644
--- a/docs/source/main_classes/tokenizer.rst
+++ b/docs/source/main_classes/tokenizer.rst
@@ -12,5 +12,5 @@ The base class ``PreTrainedTokenizer`` implements the common methods for loading
 ``PreTrainedTokenizer``
 ~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_transformers.PreTrainedTokenizer
+.. autoclass:: transformers.PreTrainedTokenizer
     :members:
diff --git a/docs/source/migration.md b/docs/source/migration.md
index 9cfcaade13..553a79c82b 100644
--- a/docs/source/migration.md
+++ b/docs/source/migration.md
@@ -1,17 +1,17 @@
 # Migrating from pytorch-pretrained-bert
 
 
-Here is a quick summary of what you should take care of when migrating from `pytorch-pretrained-bert` to `pytorch-transformers`
+Here is a quick summary of what you should take care of when migrating from `pytorch-pretrained-bert` to `transformers`
 
 ### Models always output `tuples`
 
-The main breaking change when migrating from `pytorch-pretrained-bert` to `pytorch-transformers` is that the models forward method always outputs a `tuple` with various elements depending on the model and the configuration parameters.
+The main breaking change when migrating from `pytorch-pretrained-bert` to `transformers` is that the models forward method always outputs a `tuple` with various elements depending on the model and the configuration parameters.
 
-The exact content of the tuples for each model are detailled in the models' docstrings and the [documentation](https://huggingface.co/pytorch-transformers/).
+The exact content of the tuples for each model are detailled in the models' docstrings and the [documentation](https://huggingface.co/transformers/).
 
 In pretty much every case, you will be fine by taking the first element of the output as the output you previously used in `pytorch-pretrained-bert`.
 
-Here is a `pytorch-pretrained-bert` to `pytorch-transformers` conversion example for a `BertForSequenceClassification` classification model:
+Here is a `pytorch-pretrained-bert` to `transformers` conversion example for a `BertForSequenceClassification` classification model:
 
 ```python
 # Let's load our model
@@ -20,11 +20,11 @@ model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
 # If you used to have this line in pytorch-pretrained-bert:
 loss = model(input_ids, labels=labels)
 
-# Now just use this line in pytorch-transformers to extract the loss from the output tuple:
+# Now just use this line in transformers to extract the loss from the output tuple:
 outputs = model(input_ids, labels=labels)
 loss = outputs[0]
 
-# In pytorch-transformers you can also have access to the logits:
+# In transformers you can also have access to the logits:
 loss, logits = outputs[:2]
 
 # And even the attention weigths if you configure the model to output them (and other outputs too, see the docstrings and documentation)
@@ -96,7 +96,7 @@ for batch in train_data:
     loss.backward()
     optimizer.step()
 
-### In PyTorch-Transformers, optimizer and schedules are splitted and instantiated like this:
+### In Transformers, optimizer and schedules are splitted and instantiated like this:
 optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False)  # To reproduce BertAdam specific behavior set correct_bias=False
 scheduler = WarmupLinearSchedule(optimizer, warmup_steps=num_warmup_steps, t_total=num_total_steps)  # PyTorch scheduler
 ### and used like this:
diff --git a/docs/source/model_doc/auto.rst b/docs/source/model_doc/auto.rst
index 7b56eabafe..4b900d8e55 100644
--- a/docs/source/model_doc/auto.rst
+++ b/docs/source/model_doc/auto.rst
@@ -11,19 +11,19 @@ Instantiating one of ``AutoModel``, ``AutoConfig`` and ``AutoTokenizer`` will di
 ``AutoConfig``
 ~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_transformers.AutoConfig
+.. autoclass:: transformers.AutoConfig
     :members:
 
 
 ``AutoModel``
 ~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_transformers.AutoModel
+.. autoclass:: transformers.AutoModel
     :members:
 
 
 ``AutoTokenizer``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_transformers.AutoTokenizer
+.. autoclass:: transformers.AutoTokenizer
     :members:
diff --git a/docs/source/model_doc/bert.rst b/docs/source/model_doc/bert.rst
index cbce74e73b..9f2caf3e80 100644
--- a/docs/source/model_doc/bert.rst
+++ b/docs/source/model_doc/bert.rst
@@ -4,69 +4,69 @@ BERT
 ``BertConfig``
 ~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_transformers.BertConfig
+.. autoclass:: transformers.BertConfig
     :members:
 
 
 ``BertTokenizer``
 ~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_transformers.BertTokenizer
+.. autoclass:: transformers.BertTokenizer
     :members:
 
 
 ``BertModel``
 ~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_transformers.BertModel
+.. autoclass:: transformers.BertModel
     :members:
 
 
 ``BertForPreTraining``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_transformers.BertForPreTraining
+.. autoclass:: transformers.BertForPreTraining
     :members:
 
 
 ``BertForMaskedLM``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_transformers.BertForMaskedLM
+.. autoclass:: transformers.BertForMaskedLM
     :members:
 
 
 ``BertForNextSentencePrediction``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_transformers.BertForNextSentencePrediction
+.. autoclass:: transformers.BertForNextSentencePrediction
     :members:
 
 
 ``BertForSequenceClassification``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_transformers.BertForSequenceClassification
+.. autoclass:: transformers.BertForSequenceClassification
     :members:
 
 
 ``BertForMultipleChoice``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_transformers.BertForMultipleChoice
+.. autoclass:: transformers.BertForMultipleChoice
     :members:
 
 
 ``BertForTokenClassification``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_transformers.BertForTokenClassification
+.. autoclass:: transformers.BertForTokenClassification
     :members:
 
 
 ``BertForQuestionAnswering``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_transformers.BertForQuestionAnswering
+.. autoclass:: transformers.BertForQuestionAnswering
     :members:
 
diff --git a/docs/source/model_doc/distilbert.rst b/docs/source/model_doc/distilbert.rst
index 141d3e151f..de1ac73675 100644
--- a/docs/source/model_doc/distilbert.rst
+++ b/docs/source/model_doc/distilbert.rst
@@ -4,40 +4,40 @@ DistilBERT
 ``DistilBertConfig``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_transformers.DistilBertConfig
+.. autoclass:: transformers.DistilBertConfig
     :members:
 
 
 ``DistilBertTokenizer``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_transformers.DistilBertTokenizer
+.. autoclass:: transformers.DistilBertTokenizer
     :members:
 
 
 ``DistilBertModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_transformers.DistilBertModel
+.. autoclass:: transformers.DistilBertModel
     :members:
 
 
 ``DistilBertForMaskedLM``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_transformers.DistilBertForMaskedLM
+.. autoclass:: transformers.DistilBertForMaskedLM
     :members:
 
 
 ``DistilBertForSequenceClassification``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_transformers.DistilBertForSequenceClassification
+.. autoclass:: transformers.DistilBertForSequenceClassification
     :members:
 
 
 ``DistilBertForQuestionAnswering``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_transformers.DistilBertForQuestionAnswering
+.. autoclass:: transformers.DistilBertForQuestionAnswering
     :members:
diff --git a/docs/source/model_doc/gpt.rst b/docs/source/model_doc/gpt.rst
index 26762ae011..39995a98fc 100644
--- a/docs/source/model_doc/gpt.rst
+++ b/docs/source/model_doc/gpt.rst
@@ -4,33 +4,33 @@ OpenAI GPT
 ``OpenAIGPTConfig``
 ~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_transformers.OpenAIGPTConfig
+.. autoclass:: transformers.OpenAIGPTConfig
     :members:
 
 
 ``OpenAIGPTTokenizer``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_transformers.OpenAIGPTTokenizer
+.. autoclass:: transformers.OpenAIGPTTokenizer
     :members:
 
 
 ``OpenAIGPTModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_transformers.OpenAIGPTModel
+.. autoclass:: transformers.OpenAIGPTModel
     :members:
 
 
 ``OpenAIGPTLMHeadModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_transformers.OpenAIGPTLMHeadModel
+.. autoclass:: transformers.OpenAIGPTLMHeadModel
     :members:
 
 
 ``OpenAIGPTDoubleHeadsModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_transformers.OpenAIGPTDoubleHeadsModel
+.. autoclass:: transformers.OpenAIGPTDoubleHeadsModel
     :members:
diff --git a/docs/source/model_doc/gpt2.rst b/docs/source/model_doc/gpt2.rst
index a49d1b4258..92decb14de 100644
--- a/docs/source/model_doc/gpt2.rst
+++ b/docs/source/model_doc/gpt2.rst
@@ -4,33 +4,33 @@ OpenAI GPT2
 ``GPT2Config``
 ~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_transformers.GPT2Config
+.. autoclass:: transformers.GPT2Config
     :members:
 
 
 ``GPT2Tokenizer``
 ~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_transformers.GPT2Tokenizer
+.. autoclass:: transformers.GPT2Tokenizer
     :members:
 
 
 ``GPT2Model``
 ~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_transformers.GPT2Model
+.. autoclass:: transformers.GPT2Model
     :members:
 
 
 ``GPT2LMHeadModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_transformers.GPT2LMHeadModel
+.. autoclass:: transformers.GPT2LMHeadModel
     :members:
 
 
 ``GPT2DoubleHeadsModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_transformers.GPT2DoubleHeadsModel
+.. autoclass:: transformers.GPT2DoubleHeadsModel
     :members:
diff --git a/docs/source/model_doc/roberta.rst b/docs/source/model_doc/roberta.rst
index e2de917e35..5351e018cd 100644
--- a/docs/source/model_doc/roberta.rst
+++ b/docs/source/model_doc/roberta.rst
@@ -4,33 +4,33 @@ RoBERTa
 ``RobertaConfig``
 ~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_transformers.RobertaConfig
+.. autoclass:: transformers.RobertaConfig
     :members:
 
 
 ``RobertaTokenizer``
 ~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_transformers.RobertaTokenizer
+.. autoclass:: transformers.RobertaTokenizer
     :members:
 
 
 ``RobertaModel``
 ~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_transformers.RobertaModel
+.. autoclass:: transformers.RobertaModel
     :members:
 
 
 ``RobertaForMaskedLM``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_transformers.RobertaForMaskedLM
+.. autoclass:: transformers.RobertaForMaskedLM
     :members:
 
 
 ``RobertaForSequenceClassification``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_transformers.RobertaForSequenceClassification
+.. autoclass:: transformers.RobertaForSequenceClassification
     :members:
diff --git a/docs/source/model_doc/transformerxl.rst b/docs/source/model_doc/transformerxl.rst
index 88cca450ee..c8a9cc7d99 100644
--- a/docs/source/model_doc/transformerxl.rst
+++ b/docs/source/model_doc/transformerxl.rst
@@ -5,26 +5,26 @@ Transformer XL
 ``TransfoXLConfig``
 ~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_transformers.TransfoXLConfig
+.. autoclass:: transformers.TransfoXLConfig
     :members:
 
 
 ``TransfoXLTokenizer``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_transformers.TransfoXLTokenizer
+.. autoclass:: transformers.TransfoXLTokenizer
     :members:
 
 
 ``TransfoXLModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_transformers.TransfoXLModel
+.. autoclass:: transformers.TransfoXLModel
     :members:
 
 
 ``TransfoXLLMHeadModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_transformers.TransfoXLLMHeadModel
+.. autoclass:: transformers.TransfoXLLMHeadModel
     :members:
diff --git a/docs/source/model_doc/xlm.rst b/docs/source/model_doc/xlm.rst
index 217952ea5e..344371ad52 100644
--- a/docs/source/model_doc/xlm.rst
+++ b/docs/source/model_doc/xlm.rst
@@ -4,38 +4,38 @@ XLM
 ``XLMConfig``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_transformers.XLMConfig
+.. autoclass:: transformers.XLMConfig
     :members:
 
 ``XLMTokenizer``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_transformers.XLMTokenizer
+.. autoclass:: transformers.XLMTokenizer
     :members:
 
 ``XLMModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_transformers.XLMModel
+.. autoclass:: transformers.XLMModel
     :members:
 
 
 ``XLMWithLMHeadModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_transformers.XLMWithLMHeadModel
+.. autoclass:: transformers.XLMWithLMHeadModel
     :members:
 
 
 ``XLMForSequenceClassification``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_transformers.XLMForSequenceClassification
+.. autoclass:: transformers.XLMForSequenceClassification
     :members:
 
 
 ``XLMForQuestionAnswering``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_transformers.XLMForQuestionAnswering
+.. autoclass:: transformers.XLMForQuestionAnswering
     :members:
diff --git a/docs/source/model_doc/xlnet.rst b/docs/source/model_doc/xlnet.rst
index e388934c56..a471506278 100644
--- a/docs/source/model_doc/xlnet.rst
+++ b/docs/source/model_doc/xlnet.rst
@@ -4,40 +4,40 @@ XLNet
 ``XLNetConfig``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_transformers.XLNetConfig
+.. autoclass:: transformers.XLNetConfig
     :members:
 
 
 ``XLNetTokenizer``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_transformers.XLNetTokenizer
+.. autoclass:: transformers.XLNetTokenizer
     :members:
 
 
 ``XLNetModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_transformers.XLNetModel
+.. autoclass:: transformers.XLNetModel
     :members:
 
 
 ``XLNetLMHeadModel``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_transformers.XLNetLMHeadModel
+.. autoclass:: transformers.XLNetLMHeadModel
     :members:
 
 
 ``XLNetForSequenceClassification``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_transformers.XLNetForSequenceClassification
+.. autoclass:: transformers.XLNetForSequenceClassification
     :members:
 
 
 ``XLNetForQuestionAnswering``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: pytorch_transformers.XLNetForQuestionAnswering
+.. autoclass:: transformers.XLNetForQuestionAnswering
     :members:
diff --git a/docs/source/notebooks.rst b/docs/source/notebooks.rst
index 7e214fa00a..fe669e8e47 100644
--- a/docs/source/notebooks.rst
+++ b/docs/source/notebooks.rst
@@ -1,16 +1,16 @@
 Notebooks
 ================================================
 
-We include `three Jupyter Notebooks <https://github.com/huggingface/pytorch-transformers/tree/master/notebooks>`_ that can be used to check that the predictions of the PyTorch model are identical to the predictions of the original TensorFlow model.
+We include `three Jupyter Notebooks <https://github.com/huggingface/transformers/tree/master/notebooks>`_ that can be used to check that the predictions of the PyTorch model are identical to the predictions of the original TensorFlow model.
 
 
 *
-  The first NoteBook (\ `Comparing-TF-and-PT-models.ipynb <https://github.com/huggingface/pytorch-transformers/blob/master/notebooks/Comparing-TF-and-PT-models.ipynb>`_\ ) extracts the hidden states of a full sequence on each layers of the TensorFlow and the PyTorch models and computes the standard deviation between them. In the given example, we get a standard deviation of 1.5e-7 to 9e-7 on the various hidden state of the models.
+  The first NoteBook (\ `Comparing-TF-and-PT-models.ipynb <https://github.com/huggingface/transformers/blob/master/notebooks/Comparing-TF-and-PT-models.ipynb>`_\ ) extracts the hidden states of a full sequence on each layers of the TensorFlow and the PyTorch models and computes the standard deviation between them. In the given example, we get a standard deviation of 1.5e-7 to 9e-7 on the various hidden state of the models.
 
 *
-  The second NoteBook (\ `Comparing-TF-and-PT-models-SQuAD.ipynb <https://github.com/huggingface/pytorch-transformers/blob/master/notebooks/Comparing-TF-and-PT-models-SQuAD.ipynb>`_\ ) compares the loss computed by the TensorFlow and the PyTorch models for identical initialization of the fine-tuning layer of the ``BertForQuestionAnswering`` and computes the standard deviation between them. In the given example, we get a standard deviation of 2.5e-7 between the models.
+  The second NoteBook (\ `Comparing-TF-and-PT-models-SQuAD.ipynb <https://github.com/huggingface/transformers/blob/master/notebooks/Comparing-TF-and-PT-models-SQuAD.ipynb>`_\ ) compares the loss computed by the TensorFlow and the PyTorch models for identical initialization of the fine-tuning layer of the ``BertForQuestionAnswering`` and computes the standard deviation between them. In the given example, we get a standard deviation of 2.5e-7 between the models.
 
 *
-  The third NoteBook (\ `Comparing-TF-and-PT-models-MLM-NSP.ipynb <https://github.com/huggingface/pytorch-transformers/blob/master/notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb>`_\ ) compares the predictions computed by the TensorFlow and the PyTorch models for masked token language modeling using the pre-trained masked language modeling model.
+  The third NoteBook (\ `Comparing-TF-and-PT-models-MLM-NSP.ipynb <https://github.com/huggingface/transformers/blob/master/notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb>`_\ ) compares the predictions computed by the TensorFlow and the PyTorch models for masked token language modeling using the pre-trained masked language modeling model.
 
 Please follow the instructions given in the notebooks to run and modify them.
diff --git a/docs/source/pretrained_models.rst b/docs/source/pretrained_models.rst
index d6e273797f..0e55767d76 100644
--- a/docs/source/pretrained_models.rst
+++ b/docs/source/pretrained_models.rst
@@ -44,15 +44,15 @@ Here is the full list of the currently provided pretrained models together with
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``bert-large-uncased-whole-word-masking-finetuned-squad``  | | 24-layer, 1024-hidden, 16-heads, 340M parameters.                                                                                   |
 |                   |                                                            | | The ``bert-large-uncased-whole-word-masking`` model fine-tuned on SQuAD                                                             |
-|                   |                                                            | (see details of fine-tuning in the `example section <https://github.com/huggingface/pytorch-transformers/tree/master/examples>`__).   |
+|                   |                                                            | (see details of fine-tuning in the `example section <https://github.com/huggingface/transformers/tree/master/examples>`__).   |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``bert-large-cased-whole-word-masking-finetuned-squad``    | | 24-layer, 1024-hidden, 16-heads, 340M parameters                                                                                    |
 |                   |                                                            | | The ``bert-large-cased-whole-word-masking`` model fine-tuned on SQuAD                                                               |
-|                   |                                                            | (see `details of fine-tuning in the example section <https://huggingface.co/pytorch-transformers/examples.html>`__)                   |
+|                   |                                                            | (see `details of fine-tuning in the example section <https://huggingface.co/transformers/examples.html>`__)                   |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``bert-base-cased-finetuned-mrpc``                         | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
 |                   |                                                            | | The ``bert-base-cased`` model fine-tuned on MRPC                                                                                    |
-|                   |                                                            | (see `details of fine-tuning in the example section <https://huggingface.co/pytorch-transformers/examples.html>`__)                   |
+|                   |                                                            | (see `details of fine-tuning in the example section <https://huggingface.co/transformers/examples.html>`__)                   |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 | GPT               | ``openai-gpt``                                             | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
 |                   |                                                            | | OpenAI GPT English model                                                                                                            |
@@ -120,4 +120,4 @@ Here is the full list of the currently provided pretrained models together with
 |                   |                                                            | (see `details <https://medium.com/huggingface/distilbert-8cf3380435b5>`__)                                                            |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 
-.. <https://huggingface.co/pytorch-transformers/examples.html>`__
\ No newline at end of file
+.. <https://huggingface.co/transformers/examples.html>`__
\ No newline at end of file
diff --git a/docs/source/quickstart.md b/docs/source/quickstart.md
index f037a95a3a..c7ad5c7930 100644
--- a/docs/source/quickstart.md
+++ b/docs/source/quickstart.md
@@ -2,7 +2,7 @@
 
 ## Philosophy
 
-PyTorch-Transformers is an opinionated library built for NLP researchers seeking to use/study/extend large-scale transformers models.
+Transformers is an opinionated library built for NLP researchers seeking to use/study/extend large-scale transformers models.
 
 The library was designed with two strong goals in mind:
 
@@ -39,7 +39,7 @@ The library is build around three type of classes for each models:
 
 All these classes can be instantiated from pretrained instances and saved locally using two methods:
 
-- `from_pretrained()` let you instantiate a model/configuration/tokenizer from a pretrained version either provided by the library itself (currently 27 models are provided as listed [here](https://huggingface.co/pytorch-transformers/pretrained_models.html)) or stored locally (or on a server) by the user,
+- `from_pretrained()` let you instantiate a model/configuration/tokenizer from a pretrained version either provided by the library itself (currently 27 models are provided as listed [here](https://huggingface.co/transformers/pretrained_models.html)) or stored locally (or on a server) by the user,
 - `save_pretrained()` let you save a model/configuration/tokenizer locally so that it can be reloaded using `from_pretrained()`.
 
 We'll finish this quickstart tour by going through a few simple quick-start examples to see how we can instantiate and use these classes. The rest of the documentation is organized in two parts:
@@ -59,7 +59,7 @@ Let's start by preparing a tokenized input (a list of token embeddings indices t
 
 ```python
 import torch
-from pytorch_transformers import BertTokenizer, BertModel, BertForMaskedLM
+from transformers import BertTokenizer, BertModel, BertForMaskedLM
 
 # OPTIONAL: if you want to have more information on what's happening under the hood, activate the logger as follows
 import logging
@@ -106,7 +106,7 @@ model.to('cuda')
 with torch.no_grad():
     # See the models docstrings for the detail of the inputs
     outputs = model(tokens_tensor, token_type_ids=segments_tensors)
-    # PyTorch-Transformers models always output tuples.
+    # Transformers models always output tuples.
     # See the models docstrings for the detail of all the outputs
     # In our case, the first element is the hidden state of the last layer of the Bert model
     encoded_layers = outputs[0]
@@ -145,7 +145,7 @@ First let's prepare a tokenized input from our text string using `GPT2Tokenizer`
 
 ```python
 import torch
-from pytorch_transformers import GPT2Tokenizer, GPT2LMHeadModel
+from transformers import GPT2Tokenizer, GPT2LMHeadModel
 
 # OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
 import logging
diff --git a/docs/source/serialization.rst b/docs/source/serialization.rst
index 7117d7ffa6..0b0b600ec1 100644
--- a/docs/source/serialization.rst
+++ b/docs/source/serialization.rst
@@ -45,7 +45,7 @@ where
     * ``bert_config.json`` or ``openai_gpt_config.json`` a configuration file for the model, and
     * ``pytorch_model.bin`` a PyTorch dump of a pre-trained instance of ``BertForPreTraining``\ , ``OpenAIGPTModel``\ , ``TransfoXLModel``\ , ``GPT2LMHeadModel`` (saved with the usual ``torch.save()``\ )
 
-  If ``PRE_TRAINED_MODEL_NAME_OR_PATH`` is a shortcut name, the pre-trained weights will be downloaded from AWS S3 (see the links `here <https://github.com/huggingface/pytorch-transformers/blob/master/pytorch_transformers/modeling_bert.py>`__\ ) and stored in a cache folder to avoid future download (the cache folder can be found at ``~/.pytorch_pretrained_bert/``\ ).
+  If ``PRE_TRAINED_MODEL_NAME_OR_PATH`` is a shortcut name, the pre-trained weights will be downloaded from AWS S3 (see the links `here <https://github.com/huggingface/transformers/blob/master/transformers/modeling_bert.py>`__\ ) and stored in a cache folder to avoid future download (the cache folder can be found at ``~/.pytorch_pretrained_bert/``\ ).
 
 *
   ``cache_dir`` can be an optional path to a specific directory to download and cache the pre-trained model weights. This option is useful in particular when you are using distributed training: to avoid concurrent access to the same weights you can set for example ``cache_dir='./pretrained_model_{}'.format(args.local_rank)`` (see the section on distributed training for more information).
@@ -122,7 +122,7 @@ Here is the recommended way of saving the model, configuration and vocabulary to
 
 .. code-block:: python
 
-   from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
+   from transformers import WEIGHTS_NAME, CONFIG_NAME
 
    output_dir = "./models/"
 
diff --git a/docs/source/torchscript.rst b/docs/source/torchscript.rst
index 5811572c07..fd1eeb5363 100644
--- a/docs/source/torchscript.rst
+++ b/docs/source/torchscript.rst
@@ -12,7 +12,7 @@ According to Pytorch's documentation: "TorchScript is a way to create serializab
 Pytorch's two modules `JIT and TRACE <https://pytorch.org/docs/stable/jit.html>`_ allow the developer to export
 their model to be re-used in other programs, such as efficiency-oriented C++ programs.
 
-We have provided an interface that allows the export of `pytorch-transformers` models to TorchScript so that they can
+We have provided an interface that allows the export of `transformers` models to TorchScript so that they can
 be reused in a different environment than a Pytorch-based python program. Here we explain how to use our models so that
 they can be exported, and what to be mindful of when using these models with TorchScript.
 
@@ -74,7 +74,7 @@ according to a ``BertConfig`` class and then saved to disk under the filename ``
 
 .. code-block:: python
 
-    from pytorch_transformers import BertModel, BertTokenizer, BertConfig
+    from transformers import BertModel, BertTokenizer, BertConfig
     import torch
 
     enc = BertTokenizer.from_pretrained("bert-base-uncased")
diff --git a/examples/README.md b/examples/README.md
index 3253e5481c..85c4c568f6 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -13,7 +13,7 @@ similar API between the different models.
 
 ## Language model fine-tuning
 
-Based on the script [`run_lm_finetuning.py`](https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_lm_finetuning.py).
+Based on the script [`run_lm_finetuning.py`](https://github.com/huggingface/transformers/blob/master/examples/run_lm_finetuning.py).
 
 Fine-tuning the library models for language modeling on a text dataset for GPT, GPT-2, BERT and RoBERTa (DistilBERT 
 to be added soon). GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa 
@@ -75,7 +75,7 @@ python run_lm_finetuning.py \
 
 ## Language generation
 
-Based on the script [`run_generation.py`](https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_generation.py).
+Based on the script [`run_generation.py`](https://github.com/huggingface/transformers/blob/master/examples/run_generation.py).
 
 Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, Transformer-XL and XLNet.
 A similar script is used for our official demo [Write With Transfomer](https://transformer.huggingface.co), where you
@@ -91,7 +91,7 @@ python run_generation.py \
 
 ## GLUE
 
-Based on the script [`run_glue.py`](https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_glue.py).
+Based on the script [`run_glue.py`](https://github.com/huggingface/transformers/blob/master/examples/run_glue.py).
 
 Fine-tuning the library models for sequence classification on the GLUE benchmark: [General Language Understanding 
 Evaluation](https://gluebenchmark.com/). This script can fine-tune the following models: BERT, XLM, XLNet and RoBERTa. 
@@ -319,7 +319,7 @@ eval_loss = 0.44457291918821606
 
 ## SQuAD
 
-Based on the script [`run_squad.py`](https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_squad.py).
+Based on the script [`run_squad.py`](https://github.com/huggingface/transformers/blob/master/examples/run_squad.py).
 
 #### Fine-tuning on SQuAD
 
diff --git a/examples/contrib/run_openai_gpt.py b/examples/contrib/run_openai_gpt.py
index 1c9fba8ee8..661c1c305b 100644
--- a/examples/contrib/run_openai_gpt.py
+++ b/examples/contrib/run_openai_gpt.py
@@ -39,7 +39,7 @@ import torch
 from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                               TensorDataset)
 
-from pytorch_transformers import (OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer,
+from transformers import (OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer,
                                      AdamW, cached_path, WEIGHTS_NAME, CONFIG_NAME,
                                      WarmupLinearSchedule)
 
diff --git a/examples/contrib/run_swag.py b/examples/contrib/run_swag.py
index 495f40cec9..58aec25877 100644
--- a/examples/contrib/run_swag.py
+++ b/examples/contrib/run_swag.py
@@ -35,10 +35,10 @@ from tqdm import tqdm, trange
 
 from tensorboardX import SummaryWriter
 
-from pytorch_transformers import (WEIGHTS_NAME, BertConfig,
+from transformers import (WEIGHTS_NAME, BertConfig,
                                   BertForMultipleChoice, BertTokenizer)
 
-from pytorch_transformers import AdamW, WarmupLinearSchedule
+from transformers import AdamW, WarmupLinearSchedule
 
 logger = logging.getLogger(__name__)
 
@@ -365,7 +365,7 @@ def train(args, train_dataset, model, tokenizer):
             #     inputs.update({'cls_index': batch[5],
             #                    'p_mask':       batch[6]})
             outputs = model(**inputs)
-            loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)
+            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
 
             if args.n_gpu > 1:
                 loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training
@@ -647,7 +647,7 @@ def main():
 
         if args.eval_all_checkpoints:
             checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
-            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce model loading logs
+            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce model loading logs
 
         logger.info("Evaluate the following checkpoints: %s", checkpoints)
 
diff --git a/examples/contrib/run_transfo_xl.py b/examples/contrib/run_transfo_xl.py
index 4c99777b98..f5375269b8 100644
--- a/examples/contrib/run_transfo_xl.py
+++ b/examples/contrib/run_transfo_xl.py
@@ -28,7 +28,7 @@ import math
 
 import torch
 
-from pytorch_transformers import TransfoXLLMHeadModel, TransfoXLCorpus, TransfoXLTokenizer
+from transformers import TransfoXLLMHeadModel, TransfoXLCorpus, TransfoXLTokenizer
 
 logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                     datefmt = '%m/%d/%Y %H:%M:%S',
diff --git a/examples/distillation/README.md b/examples/distillation/README.md
index 73e0cc0655..ad4f8989b7 100644
--- a/examples/distillation/README.md
+++ b/examples/distillation/README.md
@@ -13,11 +13,11 @@ For more information on DistilBERT, please refer to our [detailed blog post](htt
 
 This part of the library has only be tested with Python3.6+. There are few specific dependencies to install before launching a distillation, you can install them with the command `pip install -r requirements.txt`. 
 
-**Important note:** The training scripts have been updated to support PyTorch v1.2.0 (there are breakings changes compared to v1.1.0). It is important to note that there is a small internal bug in the current version of PyTorch available on pip that causes a memory leak in our training/distillation. It has been recently fixed and will likely be integrated into the next release. For the moment, we recommend to [compile PyTorch from source](https://github.com/pytorch/pytorch#from-source). Please refer to [issue 1179](https://github.com/huggingface/pytorch-transformers/issues/1179) for more details.
+**Important note:** The training scripts have been updated to support PyTorch v1.2.0 (there are breakings changes compared to v1.1.0). It is important to note that there is a small internal bug in the current version of PyTorch available on pip that causes a memory leak in our training/distillation. It has been recently fixed and will likely be integrated into the next release. For the moment, we recommend to [compile PyTorch from source](https://github.com/pytorch/pytorch#from-source). Please refer to [issue 1179](https://github.com/huggingface/transformers/issues/1179) for more details.
 
 ## How to use DistilBERT
 
-PyTorch-Transformers includes two pre-trained DistilBERT models, currently only provided for English (we are investigating the possibility to train and release a multilingual version of DistilBERT):
+Transformers includes two pre-trained DistilBERT models, currently only provided for English (we are investigating the possibility to train and release a multilingual version of DistilBERT):
 
 - `distilbert-base-uncased`: DistilBERT English language model pretrained on the same data used to pretrain Bert (concatenation of the Toronto Book Corpus and full English Wikipedia) using distillation with the supervision of the `bert-base-uncased` version of Bert. The model has 6 layers, 768 dimension and 12 heads, totalizing 66M parameters.
 - `distilbert-base-uncased-distilled-squad`: A finetuned version of `distilbert-base-uncased` finetuned using (a second step of) knwoledge distillation on SQuAD 1.0. This model reaches a F1 score of 86.2 on the dev set (for comparison, Bert `bert-base-uncased` version reaches a 88.5 F1 score).
diff --git a/examples/distillation/distiller.py b/examples/distillation/distiller.py
index 93135e292c..1bfda325dc 100644
--- a/examples/distillation/distiller.py
+++ b/examples/distillation/distiller.py
@@ -26,7 +26,7 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 
-from pytorch_transformers import AdamW, WarmupLinearSchedule
+from transformers import AdamW, WarmupLinearSchedule
 
 from utils import logger
 from dataset import Dataset
diff --git a/examples/distillation/scripts/binarized_data.py b/examples/distillation/scripts/binarized_data.py
index 51be8fd0be..e98662fdc8 100644
--- a/examples/distillation/scripts/binarized_data.py
+++ b/examples/distillation/scripts/binarized_data.py
@@ -20,7 +20,7 @@ import pickle
 import random
 import time
 import numpy as np
-from pytorch_transformers import BertTokenizer
+from transformers import BertTokenizer
 import logging
 
 logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
diff --git a/examples/distillation/scripts/extract_for_distil.py b/examples/distillation/scripts/extract_for_distil.py
index f3eee024ec..1b9e20c382 100644
--- a/examples/distillation/scripts/extract_for_distil.py
+++ b/examples/distillation/scripts/extract_for_distil.py
@@ -15,7 +15,7 @@
 """
 Preprocessing script before training DistilBERT.
 """
-from pytorch_transformers import BertForPreTraining
+from transformers import BertForPreTraining
 import torch
 import argparse
 
diff --git a/examples/distillation/train.py b/examples/distillation/train.py
index 712f10b47d..6060cd9eb6 100644
--- a/examples/distillation/train.py
+++ b/examples/distillation/train.py
@@ -23,8 +23,8 @@ import shutil
 import numpy as np
 import torch
 
-from pytorch_transformers import BertTokenizer, BertForMaskedLM
-from pytorch_transformers import DistilBertForMaskedLM, DistilBertConfig
+from transformers import BertTokenizer, BertForMaskedLM
+from transformers import DistilBertForMaskedLM, DistilBertConfig
 
 from distiller import Distiller
 from utils import git_log, logger, init_gpu_params, set_seed
diff --git a/examples/run_bertology.py b/examples/run_bertology.py
index f11b73b54f..f37358359d 100644
--- a/examples/run_bertology.py
+++ b/examples/run_bertology.py
@@ -32,7 +32,7 @@ from torch.utils.data import DataLoader, SequentialSampler, TensorDataset, Subse
 from torch.utils.data.distributed import DistributedSampler
 from torch.nn import CrossEntropyLoss, MSELoss
 
-from pytorch_transformers import (WEIGHTS_NAME,
+from transformers import (WEIGHTS_NAME,
                                   BertConfig, BertForSequenceClassification, BertTokenizer,
                                   XLMConfig, XLMForSequenceClassification, XLMTokenizer,
                                   XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer)
diff --git a/examples/run_generation.py b/examples/run_generation.py
index a2a8f29103..9e98a9e870 100644
--- a/examples/run_generation.py
+++ b/examples/run_generation.py
@@ -26,12 +26,12 @@ import torch
 import torch.nn.functional as F
 import numpy as np
 
-from pytorch_transformers import GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig
+from transformers import GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig
 
-from pytorch_transformers import GPT2LMHeadModel, GPT2Tokenizer
-from pytorch_transformers import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer
-from pytorch_transformers import XLNetLMHeadModel, XLNetTokenizer
-from pytorch_transformers import TransfoXLLMHeadModel, TransfoXLTokenizer
+from transformers import GPT2LMHeadModel, GPT2Tokenizer
+from transformers import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer
+from transformers import XLNetLMHeadModel, XLNetTokenizer
+from transformers import TransfoXLLMHeadModel, TransfoXLTokenizer
 
 
 logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
diff --git a/examples/run_glue.py b/examples/run_glue.py
index 99821f454d..71dad0edbf 100644
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -31,7 +31,7 @@ from torch.utils.data.distributed import DistributedSampler
 from tensorboardX import SummaryWriter
 from tqdm import tqdm, trange
 
-from pytorch_transformers import (WEIGHTS_NAME, BertConfig,
+from transformers import (WEIGHTS_NAME, BertConfig,
                                   BertForSequenceClassification, BertTokenizer,
                                   RobertaConfig,
                                   RobertaForSequenceClassification,
@@ -44,12 +44,12 @@ from pytorch_transformers import (WEIGHTS_NAME, BertConfig,
                                   DistilBertForSequenceClassification,
                                   DistilBertTokenizer)
 
-from pytorch_transformers import AdamW, WarmupLinearSchedule
+from transformers import AdamW, WarmupLinearSchedule
 
-from pytorch_transformers import glue_compute_metrics as compute_metrics
-from pytorch_transformers import glue_output_modes as output_modes
-from pytorch_transformers import glue_processors as processors
-from pytorch_transformers import glue_convert_examples_to_features as convert_examples_to_features
+from transformers import glue_compute_metrics as compute_metrics
+from transformers import glue_output_modes as output_modes
+from transformers import glue_processors as processors
+from transformers import glue_convert_examples_to_features as convert_examples_to_features
 
 logger = logging.getLogger(__name__)
 
@@ -137,7 +137,7 @@ def train(args, train_dataset, model, tokenizer):
                       'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM, DistilBERT and RoBERTa don't use segment_ids
                       'labels':         batch[3]}
             outputs = model(**inputs)
-            loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)
+            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
 
             if args.n_gpu > 1:
                 loss = loss.mean() # mean() to average on multi-gpu parallel training
@@ -483,7 +483,7 @@ def main():
         checkpoints = [args.output_dir]
         if args.eval_all_checkpoints:
             checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
-            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
+            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
         logger.info("Evaluate the following checkpoints: %s", checkpoints)
         for checkpoint in checkpoints:
             global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
diff --git a/examples/run_lm_finetuning.py b/examples/run_lm_finetuning.py
index 556cce6753..7ccf4c3cb7 100644
--- a/examples/run_lm_finetuning.py
+++ b/examples/run_lm_finetuning.py
@@ -35,7 +35,7 @@ from torch.utils.data.distributed import DistributedSampler
 from tensorboardX import SummaryWriter
 from tqdm import tqdm, trange
 
-from pytorch_transformers import (WEIGHTS_NAME, AdamW, WarmupLinearSchedule,
+from transformers import (WEIGHTS_NAME, AdamW, WarmupLinearSchedule,
                                   BertConfig, BertForMaskedLM, BertTokenizer,
                                   GPT2Config, GPT2LMHeadModel, GPT2Tokenizer,
                                   OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer,
@@ -188,7 +188,7 @@ def train(args, train_dataset, model, tokenizer):
             labels = labels.to(args.device)
             model.train()
             outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels)
-            loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)
+            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
 
             if args.n_gpu > 1:
                 loss = loss.mean()  # mean() to average on multi-gpu parallel training
@@ -481,7 +481,7 @@ def main():
         checkpoints = [args.output_dir]
         if args.eval_all_checkpoints:
             checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
-            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
+            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
         logger.info("Evaluate the following checkpoints: %s", checkpoints)
         for checkpoint in checkpoints:
             global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
diff --git a/examples/run_multiple_choice.py b/examples/run_multiple_choice.py
index 05f9a48f50..54f3a8a904 100644
--- a/examples/run_multiple_choice.py
+++ b/examples/run_multiple_choice.py
@@ -32,13 +32,13 @@ from torch.utils.data.distributed import DistributedSampler
 from tensorboardX import SummaryWriter
 from tqdm import tqdm, trange
 
-from pytorch_transformers import (WEIGHTS_NAME, BertConfig,
+from transformers import (WEIGHTS_NAME, BertConfig,
                                   BertForMultipleChoice, BertTokenizer,
                                   XLNetConfig, XLNetForMultipleChoice,
                                   XLNetTokenizer, RobertaConfig,
                                   RobertaForMultipleChoice, RobertaTokenizer)
 
-from pytorch_transformers import AdamW, WarmupLinearSchedule
+from transformers import AdamW, WarmupLinearSchedule
 
 from utils_multiple_choice import (convert_examples_to_features, processors)
 
@@ -141,7 +141,7 @@ def train(args, train_dataset, model, tokenizer):
                       'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM don't use segment_ids
                       'labels':         batch[3]}
             outputs = model(**inputs)
-            loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)
+            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
 
             if args.n_gpu > 1:
                 loss = loss.mean() # mean() to average on multi-gpu parallel training
@@ -508,7 +508,7 @@ def main():
         checkpoints = [args.output_dir]
         if args.eval_all_checkpoints:
             checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
-            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
+            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
         logger.info("Evaluate the following checkpoints: %s", checkpoints)
         for checkpoint in checkpoints:
             global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
@@ -524,7 +524,7 @@ def main():
         checkpoints = [args.output_dir]
         # if args.eval_all_checkpoints: # can not use this to do test!!
         #     checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
-        #     logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
+        #     logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
         logger.info("Evaluate the following checkpoints: %s", checkpoints)
         for checkpoint in checkpoints:
             global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
diff --git a/examples/run_squad.py b/examples/run_squad.py
index affef90ca9..0c0fbf2963 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -32,7 +32,7 @@ from tqdm import tqdm, trange
 
 from tensorboardX import SummaryWriter
 
-from pytorch_transformers import (WEIGHTS_NAME, BertConfig,
+from transformers import (WEIGHTS_NAME, BertConfig,
                                   BertForQuestionAnswering, BertTokenizer,
                                   XLMConfig, XLMForQuestionAnswering,
                                   XLMTokenizer, XLNetConfig,
@@ -40,7 +40,7 @@ from pytorch_transformers import (WEIGHTS_NAME, BertConfig,
                                   XLNetTokenizer,
                                   DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)
 
-from pytorch_transformers import AdamW, WarmupLinearSchedule
+from transformers import AdamW, WarmupLinearSchedule
 
 from utils_squad import (read_squad_examples, convert_examples_to_features,
                          RawResult, write_predictions,
@@ -142,7 +142,7 @@ def train(args, train_dataset, model, tokenizer):
                 inputs.update({'cls_index': batch[5],
                                'p_mask':       batch[6]})
             outputs = model(**inputs)
-            loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)
+            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
 
             if args.n_gpu > 1:
                 loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training
@@ -510,7 +510,7 @@ def main():
         checkpoints = [args.output_dir]
         if args.eval_all_checkpoints:
             checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
-            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce model loading logs
+            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce model loading logs
 
         logger.info("Evaluate the following checkpoints: %s", checkpoints)
 
diff --git a/examples/run_tf_glue.py b/examples/run_tf_glue.py
index 1301d13e08..3a867f80a8 100644
--- a/examples/run_tf_glue.py
+++ b/examples/run_tf_glue.py
@@ -1,6 +1,6 @@
 import tensorflow as tf
 import tensorflow_datasets
-from pytorch_transformers import *
+from transformers import *
 
 # Load dataset, tokenizer, model from pretrained model/vocabulary
 tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
diff --git a/examples/utils_squad.py b/examples/utils_squad.py
index 34a0c9cc02..b990ecc842 100644
--- a/examples/utils_squad.py
+++ b/examples/utils_squad.py
@@ -24,7 +24,7 @@ import math
 import collections
 from io import open
 
-from pytorch_transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize
+from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize
 
 # Required by XLNet evaluation method to compute optimal threshold (see write_predictions_extended() method)
 from utils_squad_evaluate import find_all_best_thresh_v2, make_qid_to_has_ans, get_raw_scores
diff --git a/hubconf.py b/hubconf.py
index d9aaa6b53a..3fa354ed5a 100644
--- a/hubconf.py
+++ b/hubconf.py
@@ -1,7 +1,7 @@
-from pytorch_transformers import (
+from transformers import (
     AutoTokenizer, AutoConfig, AutoModel, AutoModelWithLMHead, AutoModelForSequenceClassification, AutoModelForQuestionAnswering
 )
-from pytorch_transformers.file_utils import add_start_docstrings
+from transformers.file_utils import add_start_docstrings
 
 dependencies = ['torch', 'tqdm', 'boto3', 'requests', 'regex', 'sentencepiece', 'sacremoses']
 
@@ -11,12 +11,12 @@ def config(*args, **kwargs):
                 # Using torch.hub !
                 import torch
 
-                config = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased')  # Download configuration from S3 and cache.
-                config = torch.hub.load('huggingface/pytorch-transformers', 'config', './test/bert_saved_model/')  # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
-                config = torch.hub.load('huggingface/pytorch-transformers', 'config', './test/bert_saved_model/my_configuration.json')
-                config = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased', output_attention=True, foo=False)
+                config = torch.hub.load('huggingface/transformers', 'config', 'bert-base-uncased')  # Download configuration from S3 and cache.
+                config = torch.hub.load('huggingface/transformers', 'config', './test/bert_saved_model/')  # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
+                config = torch.hub.load('huggingface/transformers', 'config', './test/bert_saved_model/my_configuration.json')
+                config = torch.hub.load('huggingface/transformers', 'config', 'bert-base-uncased', output_attention=True, foo=False)
                 assert config.output_attention == True
-                config, unused_kwargs = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased', output_attention=True, foo=False, return_unused_kwargs=True)
+                config, unused_kwargs = torch.hub.load('huggingface/transformers', 'config', 'bert-base-uncased', output_attention=True, foo=False, return_unused_kwargs=True)
                 assert config.output_attention == True
                 assert unused_kwargs == {'foo': False}
 
@@ -31,8 +31,8 @@ def tokenizer(*args, **kwargs):
         # Using torch.hub !
         import torch
 
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'bert-base-uncased')    # Download vocabulary from S3 and cache.
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', './test/bert_saved_model/')  # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`
+        tokenizer = torch.hub.load('huggingface/transformers', 'tokenizer', 'bert-base-uncased')    # Download vocabulary from S3 and cache.
+        tokenizer = torch.hub.load('huggingface/transformers', 'tokenizer', './test/bert_saved_model/')  # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`
 
     """
 
@@ -45,13 +45,13 @@ def model(*args, **kwargs):
             # Using torch.hub !
             import torch
 
-            model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
-            model = torch.hub.load('huggingface/pytorch-transformers', 'model', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-            model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-uncased', output_attention=True)  # Update configuration during loading
+            model = torch.hub.load('huggingface/transformers', 'model', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
+            model = torch.hub.load('huggingface/transformers', 'model', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+            model = torch.hub.load('huggingface/transformers', 'model', 'bert-base-uncased', output_attention=True)  # Update configuration during loading
             assert model.config.output_attention == True
             # Loading from a TF checkpoint file instead of a PyTorch model (slower)
             config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
-            model = torch.hub.load('huggingface/pytorch-transformers', 'model', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+            model = torch.hub.load('huggingface/transformers', 'model', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
 
         """
 
@@ -63,13 +63,13 @@ def modelWithLMHead(*args, **kwargs):
         # Using torch.hub !
         import torch
 
-        model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
-        model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-        model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', 'bert-base-uncased', output_attention=True)  # Update configuration during loading
+        model = torch.hub.load('huggingface/transformers', 'modelWithLMHead', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
+        model = torch.hub.load('huggingface/transformers', 'modelWithLMHead', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+        model = torch.hub.load('huggingface/transformers', 'modelWithLMHead', 'bert-base-uncased', output_attention=True)  # Update configuration during loading
         assert model.config.output_attention == True
         # Loading from a TF checkpoint file instead of a PyTorch model (slower)
         config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
-        model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+        model = torch.hub.load('huggingface/transformers', 'modelWithLMHead', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
 
     """
     return AutoModelWithLMHead.from_pretrained(*args, **kwargs)
@@ -81,13 +81,13 @@ def modelForSequenceClassification(*args, **kwargs):
             # Using torch.hub !
             import torch
 
-            model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
-            model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-            model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', 'bert-base-uncased', output_attention=True)  # Update configuration during loading
+            model = torch.hub.load('huggingface/transformers', 'modelForSequenceClassification', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
+            model = torch.hub.load('huggingface/transformers', 'modelForSequenceClassification', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+            model = torch.hub.load('huggingface/transformers', 'modelForSequenceClassification', 'bert-base-uncased', output_attention=True)  # Update configuration during loading
             assert model.config.output_attention == True
             # Loading from a TF checkpoint file instead of a PyTorch model (slower)
             config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
-            model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+            model = torch.hub.load('huggingface/transformers', 'modelForSequenceClassification', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
 
         """
 
@@ -100,13 +100,13 @@ def modelForQuestionAnswering(*args, **kwargs):
         # Using torch.hub !
         import torch
 
-        model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
-        model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-        model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', 'bert-base-uncased', output_attention=True)  # Update configuration during loading
+        model = torch.hub.load('huggingface/transformers', 'modelForQuestionAnswering', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
+        model = torch.hub.load('huggingface/transformers', 'modelForQuestionAnswering', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+        model = torch.hub.load('huggingface/transformers', 'modelForQuestionAnswering', 'bert-base-uncased', output_attention=True)  # Update configuration during loading
         assert model.config.output_attention == True
         # Loading from a TF checkpoint file instead of a PyTorch model (slower)
         config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
-        model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+        model = torch.hub.load('huggingface/transformers', 'modelForQuestionAnswering', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
 
     """
     return AutoModelForQuestionAnswering.from_pretrained(*args, **kwargs)
diff --git a/setup.py b/setup.py
index 903f1d8cac..34cb89560e 100644
--- a/setup.py
+++ b/setup.py
@@ -25,7 +25,7 @@ To create the package for pypi.
    (pypi suggest using twine as other methods upload files via plaintext.)
 
    Check that you can install it in a virtualenv by running:
-   pip install -i https://testpypi.python.org/pypi pytorch-transformers
+   pip install -i https://testpypi.python.org/pypi transformers
 
 6. Upload the final version to actual pypi:
    twine upload dist/* -r pypi
@@ -37,8 +37,8 @@ from io import open
 from setuptools import find_packages, setup
 
 setup(
-    name="pytorch_transformers",
-    version="1.2.0",
+    name="transformers",
+    version="2.0.0",
     author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Google AI Language Team Authors, Open AI team Authors",
     author_email="thomas@huggingface.co",
     description="Repository of pre-trained NLP Transformer models: BERT & RoBERTa, GPT & GPT-2, Transformer-XL, XLNet and XLM",
@@ -46,7 +46,7 @@ setup(
     long_description_content_type="text/markdown",
     keywords='NLP deep learning transformer pytorch BERT GPT GPT-2 google openai CMU',
     license='Apache',
-    url="https://github.com/huggingface/pytorch-transformers",
+    url="https://github.com/huggingface/transformers",
     packages=find_packages(exclude=["*.tests", "*.tests.*",
                                     "tests.*", "tests"]),
     install_requires=['numpy',
@@ -58,7 +58,7 @@ setup(
                       'sacremoses'],
     entry_points={
       'console_scripts': [
-        "pytorch_transformers=pytorch_transformers.__main__:main",
+        "transformers=transformers.__main__:main",
       ]
     },
     # python_requires='>=3.5.0',
diff --git a/pytorch_transformers/__init__.py b/transformers/__init__.py
similarity index 98%
rename from pytorch_transformers/__init__.py
rename to transformers/__init__.py
index e355add4ad..39370cb327 100644
--- a/pytorch_transformers/__init__.py
+++ b/transformers/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "1.2.0"
+__version__ = "2.0.0"
 
 # Work around to update TensorFlow's absl.logging threshold which alters the
 # default Python logging output behavior when present.
@@ -17,7 +17,7 @@ import logging
 logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
 
 # Files and general utilities
-from .file_utils import (PYTORCH_TRANSFORMERS_CACHE, PYTORCH_PRETRAINED_BERT_CACHE,
+from .file_utils import (TRANSFORMERS_CACHE, PYTORCH_TRANSFORMERS_CACHE, PYTORCH_PRETRAINED_BERT_CACHE,
                          cached_path, add_start_docstrings, add_end_docstrings,
                          WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, CONFIG_NAME,
                          is_tf_available, is_torch_available)
diff --git a/pytorch_transformers/__main__.py b/transformers/__main__.py
similarity index 74%
rename from pytorch_transformers/__main__.py
rename to transformers/__main__.py
index ca4936fedf..31dbd24908 100644
--- a/pytorch_transformers/__main__.py
+++ b/transformers/__main__.py
@@ -5,25 +5,25 @@ def main():
         print(
         "This command line utility let you convert original (author released) model checkpoint to pytorch.\n"
         "It should be used as one of: \n"
-        ">> pytorch_transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT, \n"
-        ">> pytorch_transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG], \n"
-        ">> pytorch_transformers transfo_xl TF_CHECKPOINT_OR_DATASET PYTORCH_DUMP_OUTPUT [TF_CONFIG] or \n"
-        ">> pytorch_transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [GPT2_CONFIG] or \n"
-        ">> pytorch_transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME] or \n"
-        ">> pytorch_transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT")
+        ">> transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT, \n"
+        ">> transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG], \n"
+        ">> transformers transfo_xl TF_CHECKPOINT_OR_DATASET PYTORCH_DUMP_OUTPUT [TF_CONFIG] or \n"
+        ">> transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [GPT2_CONFIG] or \n"
+        ">> transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME] or \n"
+        ">> transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT")
     else:
         if sys.argv[1] == "bert":
             try:
                 from .convert_bert_original_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch
             except ImportError:
-                print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
+                print("transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
                     "In that case, it requires TensorFlow to be installed. Please see "
                     "https://www.tensorflow.org/install/ for installation instructions.")
                 raise
 
             if len(sys.argv) != 5:
                 # pylint: disable=line-too-long
-                print("Should be used as `pytorch_transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`")
+                print("Should be used as `transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`")
             else:
                 PYTORCH_DUMP_OUTPUT = sys.argv.pop()
                 TF_CONFIG = sys.argv.pop()
@@ -33,7 +33,7 @@ def main():
             from .convert_openai_original_tf_checkpoint_to_pytorch import convert_openai_checkpoint_to_pytorch
             if len(sys.argv) < 4 or len(sys.argv) > 5:
                 # pylint: disable=line-too-long
-                print("Should be used as `pytorch_transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`")
+                print("Should be used as `transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`")
             else:
                 OPENAI_GPT_CHECKPOINT_FOLDER_PATH = sys.argv[2]
                 PYTORCH_DUMP_OUTPUT = sys.argv[3]
@@ -48,13 +48,13 @@ def main():
             try:
                 from .convert_transfo_xl_original_tf_checkpoint_to_pytorch import convert_transfo_xl_checkpoint_to_pytorch
             except ImportError:
-                print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
+                print("transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
                     "In that case, it requires TensorFlow to be installed. Please see "
                     "https://www.tensorflow.org/install/ for installation instructions.")
                 raise
             if len(sys.argv) < 4 or len(sys.argv) > 5:
                 # pylint: disable=line-too-long
-                print("Should be used as `pytorch_transformers transfo_xl TF_CHECKPOINT/TF_DATASET_FILE PYTORCH_DUMP_OUTPUT [TF_CONFIG]`")
+                print("Should be used as `transformers transfo_xl TF_CHECKPOINT/TF_DATASET_FILE PYTORCH_DUMP_OUTPUT [TF_CONFIG]`")
             else:
                 if 'ckpt' in sys.argv[2].lower():
                     TF_CHECKPOINT = sys.argv[2]
@@ -72,14 +72,14 @@ def main():
             try:
                 from .convert_gpt2_original_tf_checkpoint_to_pytorch import convert_gpt2_checkpoint_to_pytorch
             except ImportError:
-                print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
+                print("transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
                     "In that case, it requires TensorFlow to be installed. Please see "
                     "https://www.tensorflow.org/install/ for installation instructions.")
                 raise
 
             if len(sys.argv) < 4 or len(sys.argv) > 5:
                 # pylint: disable=line-too-long
-                print("Should be used as `pytorch_transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [TF_CONFIG]`")
+                print("Should be used as `transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [TF_CONFIG]`")
             else:
                 TF_CHECKPOINT = sys.argv[2]
                 PYTORCH_DUMP_OUTPUT = sys.argv[3]
@@ -92,14 +92,14 @@ def main():
             try:
                 from .convert_xlnet_original_tf_checkpoint_to_pytorch import convert_xlnet_checkpoint_to_pytorch
             except ImportError:
-                print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
+                print("transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
                     "In that case, it requires TensorFlow to be installed. Please see "
                     "https://www.tensorflow.org/install/ for installation instructions.")
                 raise
 
             if len(sys.argv) < 5 or len(sys.argv) > 6:
                 # pylint: disable=line-too-long
-                print("Should be used as `pytorch_transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME]`")
+                print("Should be used as `transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME]`")
             else:
                 TF_CHECKPOINT = sys.argv[2]
                 TF_CONFIG = sys.argv[3]
@@ -118,7 +118,7 @@ def main():
 
             if len(sys.argv) != 4:
                 # pylint: disable=line-too-long
-                print("Should be used as `pytorch_transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT`")
+                print("Should be used as `transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT`")
             else:
                 XLM_CHECKPOINT_PATH = sys.argv[2]
                 PYTORCH_DUMP_OUTPUT = sys.argv[3]
diff --git a/pytorch_transformers/configuration_auto.py b/transformers/configuration_auto.py
similarity index 97%
rename from pytorch_transformers/configuration_auto.py
rename to transformers/configuration_auto.py
index 9e35f85dc7..74dda59fcf 100644
--- a/pytorch_transformers/configuration_auto.py
+++ b/transformers/configuration_auto.py
@@ -31,7 +31,7 @@ logger = logging.getLogger(__name__)
 
 
 class AutoConfig(object):
-    r""":class:`~pytorch_transformers.AutoConfig` is a generic configuration class
+    r""":class:`~transformers.AutoConfig` is a generic configuration class
         that will be instantiated as one of the configuration classes of the library
         when created with the `AutoConfig.from_pretrained(pretrained_model_name_or_path)`
         class method.
@@ -76,7 +76,7 @@ class AutoConfig(object):
             pretrained_model_name_or_path: either:
 
                 - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a path to a `directory` containing a configuration file saved using the :func:`~pytorch_transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
+                - a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
                 - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
 
             cache_dir: (`optional`) string:
diff --git a/pytorch_transformers/configuration_bert.py b/transformers/configuration_bert.py
similarity index 98%
rename from pytorch_transformers/configuration_bert.py
rename to transformers/configuration_bert.py
index 00a22770ac..122a2c9aab 100644
--- a/pytorch_transformers/configuration_bert.py
+++ b/transformers/configuration_bert.py
@@ -45,7 +45,7 @@ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class BertConfig(PretrainedConfig):
     r"""
-        :class:`~pytorch_transformers.BertConfig` is the configuration class to store the configuration of a
+        :class:`~transformers.BertConfig` is the configuration class to store the configuration of a
         `BertModel`.
 
 
diff --git a/pytorch_transformers/configuration_distilbert.py b/transformers/configuration_distilbert.py
similarity index 100%
rename from pytorch_transformers/configuration_distilbert.py
rename to transformers/configuration_distilbert.py
diff --git a/pytorch_transformers/configuration_gpt2.py b/transformers/configuration_gpt2.py
similarity index 100%
rename from pytorch_transformers/configuration_gpt2.py
rename to transformers/configuration_gpt2.py
diff --git a/pytorch_transformers/configuration_openai.py b/transformers/configuration_openai.py
similarity index 100%
rename from pytorch_transformers/configuration_openai.py
rename to transformers/configuration_openai.py
diff --git a/pytorch_transformers/configuration_roberta.py b/transformers/configuration_roberta.py
similarity index 100%
rename from pytorch_transformers/configuration_roberta.py
rename to transformers/configuration_roberta.py
diff --git a/pytorch_transformers/configuration_transfo_xl.py b/transformers/configuration_transfo_xl.py
similarity index 100%
rename from pytorch_transformers/configuration_transfo_xl.py
rename to transformers/configuration_transfo_xl.py
diff --git a/pytorch_transformers/configuration_utils.py b/transformers/configuration_utils.py
similarity index 96%
rename from pytorch_transformers/configuration_utils.py
rename to transformers/configuration_utils.py
index 649a94e28c..8a23be4ff6 100644
--- a/pytorch_transformers/configuration_utils.py
+++ b/transformers/configuration_utils.py
@@ -59,7 +59,7 @@ class PretrainedConfig(object):
 
     def save_pretrained(self, save_directory):
         """ Save a configuration object to the directory `save_directory`, so that it
-            can be re-loaded using the :func:`~pytorch_transformers.PretrainedConfig.from_pretrained` class method.
+            can be re-loaded using the :func:`~transformers.PretrainedConfig.from_pretrained` class method.
         """
         assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved"
 
@@ -71,13 +71,13 @@ class PretrainedConfig(object):
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
-        r""" Instantiate a :class:`~pytorch_transformers.PretrainedConfig` (or a derived class) from a pre-trained model configuration.
+        r""" Instantiate a :class:`~transformers.PretrainedConfig` (or a derived class) from a pre-trained model configuration.
 
         Parameters:
             pretrained_model_name_or_path: either:
 
                 - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a path to a `directory` containing a configuration file saved using the :func:`~pytorch_transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
+                - a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
                 - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
 
             cache_dir: (`optional`) string:
diff --git a/pytorch_transformers/configuration_xlm.py b/transformers/configuration_xlm.py
similarity index 100%
rename from pytorch_transformers/configuration_xlm.py
rename to transformers/configuration_xlm.py
diff --git a/pytorch_transformers/configuration_xlnet.py b/transformers/configuration_xlnet.py
similarity index 100%
rename from pytorch_transformers/configuration_xlnet.py
rename to transformers/configuration_xlnet.py
diff --git a/pytorch_transformers/convert_bert_original_tf_checkpoint_to_pytorch.py b/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py
similarity index 96%
rename from pytorch_transformers/convert_bert_original_tf_checkpoint_to_pytorch.py
rename to transformers/convert_bert_original_tf_checkpoint_to_pytorch.py
index d382d3588e..75808811ef 100755
--- a/pytorch_transformers/convert_bert_original_tf_checkpoint_to_pytorch.py
+++ b/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py
@@ -21,7 +21,7 @@ from __future__ import print_function
 import argparse
 import torch
 
-from pytorch_transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert
+from transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert
 
 import logging
 logging.basicConfig(level=logging.INFO)
diff --git a/pytorch_transformers/convert_bert_pytorch_checkpoint_to_original_tf.py b/transformers/convert_bert_pytorch_checkpoint_to_original_tf.py
similarity index 99%
rename from pytorch_transformers/convert_bert_pytorch_checkpoint_to_original_tf.py
rename to transformers/convert_bert_pytorch_checkpoint_to_original_tf.py
index 15fd6bf5ac..35866caac4 100644
--- a/pytorch_transformers/convert_bert_pytorch_checkpoint_to_original_tf.py
+++ b/transformers/convert_bert_pytorch_checkpoint_to_original_tf.py
@@ -20,7 +20,7 @@ import argparse
 import torch
 import numpy as np
 import tensorflow as tf
-from pytorch_transformers import BertModel
+from transformers import BertModel
 
 
 def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str, model_name:str):
diff --git a/pytorch_transformers/convert_gpt2_original_tf_checkpoint_to_pytorch.py b/transformers/convert_gpt2_original_tf_checkpoint_to_pytorch.py
similarity index 98%
rename from pytorch_transformers/convert_gpt2_original_tf_checkpoint_to_pytorch.py
rename to transformers/convert_gpt2_original_tf_checkpoint_to_pytorch.py
index eb5b3009b4..e2328c08ca 100755
--- a/pytorch_transformers/convert_gpt2_original_tf_checkpoint_to_pytorch.py
+++ b/transformers/convert_gpt2_original_tf_checkpoint_to_pytorch.py
@@ -21,7 +21,7 @@ from io import open
 
 import torch
 
-from pytorch_transformers import (CONFIG_NAME, WEIGHTS_NAME,
+from transformers import (CONFIG_NAME, WEIGHTS_NAME,
                                                      GPT2Config,
                                                      GPT2Model,
                                                      load_tf_weights_in_gpt2)
diff --git a/pytorch_transformers/convert_openai_original_tf_checkpoint_to_pytorch.py b/transformers/convert_openai_original_tf_checkpoint_to_pytorch.py
similarity index 98%
rename from pytorch_transformers/convert_openai_original_tf_checkpoint_to_pytorch.py
rename to transformers/convert_openai_original_tf_checkpoint_to_pytorch.py
index 5eecdd9648..13ebecf2fd 100755
--- a/pytorch_transformers/convert_openai_original_tf_checkpoint_to_pytorch.py
+++ b/transformers/convert_openai_original_tf_checkpoint_to_pytorch.py
@@ -21,7 +21,7 @@ from io import open
 
 import torch
 
-from pytorch_transformers import (CONFIG_NAME, WEIGHTS_NAME,
+from transformers import (CONFIG_NAME, WEIGHTS_NAME,
                                                      OpenAIGPTConfig,
                                                      OpenAIGPTModel,
                                                      load_tf_weights_in_openai_gpt)
diff --git a/pytorch_transformers/convert_pytorch_checkpoint_to_tf2.py b/transformers/convert_pytorch_checkpoint_to_tf2.py
similarity index 97%
rename from pytorch_transformers/convert_pytorch_checkpoint_to_tf2.py
rename to transformers/convert_pytorch_checkpoint_to_tf2.py
index 6c0043d6a7..c5f7650b50 100644
--- a/pytorch_transformers/convert_pytorch_checkpoint_to_tf2.py
+++ b/transformers/convert_pytorch_checkpoint_to_tf2.py
@@ -22,9 +22,9 @@ import os
 import argparse
 import tensorflow as tf
 
-from pytorch_transformers import is_torch_available, cached_path
+from transformers import is_torch_available, cached_path
 
-from pytorch_transformers import (BertConfig, TFBertForPreTraining, TFBertForQuestionAnswering, TFBertForSequenceClassification, load_bert_pt_weights_in_tf2, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+from transformers import (BertConfig, TFBertForPreTraining, TFBertForQuestionAnswering, TFBertForSequenceClassification, load_bert_pt_weights_in_tf2, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
                                   GPT2Config, TFGPT2LMHeadModel, load_gpt2_pt_weights_in_tf2, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,
                                   XLNetConfig, TFXLNetLMHeadModel, load_xlnet_pt_weights_in_tf2, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
                                   XLMConfig, TFXLMWithLMHeadModel, load_xlm_pt_weights_in_tf2, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP,
@@ -36,7 +36,7 @@ from pytorch_transformers import (BertConfig, TFBertForPreTraining, TFBertForQue
 if is_torch_available():
     import torch
     import numpy as np
-    from pytorch_transformers import (BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+    from transformers import (BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
                                       GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
                                       XLNetLMHeadModel, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
                                       XLMWithLMHeadModel, XLM_PRETRAINED_MODEL_ARCHIVE_MAP,
diff --git a/pytorch_transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py b/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py
similarity index 98%
rename from pytorch_transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py
rename to transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py
index 9f74254daa..35f01e9907 100644
--- a/pytorch_transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py
+++ b/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py
@@ -23,12 +23,12 @@ import torch
 
 from fairseq.models.roberta import RobertaModel as FairseqRobertaModel
 from fairseq.modules import TransformerSentenceEncoderLayer
-from pytorch_transformers import (BertConfig, BertEncoder,
+from transformers import (BertConfig, BertEncoder,
                                                 BertIntermediate, BertLayer,
                                                 BertModel, BertOutput,
                                                 BertSelfAttention,
                                                 BertSelfOutput)
-from pytorch_transformers import (RobertaEmbeddings,
+from transformers import (RobertaEmbeddings,
                                                    RobertaForMaskedLM,
                                                    RobertaForSequenceClassification,
                                                    RobertaModel)
diff --git a/pytorch_transformers/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py b/transformers/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py
similarity index 94%
rename from pytorch_transformers/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py
rename to transformers/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py
index b310b73453..a5ff4ed22c 100755
--- a/pytorch_transformers/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py
+++ b/transformers/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py
@@ -23,12 +23,12 @@ from io import open
 
 import torch
 
-import pytorch_transformers.tokenization_transfo_xl as data_utils
+import transformers.tokenization_transfo_xl as data_utils
 
-from pytorch_transformers import CONFIG_NAME, WEIGHTS_NAME
-from pytorch_transformers import (TransfoXLConfig, TransfoXLLMHeadModel,
+from transformers import CONFIG_NAME, WEIGHTS_NAME
+from transformers import (TransfoXLConfig, TransfoXLLMHeadModel,
                                                       load_tf_weights_in_transfo_xl)
-from pytorch_transformers.tokenization_transfo_xl import (CORPUS_NAME, VOCAB_FILES_NAMES)
+from transformers.tokenization_transfo_xl import (CORPUS_NAME, VOCAB_FILES_NAMES)
 
 if sys.version_info[0] == 2:
     import cPickle as pickle
diff --git a/pytorch_transformers/convert_xlm_original_pytorch_checkpoint_to_pytorch.py b/transformers/convert_xlm_original_pytorch_checkpoint_to_pytorch.py
similarity index 96%
rename from pytorch_transformers/convert_xlm_original_pytorch_checkpoint_to_pytorch.py
rename to transformers/convert_xlm_original_pytorch_checkpoint_to_pytorch.py
index aecd936920..91133ef56a 100755
--- a/pytorch_transformers/convert_xlm_original_pytorch_checkpoint_to_pytorch.py
+++ b/transformers/convert_xlm_original_pytorch_checkpoint_to_pytorch.py
@@ -23,8 +23,8 @@ from io import open
 import torch
 import numpy
 
-from pytorch_transformers import CONFIG_NAME, WEIGHTS_NAME
-from pytorch_transformers.tokenization_xlm import VOCAB_FILES_NAMES
+from transformers import CONFIG_NAME, WEIGHTS_NAME
+from transformers.tokenization_xlm import VOCAB_FILES_NAMES
 
 import logging
 logging.basicConfig(level=logging.INFO)
diff --git a/pytorch_transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py b/transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py
similarity index 98%
rename from pytorch_transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py
rename to transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py
index a36fa514b5..3669d9944c 100755
--- a/pytorch_transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py
+++ b/transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py
@@ -22,7 +22,7 @@ import os
 import argparse
 import torch
 
-from pytorch_transformers import (CONFIG_NAME, WEIGHTS_NAME,
+from transformers import (CONFIG_NAME, WEIGHTS_NAME,
                                                     XLNetConfig,
                                                     XLNetLMHeadModel, XLNetForQuestionAnswering,
                                                     XLNetForSequenceClassification,
diff --git a/pytorch_transformers/data/__init__.py b/transformers/data/__init__.py
similarity index 100%
rename from pytorch_transformers/data/__init__.py
rename to transformers/data/__init__.py
diff --git a/pytorch_transformers/data/metrics/__init__.py b/transformers/data/metrics/__init__.py
similarity index 100%
rename from pytorch_transformers/data/metrics/__init__.py
rename to transformers/data/metrics/__init__.py
diff --git a/pytorch_transformers/data/processors/__init__.py b/transformers/data/processors/__init__.py
similarity index 100%
rename from pytorch_transformers/data/processors/__init__.py
rename to transformers/data/processors/__init__.py
diff --git a/pytorch_transformers/data/processors/glue.py b/transformers/data/processors/glue.py
similarity index 100%
rename from pytorch_transformers/data/processors/glue.py
rename to transformers/data/processors/glue.py
diff --git a/pytorch_transformers/data/processors/utils.py b/transformers/data/processors/utils.py
similarity index 100%
rename from pytorch_transformers/data/processors/utils.py
rename to transformers/data/processors/utils.py
diff --git a/pytorch_transformers/file_utils.py b/transformers/file_utils.py
similarity index 97%
rename from pytorch_transformers/file_utils.py
rename to transformers/file_utils.py
index 90bdb231f1..47fdb6e8ba 100644
--- a/pytorch_transformers/file_utils.py
+++ b/transformers/file_utils.py
@@ -48,7 +48,7 @@ except ImportError:
     torch_cache_home = os.path.expanduser(
         os.getenv('TORCH_HOME', os.path.join(
             os.getenv('XDG_CACHE_HOME', '~/.cache'), 'torch')))
-default_cache_path = os.path.join(torch_cache_home, 'pytorch_transformers')
+default_cache_path = os.path.join(torch_cache_home, 'transformers')
 
 try:
     from urllib.parse import urlparse
@@ -65,6 +65,7 @@ except (AttributeError, ImportError):
                                                         default_cache_path))
 
 PYTORCH_TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE  # Kept for backward compatibility
+TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE  # Kept for backward compatibility
 
 WEIGHTS_NAME = "pytorch_model.bin"
 TF2_WEIGHTS_NAME = 'tf_model.h5'
@@ -131,7 +132,7 @@ def filename_to_url(filename, cache_dir=None):
     Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist.
     """
     if cache_dir is None:
-        cache_dir = PYTORCH_TRANSFORMERS_CACHE
+        cache_dir = TRANSFORMERS_CACHE
     if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
         cache_dir = str(cache_dir)
 
@@ -162,7 +163,7 @@ def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=N
         force_download: if True, re-dowload the file even if it's already cached in the cache dir.
     """
     if cache_dir is None:
-        cache_dir = PYTORCH_TRANSFORMERS_CACHE
+        cache_dir = TRANSFORMERS_CACHE
     if sys.version_info[0] == 3 and isinstance(url_or_filename, Path):
         url_or_filename = str(url_or_filename)
     if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
@@ -251,7 +252,7 @@ def get_from_cache(url, cache_dir=None, force_download=False, proxies=None):
     If it's not there, download it. Then return the path to the cached file.
     """
     if cache_dir is None:
-        cache_dir = PYTORCH_TRANSFORMERS_CACHE
+        cache_dir = TRANSFORMERS_CACHE
     if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
         cache_dir = str(cache_dir)
     if sys.version_info[0] == 2 and not isinstance(cache_dir, str):
diff --git a/pytorch_transformers/modeling_auto.py b/transformers/modeling_auto.py
similarity index 90%
rename from pytorch_transformers/modeling_auto.py
rename to transformers/modeling_auto.py
index 31c8fafaa9..b76a883b19 100644
--- a/pytorch_transformers/modeling_auto.py
+++ b/transformers/modeling_auto.py
@@ -36,7 +36,7 @@ logger = logging.getLogger(__name__)
 
 class AutoModel(object):
     r"""
-        :class:`~pytorch_transformers.AutoModel` is a generic model class
+        :class:`~transformers.AutoModel` is a generic model class
         that will be instantiated as one of the base model classes of the library
         when created with the `AutoModel.from_pretrained(pretrained_model_name_or_path)`
         class method.
@@ -84,23 +84,23 @@ class AutoModel(object):
             pretrained_model_name_or_path: either:
 
                 - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
+                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                 - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
 
             model_args: (`optional`) Sequence of positional arguments:
                 All remaning positional arguments will be passed to the underlying model's ``__init__`` method
 
-            config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`:
+            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
                 Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
 
                 - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
-                - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
+                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
                 - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
 
             state_dict: (`optional`) dict:
                 an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
                 This option can be used if you want to create a model from a pretrained configuration but load your own weights.
-                In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option.
+                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
 
             cache_dir: (`optional`) string:
                 Path to a directory in which a downloaded pre-trained model
@@ -120,7 +120,7 @@ class AutoModel(object):
                 Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
 
                 - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
-                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
+                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
 
         Examples::
 
@@ -157,7 +157,7 @@ class AutoModel(object):
 
 class AutoModelWithLMHead(object):
     r"""
-        :class:`~pytorch_transformers.AutoModelWithLMHead` is a generic model class
+        :class:`~transformers.AutoModelWithLMHead` is a generic model class
         that will be instantiated as one of the language modeling model classes of the library
         when created with the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)`
         class method.
@@ -208,23 +208,23 @@ class AutoModelWithLMHead(object):
             pretrained_model_name_or_path: either:
 
                 - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
+                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                 - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
 
             model_args: (`optional`) Sequence of positional arguments:
                 All remaning positional arguments will be passed to the underlying model's ``__init__`` method
 
-            config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`:
+            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
                 Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
 
                 - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
-                - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
+                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
                 - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
 
             state_dict: (`optional`) dict:
                 an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
                 This option can be used if you want to create a model from a pretrained configuration but load your own weights.
-                In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option.
+                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
 
             cache_dir: (`optional`) string:
                 Path to a directory in which a downloaded pre-trained model
@@ -244,7 +244,7 @@ class AutoModelWithLMHead(object):
                 Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
 
                 - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
-                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
+                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
 
         Examples::
 
@@ -281,7 +281,7 @@ class AutoModelWithLMHead(object):
 
 class AutoModelForSequenceClassification(object):
     r"""
-        :class:`~pytorch_transformers.AutoModelForSequenceClassification` is a generic model class
+        :class:`~transformers.AutoModelForSequenceClassification` is a generic model class
         that will be instantiated as one of the sequence classification model classes of the library
         when created with the `AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)`
         class method.
@@ -326,23 +326,23 @@ class AutoModelForSequenceClassification(object):
             pretrained_model_name_or_path: either:
 
                 - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
+                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                 - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
 
             model_args: (`optional`) Sequence of positional arguments:
                 All remaning positional arguments will be passed to the underlying model's ``__init__`` method
 
-            config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`:
+            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
                 Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
 
                 - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
-                - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
+                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
                 - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
 
             state_dict: (`optional`) dict:
                 an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
                 This option can be used if you want to create a model from a pretrained configuration but load your own weights.
-                In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option.
+                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
 
             cache_dir: (`optional`) string:
                 Path to a directory in which a downloaded pre-trained model
@@ -362,7 +362,7 @@ class AutoModelForSequenceClassification(object):
                 Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
 
                 - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
-                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
+                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
 
         Examples::
 
@@ -392,7 +392,7 @@ class AutoModelForSequenceClassification(object):
 
 class AutoModelForQuestionAnswering(object):
     r"""
-        :class:`~pytorch_transformers.AutoModelForQuestionAnswering` is a generic model class
+        :class:`~transformers.AutoModelForQuestionAnswering` is a generic model class
         that will be instantiated as one of the question answering model classes of the library
         when created with the `AutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)`
         class method.
@@ -435,23 +435,23 @@ class AutoModelForQuestionAnswering(object):
             pretrained_model_name_or_path: either:
 
                 - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
+                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                 - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
 
             model_args: (`optional`) Sequence of positional arguments:
                 All remaning positional arguments will be passed to the underlying model's ``__init__`` method
 
-            config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`:
+            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
                 Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
 
                 - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
-                - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
+                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
                 - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
 
             state_dict: (`optional`) dict:
                 an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
                 This option can be used if you want to create a model from a pretrained configuration but load your own weights.
-                In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option.
+                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
 
             cache_dir: (`optional`) string:
                 Path to a directory in which a downloaded pre-trained model
@@ -471,7 +471,7 @@ class AutoModelForQuestionAnswering(object):
                 Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
 
                 - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
-                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
+                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
 
         Examples::
 
diff --git a/pytorch_transformers/modeling_bert.py b/transformers/modeling_bert.py
similarity index 99%
rename from pytorch_transformers/modeling_bert.py
rename to transformers/modeling_bert.py
index 4a61a01641..132d0de4ea 100644
--- a/pytorch_transformers/modeling_bert.py
+++ b/transformers/modeling_bert.py
@@ -486,9 +486,9 @@ BERT_START_DOCSTRING = r"""    The BERT model was proposed in
         https://pytorch.org/docs/stable/nn.html#module
 
     Parameters:
-        config (:class:`~pytorch_transformers.BertConfig`): Model configuration class with all the parameters of the model. 
+        config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model. 
             Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
 
 BERT_INPUTS_DOCSTRING = r"""
@@ -512,9 +512,9 @@ BERT_INPUTS_DOCSTRING = r"""
             Bert is a model with absolute position embeddings so it's usually advised to pad the inputs on
             the right rather than the left.
 
-            Indices can be obtained using :class:`pytorch_transformers.BertTokenizer`.
-            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
-            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+            Indices can be obtained using :class:`transformers.BertTokenizer`.
+            See :func:`transformers.PreTrainedTokenizer.encode` and
+            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
         **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
             Mask to avoid performing attention on padding token indices.
             Mask values selected in ``[0, 1]``:
diff --git a/pytorch_transformers/modeling_distilbert.py b/transformers/modeling_distilbert.py
similarity index 99%
rename from pytorch_transformers/modeling_distilbert.py
rename to transformers/modeling_distilbert.py
index c5cc44be75..2425ab5f47 100644
--- a/pytorch_transformers/modeling_distilbert.py
+++ b/transformers/modeling_distilbert.py
@@ -372,9 +372,9 @@ DISTILBERT_START_DOCSTRING = r"""
         https://medium.com/huggingface/distilbert-8cf3380435b5
 
     Parameters:
-        config (:class:`~pytorch_transformers.DistilBertConfig`): Model configuration class with all the parameters of the model. 
+        config (:class:`~transformers.DistilBertConfig`): Model configuration class with all the parameters of the model. 
             Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
 
 DISTILBERT_INPUTS_DOCSTRING = r"""
diff --git a/pytorch_transformers/modeling_gpt2.py b/transformers/modeling_gpt2.py
similarity index 98%
rename from pytorch_transformers/modeling_gpt2.py
rename to transformers/modeling_gpt2.py
index ee246f1731..bc85224022 100644
--- a/pytorch_transformers/modeling_gpt2.py
+++ b/transformers/modeling_gpt2.py
@@ -280,9 +280,9 @@ GPT2_START_DOCSTRING = r"""    OpenAI GPT-2 model was proposed in
         https://pytorch.org/docs/stable/nn.html#module
 
     Parameters:
-        config (:class:`~pytorch_transformers.GPT2Config`): Model configuration class with all the parameters of the model.
+        config (:class:`~transformers.GPT2Config`): Model configuration class with all the parameters of the model.
             Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
 
 GPT2_INPUTS_DOCSTRING = r"""    Inputs:
@@ -290,9 +290,9 @@ GPT2_INPUTS_DOCSTRING = r"""    Inputs:
             Indices of input sequence tokens in the vocabulary.
             GPT-2 is a model with absolute position embeddings so it's usually advised to pad the inputs on
             the right rather than the left.
-            Indices can be obtained using :class:`pytorch_transformers.GPT2Tokenizer`.
-            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
-            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+            Indices can be obtained using :class:`transformers.GPT2Tokenizer`.
+            See :func:`transformers.PreTrainedTokenizer.encode` and
+            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
         **past**:
             list of ``torch.FloatTensor`` (one for each layer):
             that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
@@ -493,7 +493,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
     Examples::
 
         import torch
-        from pytorch_transformers import GPT2Tokenizer, GPT2LMHeadModel
+        from transformers import GPT2Tokenizer, GPT2LMHeadModel
 
         tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
         model = GPT2LMHeadModel.from_pretrained('gpt2')
@@ -589,7 +589,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
     Examples::
 
         import torch
-        from pytorch_transformers import GPT2Tokenizer, GPT2DoubleHeadsModel
+        from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel
         
         tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
         model = GPT2DoubleHeadsModel.from_pretrained('gpt2')
diff --git a/pytorch_transformers/modeling_openai.py b/transformers/modeling_openai.py
similarity index 98%
rename from pytorch_transformers/modeling_openai.py
rename to transformers/modeling_openai.py
index 4b02baf2f4..2827bf11e5 100644
--- a/pytorch_transformers/modeling_openai.py
+++ b/transformers/modeling_openai.py
@@ -294,9 +294,9 @@ OPENAI_GPT_START_DOCSTRING = r"""    OpenAI GPT model was proposed in
         https://pytorch.org/docs/stable/nn.html#module
 
     Parameters:
-        config (:class:`~pytorch_transformers.OpenAIGPTConfig`): Model configuration class with all the parameters of the model.
+        config (:class:`~transformers.OpenAIGPTConfig`): Model configuration class with all the parameters of the model.
             Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
 
 OPENAI_GPT_INPUTS_DOCSTRING = r"""    Inputs:
@@ -304,9 +304,9 @@ OPENAI_GPT_INPUTS_DOCSTRING = r"""    Inputs:
             Indices of input sequence tokens in the vocabulary.
             GPT is a model with absolute position embeddings so it's usually advised to pad the inputs on
             the right rather than the left.
-            Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`.
-            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
-            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+            Indices can be obtained using :class:`transformers.BPT2Tokenizer`.
+            See :func:`transformers.PreTrainedTokenizer.encode` and
+            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
         **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
             Mask to avoid performing attention on padding token indices.
             Mask values selected in ``[0, 1]``:
diff --git a/pytorch_transformers/modeling_roberta.py b/transformers/modeling_roberta.py
similarity index 97%
rename from pytorch_transformers/modeling_roberta.py
rename to transformers/modeling_roberta.py
index 9b30bcd4be..04ffbecc16 100644
--- a/pytorch_transformers/modeling_roberta.py
+++ b/transformers/modeling_roberta.py
@@ -77,9 +77,9 @@ ROBERTA_START_DOCSTRING = r"""    The RoBERTa model was proposed in
         https://pytorch.org/docs/stable/nn.html#module
 
     Parameters:
-        config (:class:`~pytorch_transformers.RobertaConfig`): Model configuration class with all the parameters of the 
+        config (:class:`~transformers.RobertaConfig`): Model configuration class with all the parameters of the 
             model. Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
 
 ROBERTA_INPUTS_DOCSTRING = r"""
@@ -102,8 +102,8 @@ ROBERTA_INPUTS_DOCSTRING = r"""
             RoBERTa is a model with absolute position embeddings so it's usually advised to pad the inputs on
             the right rather than the left.
 
-            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
-            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+            See :func:`transformers.PreTrainedTokenizer.encode` and
+            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
         **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
             Mask to avoid performing attention on padding token indices.
             Mask values selected in ``[0, 1]``:
@@ -361,9 +361,9 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
 
                 ``token_type_ids:   0   0   0   0  0     0   0``
 
-            Indices can be obtained using :class:`pytorch_transformers.BertTokenizer`.
-            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
-            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+            Indices can be obtained using :class:`transformers.BertTokenizer`.
+            See :func:`transformers.PreTrainedTokenizer.encode` and
+            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
         **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
             Segment token indices to indicate first and second portions of the inputs.
             The second dimension of the input (`num_choices`) indicates the number of choices to score.
diff --git a/pytorch_transformers/modeling_tf_auto.py b/transformers/modeling_tf_auto.py
similarity index 89%
rename from pytorch_transformers/modeling_tf_auto.py
rename to transformers/modeling_tf_auto.py
index 3d3cc6306e..a8f1de047f 100644
--- a/pytorch_transformers/modeling_tf_auto.py
+++ b/transformers/modeling_tf_auto.py
@@ -34,7 +34,7 @@ logger = logging.getLogger(__name__)
 
 class TFAutoModel(object):
     r"""
-        :class:`~pytorch_transformers.TFAutoModel` is a generic model class
+        :class:`~transformers.TFAutoModel` is a generic model class
         that will be instantiated as one of the base model classes of the library
         when created with the `TFAutoModel.from_pretrained(pretrained_model_name_or_path)`
         class method.
@@ -79,7 +79,7 @@ class TFAutoModel(object):
             pretrained_model_name_or_path: either:
 
                 - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
+                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                 - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument.
 
             from_pt: (`Optional`) Boolean
@@ -88,17 +88,17 @@ class TFAutoModel(object):
             model_args: (`optional`) Sequence of positional arguments:
                 All remaning positional arguments will be passed to the underlying model's ``__init__`` method
 
-            config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`:
+            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
                 Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
 
                 - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
-                - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
+                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
                 - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
 
             state_dict: (`optional`) dict:
                 an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
                 This option can be used if you want to create a model from a pretrained configuration but load your own weights.
-                In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option.
+                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
 
             cache_dir: (`optional`) string:
                 Path to a directory in which a downloaded pre-trained model
@@ -118,7 +118,7 @@ class TFAutoModel(object):
                 Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
 
                 - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
-                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
+                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
 
         Examples::
 
@@ -155,7 +155,7 @@ class TFAutoModel(object):
 
 class TFAutoModelWithLMHead(object):
     r"""
-        :class:`~pytorch_transformers.TFAutoModelWithLMHead` is a generic model class
+        :class:`~transformers.TFAutoModelWithLMHead` is a generic model class
         that will be instantiated as one of the language modeling model classes of the library
         when created with the `TFAutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)`
         class method.
@@ -203,7 +203,7 @@ class TFAutoModelWithLMHead(object):
             pretrained_model_name_or_path: either:
 
                 - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
+                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                 - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument.
 
             from_pt: (`Optional`) Boolean
@@ -212,17 +212,17 @@ class TFAutoModelWithLMHead(object):
             model_args: (`optional`) Sequence of positional arguments:
                 All remaning positional arguments will be passed to the underlying model's ``__init__`` method
 
-            config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`:
+            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
                 Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
 
                 - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
-                - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
+                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
                 - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
 
             state_dict: (`optional`) dict:
                 an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
                 This option can be used if you want to create a model from a pretrained configuration but load your own weights.
-                In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option.
+                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
 
             cache_dir: (`optional`) string:
                 Path to a directory in which a downloaded pre-trained model
@@ -242,7 +242,7 @@ class TFAutoModelWithLMHead(object):
                 Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
 
                 - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
-                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
+                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
 
         Examples::
 
@@ -279,7 +279,7 @@ class TFAutoModelWithLMHead(object):
 
 class TFAutoModelForSequenceClassification(object):
     r"""
-        :class:`~pytorch_transformers.TFAutoModelForSequenceClassification` is a generic model class
+        :class:`~transformers.TFAutoModelForSequenceClassification` is a generic model class
         that will be instantiated as one of the sequence classification model classes of the library
         when created with the `TFAutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)`
         class method.
@@ -324,7 +324,7 @@ class TFAutoModelForSequenceClassification(object):
             pretrained_model_name_or_path: either:
 
                 - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
+                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                 - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument.
 
             from_pt: (`Optional`) Boolean
@@ -333,17 +333,17 @@ class TFAutoModelForSequenceClassification(object):
             model_args: (`optional`) Sequence of positional arguments:
                 All remaning positional arguments will be passed to the underlying model's ``__init__`` method
 
-            config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`:
+            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
                 Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
 
                 - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
-                - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
+                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
                 - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
 
             state_dict: (`optional`) dict:
                 an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
                 This option can be used if you want to create a model from a pretrained configuration but load your own weights.
-                In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option.
+                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
 
             cache_dir: (`optional`) string:
                 Path to a directory in which a downloaded pre-trained model
@@ -363,7 +363,7 @@ class TFAutoModelForSequenceClassification(object):
                 Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
 
                 - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
-                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
+                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
 
         Examples::
 
@@ -393,7 +393,7 @@ class TFAutoModelForSequenceClassification(object):
 
 class TFAutoModelForQuestionAnswering(object):
     r"""
-        :class:`~pytorch_transformers.TFAutoModelForQuestionAnswering` is a generic model class
+        :class:`~transformers.TFAutoModelForQuestionAnswering` is a generic model class
         that will be instantiated as one of the question answering model classes of the library
         when created with the `TFAutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)`
         class method.
@@ -436,7 +436,7 @@ class TFAutoModelForQuestionAnswering(object):
             pretrained_model_name_or_path: either:
 
                 - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
+                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                 - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument.
 
             from_pt: (`Optional`) Boolean
@@ -445,17 +445,17 @@ class TFAutoModelForQuestionAnswering(object):
             model_args: (`optional`) Sequence of positional arguments:
                 All remaning positional arguments will be passed to the underlying model's ``__init__`` method
 
-            config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`:
+            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
                 Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
 
                 - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
-                - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
+                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
                 - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
 
             state_dict: (`optional`) dict:
                 an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
                 This option can be used if you want to create a model from a pretrained configuration but load your own weights.
-                In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option.
+                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
 
             cache_dir: (`optional`) string:
                 Path to a directory in which a downloaded pre-trained model
@@ -475,7 +475,7 @@ class TFAutoModelForQuestionAnswering(object):
                 Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
 
                 - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
-                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
+                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
 
         Examples::
 
diff --git a/pytorch_transformers/modeling_tf_bert.py b/transformers/modeling_tf_bert.py
similarity index 97%
rename from pytorch_transformers/modeling_tf_bert.py
rename to transformers/modeling_tf_bert.py
index 51d5155d4b..d763ca991e 100644
--- a/pytorch_transformers/modeling_tf_bert.py
+++ b/transformers/modeling_tf_bert.py
@@ -581,9 +581,9 @@ BERT_START_DOCSTRING = r"""    The BERT model was proposed in
             `model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
 
     Parameters:
-        config (:class:`~pytorch_transformers.BertConfig`): Model configuration class with all the parameters of the model. 
+        config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model. 
             Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
 
 BERT_INPUTS_DOCSTRING = r"""
@@ -607,9 +607,9 @@ BERT_INPUTS_DOCSTRING = r"""
             Bert is a model with absolute position embeddings so it's usually advised to pad the inputs on
             the right rather than the left.
 
-            Indices can be obtained using :class:`pytorch_transformers.BertTokenizer`.
-            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
-            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+            Indices can be obtained using :class:`transformers.BertTokenizer`.
+            See :func:`transformers.PreTrainedTokenizer.encode` and
+            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
         **attention_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
             Mask to avoid performing attention on padding token indices.
             Mask values selected in ``[0, 1]``:
@@ -653,7 +653,7 @@ class TFBertModel(TFBertPreTrainedModel):
     Examples::
 
         import tensorflow as tf
-        from pytorch_transformers import BertTokenizer, TFBertModel
+        from transformers import BertTokenizer, TFBertModel
 
         tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
         model = TFBertModel.from_pretrained('bert-base-uncased')
@@ -692,7 +692,7 @@ class TFBertForPreTraining(TFBertPreTrainedModel):
     Examples::
 
         import tensorflow as tf
-        from pytorch_transformers import BertTokenizer, TFBertForPreTraining
+        from transformers import BertTokenizer, TFBertForPreTraining
 
         tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
         model = TFBertForPreTraining.from_pretrained('bert-base-uncased')
@@ -738,7 +738,7 @@ class TFBertForMaskedLM(TFBertPreTrainedModel):
     Examples::
 
         import tensorflow as tf
-        from pytorch_transformers import BertTokenizer, TFBertForMaskedLM
+        from transformers import BertTokenizer, TFBertForMaskedLM
 
         tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
         model = TFBertForMaskedLM.from_pretrained('bert-base-uncased')
@@ -782,7 +782,7 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel):
     Examples::
 
         import tensorflow as tf
-        from pytorch_transformers import BertTokenizer, TFBertForNextSentencePrediction
+        from transformers import BertTokenizer, TFBertForNextSentencePrediction
 
         tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
         model = TFBertForNextSentencePrediction.from_pretrained('bert-base-uncased')
@@ -827,7 +827,7 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel):
     Examples::
 
         import tensorflow as tf
-        from pytorch_transformers import BertTokenizer, TFBertForSequenceClassification
+        from transformers import BertTokenizer, TFBertForSequenceClassification
 
         tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
         model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')
@@ -879,7 +879,7 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel):
     Examples::
 
         import tensorflow as tf
-        from pytorch_transformers import BertTokenizer, TFBertForMultipleChoice
+        from transformers import BertTokenizer, TFBertForMultipleChoice
 
         tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
         model = TFBertForMultipleChoice.from_pretrained('bert-base-uncased')
@@ -958,7 +958,7 @@ class TFBertForTokenClassification(TFBertPreTrainedModel):
     Examples::
 
         import tensorflow as tf
-        from pytorch_transformers import BertTokenizer, TFBertForTokenClassification
+        from transformers import BertTokenizer, TFBertForTokenClassification
 
         tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
         model = TFBertForTokenClassification.from_pretrained('bert-base-uncased')
@@ -1011,7 +1011,7 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel):
     Examples::
 
         import tensorflow as tf
-        from pytorch_transformers import BertTokenizer, TFBertForQuestionAnswering
+        from transformers import BertTokenizer, TFBertForQuestionAnswering
 
         tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
         model = TFBertForQuestionAnswering.from_pretrained('bert-base-uncased')
diff --git a/pytorch_transformers/modeling_tf_distilbert.py b/transformers/modeling_tf_distilbert.py
similarity index 98%
rename from pytorch_transformers/modeling_tf_distilbert.py
rename to transformers/modeling_tf_distilbert.py
index 2ec73fc43d..2a917a30a4 100644
--- a/pytorch_transformers/modeling_tf_distilbert.py
+++ b/transformers/modeling_tf_distilbert.py
@@ -500,9 +500,9 @@ DISTILBERT_START_DOCSTRING = r"""
             `model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
 
     Parameters:
-        config (:class:`~pytorch_transformers.DistilBertConfig`): Model configuration class with all the parameters of the model. 
+        config (:class:`~transformers.DistilBertConfig`): Model configuration class with all the parameters of the model. 
             Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
 
 DISTILBERT_INPUTS_DOCSTRING = r"""
@@ -540,7 +540,7 @@ class TFDistilBertModel(TFDistilBertPreTrainedModel):
     Examples::
 
         import tensorflow as tf
-        from pytorch_transformers import DistilBertTokenizer, TFDistilBertModel
+        from transformers import DistilBertTokenizer, TFDistilBertModel
 
         tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
         model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')
@@ -598,7 +598,7 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
     Examples::
 
         import tensorflow as tf
-        from pytorch_transformers import DistilBertTokenizer, TFDistilBertForMaskedLM
+        from transformers import DistilBertTokenizer, TFDistilBertForMaskedLM
 
         tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
         model = TFDistilBertForMaskedLM.from_pretrained('distilbert-base-uncased')
@@ -653,7 +653,7 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel):
     Examples::
 
         import tensorflow as tf
-        from pytorch_transformers import BertTokenizer, TFDistilBertForSequenceClassification
+        from transformers import BertTokenizer, TFDistilBertForSequenceClassification
 
         tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
         model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
@@ -710,7 +710,7 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel):
     Examples::
 
         import tensorflow as tf
-        from pytorch_transformers import BertTokenizer, TFDistilBertForQuestionAnswering
+        from transformers import BertTokenizer, TFDistilBertForQuestionAnswering
 
         tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
         model = TFDistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')
diff --git a/pytorch_transformers/modeling_tf_gpt2.py b/transformers/modeling_tf_gpt2.py
similarity index 97%
rename from pytorch_transformers/modeling_tf_gpt2.py
rename to transformers/modeling_tf_gpt2.py
index 77d78e7a93..e958c2cbf1 100644
--- a/pytorch_transformers/modeling_tf_gpt2.py
+++ b/transformers/modeling_tf_gpt2.py
@@ -385,9 +385,9 @@ GPT2_START_DOCSTRING = r"""    OpenAI GPT-2 model was proposed in
             `model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
 
     Parameters:
-        config (:class:`~pytorch_transformers.GPT2Config`): Model configuration class with all the parameters of the model.
+        config (:class:`~transformers.GPT2Config`): Model configuration class with all the parameters of the model.
             Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
 
 GPT2_INPUTS_DOCSTRING = r"""    Inputs:
@@ -395,9 +395,9 @@ GPT2_INPUTS_DOCSTRING = r"""    Inputs:
             Indices of input sequence tokens in the vocabulary.
             GPT-2 is a model with absolute position embeddings so it's usually advised to pad the inputs on
             the right rather than the left.
-            Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`.
-            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
-            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+            Indices can be obtained using :class:`transformers.BPT2Tokenizer`.
+            See :func:`transformers.PreTrainedTokenizer.encode` and
+            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
         **past**:
             list of ``Numpy array`` or ``tf.Tensor`` (one for each layer):
             that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
@@ -441,7 +441,7 @@ class TFGPT2Model(TFGPT2PreTrainedModel):
     Examples::
 
         import tensorflow as tf
-        from pytorch_transformers import GPT2Tokenizer, TFGPT2Model
+        from transformers import GPT2Tokenizer, TFGPT2Model
 
         tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
         model = TFGPT2Model.from_pretrained('gpt2')
@@ -481,7 +481,7 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):
     Examples::
 
         import tensorflow as tf
-        from pytorch_transformers import GPT2Tokenizer, TFGPT2LMHeadModel
+        from transformers import GPT2Tokenizer, TFGPT2LMHeadModel
 
         tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
         model = TFGPT2LMHeadModel.from_pretrained('gpt2')
@@ -537,7 +537,7 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
     Examples::
 
         import tensorflow as tf
-        from pytorch_transformers import GPT2Tokenizer, TFGPT2DoubleHeadsModel
+        from transformers import GPT2Tokenizer, TFGPT2DoubleHeadsModel
 
         tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
         model = TFGPT2DoubleHeadsModel.from_pretrained('gpt2')
diff --git a/pytorch_transformers/modeling_tf_openai.py b/transformers/modeling_tf_openai.py
similarity index 97%
rename from pytorch_transformers/modeling_tf_openai.py
rename to transformers/modeling_tf_openai.py
index 0704e574a4..7521866c24 100644
--- a/pytorch_transformers/modeling_tf_openai.py
+++ b/transformers/modeling_tf_openai.py
@@ -371,9 +371,9 @@ OPENAI_GPT_START_DOCSTRING = r"""    OpenAI GPT model was proposed in
             `model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
 
     Parameters:
-        config (:class:`~pytorch_transformers.OpenAIGPTConfig`): Model configuration class with all the parameters of the model.
+        config (:class:`~transformers.OpenAIGPTConfig`): Model configuration class with all the parameters of the model.
             Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
 
 OPENAI_GPT_INPUTS_DOCSTRING = r"""    Inputs:
@@ -381,9 +381,9 @@ OPENAI_GPT_INPUTS_DOCSTRING = r"""    Inputs:
             Indices of input sequence tokens in the vocabulary.
             GPT is a model with absolute position embeddings so it's usually advised to pad the inputs on
             the right rather than the left.
-            Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`.
-            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
-            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+            Indices can be obtained using :class:`transformers.BPT2Tokenizer`.
+            See :func:`transformers.PreTrainedTokenizer.encode` and
+            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
         **attention_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
             Mask to avoid performing attention on padding token indices.
             Mask values selected in ``[0, 1]``:
@@ -419,7 +419,7 @@ class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel):
     Examples::
 
         import tensorflow as tf
-        from pytorch_transformers import OpenAIGPTTokenizer, TFOpenAIGPTModel
+        from transformers import OpenAIGPTTokenizer, TFOpenAIGPTModel
 
         tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
         model = TFOpenAIGPTModel.from_pretrained('openai-gpt')
@@ -455,7 +455,7 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel):
     Examples::
 
         import tensorflow as tf
-        from pytorch_transformers import OpenAIGPTTokenizer, TFOpenAIGPTLMHeadModel
+        from transformers import OpenAIGPTTokenizer, TFOpenAIGPTLMHeadModel
 
         tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
         model = TFOpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
@@ -506,7 +506,7 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
     Examples::
 
         import tensorflow as tf
-        from pytorch_transformers import OpenAIGPTTokenizer, TFOpenAIGPTDoubleHeadsModel
+        from transformers import OpenAIGPTTokenizer, TFOpenAIGPTDoubleHeadsModel
 
         tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
         model = TFOpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')
diff --git a/pytorch_transformers/modeling_tf_pytorch_utils.py b/transformers/modeling_tf_pytorch_utils.py
similarity index 99%
rename from pytorch_transformers/modeling_tf_pytorch_utils.py
rename to transformers/modeling_tf_pytorch_utils.py
index 25a0cf3bf6..66caa95ec7 100644
--- a/pytorch_transformers/modeling_tf_pytorch_utils.py
+++ b/transformers/modeling_tf_pytorch_utils.py
@@ -189,14 +189,14 @@ def load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path, tf_inputs
             "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions.")
         raise e
 
-    import pytorch_transformers
+    import transformers
 
     tf_path = os.path.abspath(tf_checkpoint_path)
     logger.info("Loading TensorFlow weights from {}".format(tf_checkpoint_path))
 
     # Instantiate and load the associated TF 2.0 model
     tf_model_class_name = "TF" + pt_model.__class__.__name__  # Add "TF" at the beggining
-    tf_model_class = getattr(pytorch_transformers, tf_model_class_name)
+    tf_model_class = getattr(transformers, tf_model_class_name)
     tf_model = tf_model_class(pt_model.config)
 
     if tf_inputs is None:
diff --git a/pytorch_transformers/modeling_tf_roberta.py b/transformers/modeling_tf_roberta.py
similarity index 96%
rename from pytorch_transformers/modeling_tf_roberta.py
rename to transformers/modeling_tf_roberta.py
index 862540cf10..43747133ff 100644
--- a/pytorch_transformers/modeling_tf_roberta.py
+++ b/transformers/modeling_tf_roberta.py
@@ -137,9 +137,9 @@ ROBERTA_START_DOCSTRING = r"""    The RoBERTa model was proposed in
             `model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
 
     Parameters:
-        config (:class:`~pytorch_transformers.RobertaConfig`): Model configuration class with all the parameters of the 
+        config (:class:`~transformers.RobertaConfig`): Model configuration class with all the parameters of the 
             model. Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
 
 ROBERTA_INPUTS_DOCSTRING = r"""
@@ -162,8 +162,8 @@ ROBERTA_INPUTS_DOCSTRING = r"""
             RoBERTa is a model with absolute position embeddings so it's usually advised to pad the inputs on
             the right rather than the left.
 
-            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
-            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+            See :func:`transformers.PreTrainedTokenizer.encode` and
+            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
         **attention_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
             Mask to avoid performing attention on padding token indices.
             Mask values selected in ``[0, 1]``:
@@ -209,7 +209,7 @@ class TFRobertaModel(TFRobertaPreTrainedModel):
     Examples::
 
         import tensorflow as tf
-        from pytorch_transformers import RobertaTokenizer, TFRobertaModel
+        from transformers import RobertaTokenizer, TFRobertaModel
 
         tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
         model = TFRobertaModel.from_pretrained('roberta-base')
@@ -286,7 +286,7 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel):
     Examples::
 
         import tensorflow as tf
-        from pytorch_transformers import RobertaTokenizer, TFRobertaForMaskedLM
+        from transformers import RobertaTokenizer, TFRobertaForMaskedLM
 
         tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
         model = TFRobertaForMaskedLM.from_pretrained('roberta-base')
@@ -354,7 +354,7 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel):
     Examples::
 
         import tensorflow as tf
-        from pytorch_transformers import RobertaTokenizer, TFRobertaForSequenceClassification
+        from transformers import RobertaTokenizer, TFRobertaForSequenceClassification
 
         tokenizer = RoertaTokenizer.from_pretrained('roberta-base')
         model = TFRobertaForSequenceClassification.from_pretrained('roberta-base')
diff --git a/pytorch_transformers/modeling_tf_transfo_xl.py b/transformers/modeling_tf_transfo_xl.py
similarity index 98%
rename from pytorch_transformers/modeling_tf_transfo_xl.py
rename to transformers/modeling_tf_transfo_xl.py
index 377599d9e5..df8c7e7dc9 100644
--- a/pytorch_transformers/modeling_tf_transfo_xl.py
+++ b/transformers/modeling_tf_transfo_xl.py
@@ -614,9 +614,9 @@ TRANSFO_XL_START_DOCSTRING = r"""    The Transformer-XL model was proposed in
             `model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
 
     Parameters:
-        config (:class:`~pytorch_transformers.TransfoXLConfig`): Model configuration class with all the parameters of the model.
+        config (:class:`~transformers.TransfoXLConfig`): Model configuration class with all the parameters of the model.
             Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
 
 TRANSFO_XL_INPUTS_DOCSTRING = r"""
@@ -625,9 +625,9 @@ TRANSFO_XL_INPUTS_DOCSTRING = r"""
             Indices of input sequence tokens in the vocabulary.
             Transformer-XL is a model with relative position embeddings so you can either pad the inputs on
             the right or on the left.
-            Indices can be obtained using :class:`pytorch_transformers.TransfoXLTokenizer`.
-            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
-            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+            Indices can be obtained using :class:`transformers.TransfoXLTokenizer`.
+            See :func:`transformers.PreTrainedTokenizer.encode` and
+            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
         **mems**: (`optional`)
             list of ``Numpy array`` or ``tf.Tensor`` (one for each layer):
             that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
@@ -660,7 +660,7 @@ class TFTransfoXLModel(TFTransfoXLPreTrainedModel):
     Examples::
 
         import tensorflow as tf
-        from pytorch_transformers import TransfoXLTokenizer, TFTransfoXLModel
+        from transformers import TransfoXLTokenizer, TFTransfoXLModel
 
         tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
         model = TFTransfoXLModel.from_pretrained('transfo-xl-wt103')
@@ -702,7 +702,7 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
     Examples::
 
         import tensorflow as tf
-        from pytorch_transformers import TransfoXLTokenizer, TFTransfoXLLMHeadModel
+        from transformers import TransfoXLTokenizer, TFTransfoXLLMHeadModel
 
         tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
         model = TFTransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
diff --git a/pytorch_transformers/modeling_tf_transfo_xl_utilities.py b/transformers/modeling_tf_transfo_xl_utilities.py
similarity index 100%
rename from pytorch_transformers/modeling_tf_transfo_xl_utilities.py
rename to transformers/modeling_tf_transfo_xl_utilities.py
diff --git a/pytorch_transformers/modeling_tf_utils.py b/transformers/modeling_tf_utils.py
similarity index 95%
rename from pytorch_transformers/modeling_tf_utils.py
rename to transformers/modeling_tf_utils.py
index 67402abbd4..06a333af37 100644
--- a/pytorch_transformers/modeling_tf_utils.py
+++ b/transformers/modeling_tf_utils.py
@@ -32,16 +32,16 @@ logger = logging.getLogger(__name__)
 class TFPreTrainedModel(tf.keras.Model):
     r""" Base class for all TF models.
 
-        :class:`~pytorch_transformers.TFPreTrainedModel` takes care of storing the configuration of the models and handles methods for loading/downloading/saving models
+        :class:`~transformers.TFPreTrainedModel` takes care of storing the configuration of the models and handles methods for loading/downloading/saving models
         as well as a few methods commons to all models to (i) resize the input embeddings and (ii) prune heads in the self-attention heads.
 
         Class attributes (overridden by derived classes):
-            - ``config_class``: a class derived from :class:`~pytorch_transformers.PretrainedConfig` to use as configuration class for this model architecture.
+            - ``config_class``: a class derived from :class:`~transformers.PretrainedConfig` to use as configuration class for this model architecture.
             - ``pretrained_model_archive_map``: a python ``dict`` of with `short-cut-names` (string) as keys and `url` (string) of associated pretrained weights as values.
             - ``load_tf_weights``: a python ``method`` for loading a TensorFlow checkpoint in a PyTorch model, taking as arguments:
 
-                - ``model``: an instance of the relevant subclass of :class:`~pytorch_transformers.PreTrainedModel`,
-                - ``config``: an instance of the relevant subclass of :class:`~pytorch_transformers.PretrainedConfig`,
+                - ``model``: an instance of the relevant subclass of :class:`~transformers.PreTrainedModel`,
+                - ``config``: an instance of the relevant subclass of :class:`~transformers.PretrainedConfig`,
                 - ``path``: a path (string) to the TensorFlow checkpoint.
 
             - ``base_model_prefix``: a string indicating the attribute associated to the base model in derived classes of the same architecture adding modules on top of the base model.
@@ -123,7 +123,7 @@ class TFPreTrainedModel(tf.keras.Model):
 
     def save_pretrained(self, save_directory):
         """ Save a model and its configuration file to a directory, so that it
-            can be re-loaded using the `:func:`~pytorch_transformers.PreTrainedModel.from_pretrained`` class method.
+            can be re-loaded using the `:func:`~transformers.PreTrainedModel.from_pretrained`` class method.
         """
         assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved"
 
@@ -151,17 +151,17 @@ class TFPreTrainedModel(tf.keras.Model):
             pretrained_model_name_or_path: either:
 
                 - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
+                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                 - a path or url to a `PyTorch state_dict save file` (e.g. `./pt_model/pytorch_model.bin`). In this case, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the PyTorch checkpoint in a TensorFlow model using the provided conversion scripts and loading the TensorFlow model afterwards.
 
             model_args: (`optional`) Sequence of positional arguments:
                 All remaning positional arguments will be passed to the underlying model's ``__init__`` method
 
-            config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`:
+            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
                 Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
 
                 - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
-                - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
+                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
                 - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
 
             from_pt: (`optional`) boolean, default False:
@@ -182,7 +182,7 @@ class TFPreTrainedModel(tf.keras.Model):
                 Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
 
                 - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
-                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
+                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
 
         Examples::
 
diff --git a/pytorch_transformers/modeling_tf_xlm.py b/transformers/modeling_tf_xlm.py
similarity index 97%
rename from pytorch_transformers/modeling_tf_xlm.py
rename to transformers/modeling_tf_xlm.py
index cc404b6f46..f8f199bbe6 100644
--- a/pytorch_transformers/modeling_tf_xlm.py
+++ b/transformers/modeling_tf_xlm.py
@@ -484,9 +484,9 @@ XLM_START_DOCSTRING = r"""    The XLM model was proposed in
             `model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
 
     Parameters:
-        config (:class:`~pytorch_transformers.XLMConfig`): Model configuration class with all the parameters of the model.
+        config (:class:`~transformers.XLMConfig`): Model configuration class with all the parameters of the model.
             Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
 
 XLM_INPUTS_DOCSTRING = r"""
@@ -497,9 +497,9 @@ XLM_INPUTS_DOCSTRING = r"""
             XLM is a model with absolute position embeddings so it's usually advised to pad the inputs on
             the right rather than the left.
 
-            Indices can be obtained using :class:`pytorch_transformers.XLMTokenizer`.
-            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
-            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+            Indices can be obtained using :class:`transformers.XLMTokenizer`.
+            See :func:`transformers.PreTrainedTokenizer.encode` and
+            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
         **attention_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
             Mask to avoid performing attention on padding token indices.
             Mask values selected in ``[0, 1]``:
@@ -550,7 +550,7 @@ class TFXLMModel(TFXLMPreTrainedModel):
     Examples::
 
         import tensorflow as tf
-        from pytorch_transformers import XLMTokenizer, TFXLMModel
+        from transformers import XLMTokenizer, TFXLMModel
 
         tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
         model = TFXLMModel.from_pretrained('xlm-mlm-en-2048')
@@ -623,7 +623,7 @@ class TFXLMWithLMHeadModel(TFXLMPreTrainedModel):
     Examples::
 
         import tensorflow as tf
-        from pytorch_transformers import XLMTokenizer, TFXLMWithLMHeadModel
+        from transformers import XLMTokenizer, TFXLMWithLMHeadModel
 
         tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
         model = TFXLMWithLMHeadModel.from_pretrained('xlm-mlm-en-2048')
@@ -667,7 +667,7 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel):
     Examples::
 
         import tensorflow as tf
-        from pytorch_transformers import XLMTokenizer, TFXLMForSequenceClassification
+        from transformers import XLMTokenizer, TFXLMForSequenceClassification
 
         tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
         model = TFXLMForSequenceClassification.from_pretrained('xlm-mlm-en-2048')
@@ -715,7 +715,7 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel):
     Examples::
 
         import tensorflow as tf
-        from pytorch_transformers import XLMTokenizer, TFXLMForQuestionAnsweringSimple
+        from transformers import XLMTokenizer, TFXLMForQuestionAnsweringSimple
 
         tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
         model = TFXLMForQuestionAnsweringSimple.from_pretrained('xlm-mlm-en-2048')
diff --git a/pytorch_transformers/modeling_tf_xlnet.py b/transformers/modeling_tf_xlnet.py
similarity index 98%
rename from pytorch_transformers/modeling_tf_xlnet.py
rename to transformers/modeling_tf_xlnet.py
index fa9a045fd8..9370bd0915 100644
--- a/pytorch_transformers/modeling_tf_xlnet.py
+++ b/transformers/modeling_tf_xlnet.py
@@ -716,9 +716,9 @@ XLNET_START_DOCSTRING = r"""    The XLNet model was proposed in
             `model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
 
     Parameters:
-        config (:class:`~pytorch_transformers.XLNetConfig`): Model configuration class with all the parameters of the model.
+        config (:class:`~transformers.XLNetConfig`): Model configuration class with all the parameters of the model.
             Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
 
 XLNET_INPUTS_DOCSTRING = r"""
@@ -727,9 +727,9 @@ XLNET_INPUTS_DOCSTRING = r"""
             Indices of input sequence tokens in the vocabulary.
             XLNet is a model with relative position embeddings so you can either pad the inputs on
             the right or on the left.
-            Indices can be obtained using :class:`pytorch_transformers.XLNetTokenizer`.
-            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
-            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+            Indices can be obtained using :class:`transformers.XLNetTokenizer`.
+            See :func:`transformers.PreTrainedTokenizer.encode` and
+            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
         **attention_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
             Mask to avoid performing attention on padding token indices.
             Mask values selected in ``[0, 1]``:
@@ -793,7 +793,7 @@ class TFXLNetModel(TFXLNetPreTrainedModel):
     Examples::
 
         import tensorflow as tf
-        from pytorch_transformers import XLNetTokenizer, TFXLNetModel
+        from transformers import XLNetTokenizer, TFXLNetModel
 
         tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
         model = TFXLNetModel.from_pretrained('xlnet-large-cased')
@@ -835,7 +835,7 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel):
     Examples::
 
         import tensorflow as tf
-        from pytorch_transformers import XLNetTokenizer, TFXLNetLMHeadModel
+        from transformers import XLNetTokenizer, TFXLNetLMHeadModel
 
         tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
         model = TFXLNetLMHeadModel.from_pretrained('xlnet-large-cased')
@@ -890,7 +890,7 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel):
     Examples::
 
         import tensorflow as tf
-        from pytorch_transformers import XLNetTokenizer, TFXLNetForSequenceClassification
+        from transformers import XLNetTokenizer, TFXLNetForSequenceClassification
 
         tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
         model = TFXLNetForSequenceClassification.from_pretrained('xlnet-large-cased')
@@ -943,7 +943,7 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel):
     Examples::
 
         import tensorflow as tf
-        from pytorch_transformers import XLNetTokenizer, TFXLNetForQuestionAnsweringSimple
+        from transformers import XLNetTokenizer, TFXLNetForQuestionAnsweringSimple
 
         tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
         model = TFXLNetForQuestionAnsweringSimple.from_pretrained('xlnet-base-cased')
diff --git a/pytorch_transformers/modeling_transfo_xl.py b/transformers/modeling_transfo_xl.py
similarity index 98%
rename from pytorch_transformers/modeling_transfo_xl.py
rename to transformers/modeling_transfo_xl.py
index 8925d01c50..6d430e1804 100644
--- a/pytorch_transformers/modeling_transfo_xl.py
+++ b/transformers/modeling_transfo_xl.py
@@ -531,9 +531,9 @@ TRANSFO_XL_START_DOCSTRING = r"""    The Transformer-XL model was proposed in
         https://pytorch.org/docs/stable/nn.html#module
 
     Parameters:
-        config (:class:`~pytorch_transformers.TransfoXLConfig`): Model configuration class with all the parameters of the model.
+        config (:class:`~transformers.TransfoXLConfig`): Model configuration class with all the parameters of the model.
             Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
 
 TRANSFO_XL_INPUTS_DOCSTRING = r"""
@@ -542,9 +542,9 @@ TRANSFO_XL_INPUTS_DOCSTRING = r"""
             Indices of input sequence tokens in the vocabulary.
             Transformer-XL is a model with relative position embeddings so you can either pad the inputs on
             the right or on the left.
-            Indices can be obtained using :class:`pytorch_transformers.TransfoXLTokenizer`.
-            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
-            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+            Indices can be obtained using :class:`transformers.TransfoXLTokenizer`.
+            See :func:`transformers.PreTrainedTokenizer.encode` and
+            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
         **mems**: (`optional`)
             list of ``torch.FloatTensor`` (one for each layer):
             that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
diff --git a/pytorch_transformers/modeling_transfo_xl_utilities.py b/transformers/modeling_transfo_xl_utilities.py
similarity index 100%
rename from pytorch_transformers/modeling_transfo_xl_utilities.py
rename to transformers/modeling_transfo_xl_utilities.py
diff --git a/pytorch_transformers/modeling_utils.py b/transformers/modeling_utils.py
similarity index 96%
rename from pytorch_transformers/modeling_utils.py
rename to transformers/modeling_utils.py
index 541ef7c741..ae2df9514a 100644
--- a/pytorch_transformers/modeling_utils.py
+++ b/transformers/modeling_utils.py
@@ -52,16 +52,16 @@ except ImportError:
 class PreTrainedModel(nn.Module):
     r""" Base class for all models.
 
-        :class:`~pytorch_transformers.PreTrainedModel` takes care of storing the configuration of the models and handles methods for loading/downloading/saving models
+        :class:`~transformers.PreTrainedModel` takes care of storing the configuration of the models and handles methods for loading/downloading/saving models
         as well as a few methods commons to all models to (i) resize the input embeddings and (ii) prune heads in the self-attention heads.
 
         Class attributes (overridden by derived classes):
-            - ``config_class``: a class derived from :class:`~pytorch_transformers.PretrainedConfig` to use as configuration class for this model architecture.
+            - ``config_class``: a class derived from :class:`~transformers.PretrainedConfig` to use as configuration class for this model architecture.
             - ``pretrained_model_archive_map``: a python ``dict`` of with `short-cut-names` (string) as keys and `url` (string) of associated pretrained weights as values.
             - ``load_tf_weights``: a python ``method`` for loading a TensorFlow checkpoint in a PyTorch model, taking as arguments:
 
-                - ``model``: an instance of the relevant subclass of :class:`~pytorch_transformers.PreTrainedModel`,
-                - ``config``: an instance of the relevant subclass of :class:`~pytorch_transformers.PretrainedConfig`,
+                - ``model``: an instance of the relevant subclass of :class:`~transformers.PreTrainedModel`,
+                - ``config``: an instance of the relevant subclass of :class:`~transformers.PretrainedConfig`,
                 - ``path``: a path (string) to the TensorFlow checkpoint.
 
             - ``base_model_prefix``: a string indicating the attribute associated to the base model in derived classes of the same architecture adding modules on top of the base model.
@@ -189,7 +189,7 @@ class PreTrainedModel(nn.Module):
 
     def save_pretrained(self, save_directory):
         """ Save a model and its configuration file to a directory, so that it
-            can be re-loaded using the `:func:`~pytorch_transformers.PreTrainedModel.from_pretrained`` class method.
+            can be re-loaded using the `:func:`~transformers.PreTrainedModel.from_pretrained`` class method.
         """
         assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved"
 
@@ -220,24 +220,24 @@ class PreTrainedModel(nn.Module):
             pretrained_model_name_or_path: either:
 
                 - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
+                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                 - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
                 - None if you are both providing the configuration and state dictionary (resp. with keyword arguments ``config`` and ``state_dict``)
 
             model_args: (`optional`) Sequence of positional arguments:
                 All remaning positional arguments will be passed to the underlying model's ``__init__`` method
 
-            config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`:
+            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
                 Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
 
                 - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
-                - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
+                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
                 - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
 
             state_dict: (`optional`) dict:
                 an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
                 This option can be used if you want to create a model from a pretrained configuration but load your own weights.
-                In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option.
+                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
 
             cache_dir: (`optional`) string:
                 Path to a directory in which a downloaded pre-trained model
@@ -257,7 +257,7 @@ class PreTrainedModel(nn.Module):
                 Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
 
                 - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
-                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
+                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
 
         Examples::
 
@@ -355,7 +355,7 @@ class PreTrainedModel(nn.Module):
             else:
                 # Load from our TensorFlow 2.0 checkpoints
                 try:
-                    from pytorch_transformers import load_tf2_checkpoint_in_pytorch_model
+                    from transformers import load_tf2_checkpoint_in_pytorch_model
                     model = load_tf2_checkpoint_in_pytorch_model(model, resolved_archive_file, allow_missing_keys=True)
                 except ImportError as e:
                     logger.error("Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see "
@@ -554,7 +554,7 @@ class SQuADHead(nn.Module):
     r""" A SQuAD head inspired by XLNet.
 
     Parameters:
-        config (:class:`~pytorch_transformers.XLNetConfig`): Model configuration class with all the parameters of the model.
+        config (:class:`~transformers.XLNetConfig`): Model configuration class with all the parameters of the model.
 
     Inputs:
         **hidden_states**: ``torch.FloatTensor`` of shape ``(batch_size, seq_len, hidden_size)``
diff --git a/pytorch_transformers/modeling_xlm.py b/transformers/modeling_xlm.py
similarity index 98%
rename from pytorch_transformers/modeling_xlm.py
rename to transformers/modeling_xlm.py
index 782e2265ce..b29e721556 100644
--- a/pytorch_transformers/modeling_xlm.py
+++ b/transformers/modeling_xlm.py
@@ -63,7 +63,7 @@ def gelu(x):
     GELU activation
     https://arxiv.org/abs/1606.08415
     https://github.com/huggingface/pytorch-openai-transformer-lm/blob/master/model_pytorch.py#L14
-    https://github.com/huggingface/pytorch-transformers/blob/master/modeling.py
+    https://github.com/huggingface/transformers/blob/master/modeling.py
     """
     # return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
     return 0.5 * x * (1.0 + torch.erf(x / math.sqrt(2.0)))
@@ -265,9 +265,9 @@ XLM_START_DOCSTRING = r"""    The XLM model was proposed in
         https://github.com/facebookresearch/XLM
 
     Parameters:
-        config (:class:`~pytorch_transformers.XLMConfig`): Model configuration class with all the parameters of the model.
+        config (:class:`~transformers.XLMConfig`): Model configuration class with all the parameters of the model.
             Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
 
 XLM_INPUTS_DOCSTRING = r"""
@@ -278,9 +278,9 @@ XLM_INPUTS_DOCSTRING = r"""
             XLM is a model with absolute position embeddings so it's usually advised to pad the inputs on
             the right rather than the left.
 
-            Indices can be obtained using :class:`pytorch_transformers.XLMTokenizer`.
-            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
-            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+            Indices can be obtained using :class:`transformers.XLMTokenizer`.
+            See :func:`transformers.PreTrainedTokenizer.encode` and
+            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
         **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
             Mask to avoid performing attention on padding token indices.
             Mask values selected in ``[0, 1]``:
diff --git a/pytorch_transformers/modeling_xlnet.py b/transformers/modeling_xlnet.py
similarity index 99%
rename from pytorch_transformers/modeling_xlnet.py
rename to transformers/modeling_xlnet.py
index f9960d4945..d6bb2ebd38 100644
--- a/pytorch_transformers/modeling_xlnet.py
+++ b/transformers/modeling_xlnet.py
@@ -488,9 +488,9 @@ XLNET_START_DOCSTRING = r"""    The XLNet model was proposed in
         https://pytorch.org/docs/stable/nn.html#module
 
     Parameters:
-        config (:class:`~pytorch_transformers.XLNetConfig`): Model configuration class with all the parameters of the model.
+        config (:class:`~transformers.XLNetConfig`): Model configuration class with all the parameters of the model.
             Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
 
 XLNET_INPUTS_DOCSTRING = r"""
@@ -499,9 +499,9 @@ XLNET_INPUTS_DOCSTRING = r"""
             Indices of input sequence tokens in the vocabulary.
             XLNet is a model with relative position embeddings so you can either pad the inputs on
             the right or on the left.
-            Indices can be obtained using :class:`pytorch_transformers.XLNetTokenizer`.
-            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
-            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+            Indices can be obtained using :class:`transformers.XLNetTokenizer`.
+            See :func:`transformers.PreTrainedTokenizer.encode` and
+            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
         **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             A parallel sequence of tokens (can be used to indicate various portions of the inputs).
             The type indices in XLNet are NOT selected in the vocabulary, they can be arbitrary numbers and
diff --git a/pytorch_transformers/optimization.py b/transformers/optimization.py
similarity index 100%
rename from pytorch_transformers/optimization.py
rename to transformers/optimization.py
diff --git a/pytorch_transformers/tests/__init__.py b/transformers/tests/__init__.py
similarity index 100%
rename from pytorch_transformers/tests/__init__.py
rename to transformers/tests/__init__.py
diff --git a/pytorch_transformers/tests/configuration_common_test.py b/transformers/tests/configuration_common_test.py
similarity index 100%
rename from pytorch_transformers/tests/configuration_common_test.py
rename to transformers/tests/configuration_common_test.py
diff --git a/pytorch_transformers/tests/conftest.py b/transformers/tests/conftest.py
similarity index 100%
rename from pytorch_transformers/tests/conftest.py
rename to transformers/tests/conftest.py
diff --git a/pytorch_transformers/tests/fixtures/input.txt b/transformers/tests/fixtures/input.txt
similarity index 100%
rename from pytorch_transformers/tests/fixtures/input.txt
rename to transformers/tests/fixtures/input.txt
diff --git a/pytorch_transformers/tests/fixtures/sample_text.txt b/transformers/tests/fixtures/sample_text.txt
similarity index 100%
rename from pytorch_transformers/tests/fixtures/sample_text.txt
rename to transformers/tests/fixtures/sample_text.txt
diff --git a/pytorch_transformers/tests/fixtures/test_sentencepiece.model b/transformers/tests/fixtures/test_sentencepiece.model
similarity index 100%
rename from pytorch_transformers/tests/fixtures/test_sentencepiece.model
rename to transformers/tests/fixtures/test_sentencepiece.model
diff --git a/pytorch_transformers/tests/modeling_auto_test.py b/transformers/tests/modeling_auto_test.py
similarity index 95%
rename from pytorch_transformers/tests/modeling_auto_test.py
rename to transformers/tests/modeling_auto_test.py
index 4b00891c38..af1de29cce 100644
--- a/pytorch_transformers/tests/modeling_auto_test.py
+++ b/transformers/tests/modeling_auto_test.py
@@ -21,15 +21,15 @@ import shutil
 import pytest
 import logging
 
-from pytorch_transformers import is_torch_available
+from transformers import is_torch_available
 
 if is_torch_available():
-    from pytorch_transformers import (AutoConfig, BertConfig,
+    from transformers import (AutoConfig, BertConfig,
                                     AutoModel, BertModel,
                                     AutoModelWithLMHead, BertForMaskedLM,
                                     AutoModelForSequenceClassification, BertForSequenceClassification,
                                     AutoModelForQuestionAnswering, BertForQuestionAnswering)
-    from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
+    from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
 
     from .modeling_common_test import (CommonTestCases, ids_tensor)
     from .configuration_common_test import ConfigTester
diff --git a/pytorch_transformers/tests/modeling_bert_test.py b/transformers/tests/modeling_bert_test.py
similarity index 98%
rename from pytorch_transformers/tests/modeling_bert_test.py
rename to transformers/tests/modeling_bert_test.py
index cac1f996e9..633c97e263 100644
--- a/pytorch_transformers/tests/modeling_bert_test.py
+++ b/transformers/tests/modeling_bert_test.py
@@ -20,17 +20,17 @@ import unittest
 import shutil
 import pytest
 
-from pytorch_transformers import is_torch_available
+from transformers import is_torch_available
 
 from .modeling_common_test import (CommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
 
 if is_torch_available():
-    from pytorch_transformers import (BertConfig, BertModel, BertForMaskedLM,
+    from transformers import (BertConfig, BertModel, BertForMaskedLM,
                                         BertForNextSentencePrediction, BertForPreTraining,
                                         BertForQuestionAnswering, BertForSequenceClassification,
                                         BertForTokenClassification, BertForMultipleChoice)
-    from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
+    from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
 else:
     pytestmark = pytest.mark.skip("Require Torch")
 
@@ -310,7 +310,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
 
     @pytest.mark.slow
     def test_model_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_transformers_test/"
+        cache_dir = "/tmp/transformers_test/"
         for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
             model = BertModel.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
diff --git a/pytorch_transformers/tests/modeling_common_test.py b/transformers/tests/modeling_common_test.py
similarity index 99%
rename from pytorch_transformers/tests/modeling_common_test.py
rename to transformers/tests/modeling_common_test.py
index 0762a7a634..2b66757c28 100644
--- a/pytorch_transformers/tests/modeling_common_test.py
+++ b/transformers/tests/modeling_common_test.py
@@ -27,12 +27,12 @@ import unittest
 import logging
 import pytest
 
-from pytorch_transformers import is_torch_available
+from transformers import is_torch_available
 
 if is_torch_available():
     import torch
 
-    from pytorch_transformers import (PretrainedConfig, PreTrainedModel,
+    from transformers import (PretrainedConfig, PreTrainedModel,
                                     BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
                                     GPT2LMHeadModel, GPT2Config, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
 else:
@@ -621,7 +621,7 @@ class CommonTestCases:
                 [[], []])
 
         def create_and_check_model_from_pretrained(self):
-            cache_dir = "/tmp/pytorch_transformers_test/"
+            cache_dir = "/tmp/transformers_test/"
             for model_name in list(self.base_model_class.pretrained_model_archive_map.keys())[:1]:
                 model = self.base_model_class.from_pretrained(model_name, cache_dir=cache_dir)
                 shutil.rmtree(cache_dir)
diff --git a/pytorch_transformers/tests/modeling_distilbert_test.py b/transformers/tests/modeling_distilbert_test.py
similarity index 97%
rename from pytorch_transformers/tests/modeling_distilbert_test.py
rename to transformers/tests/modeling_distilbert_test.py
index 8fef9d5833..937d03396d 100644
--- a/pytorch_transformers/tests/modeling_distilbert_test.py
+++ b/transformers/tests/modeling_distilbert_test.py
@@ -19,10 +19,10 @@ from __future__ import print_function
 import unittest
 import pytest
 
-from pytorch_transformers import is_torch_available
+from transformers import is_torch_available
 
 if is_torch_available():
-    from pytorch_transformers import (DistilBertConfig, DistilBertModel, DistilBertForMaskedLM,
+    from transformers import (DistilBertConfig, DistilBertModel, DistilBertForMaskedLM,
                                     DistilBertForQuestionAnswering, DistilBertForSequenceClassification)
 else:
     pytestmark = pytest.mark.skip("Require Torch")
@@ -211,7 +211,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
 
     # @pytest.mark.slow
     # def test_model_from_pretrained(self):
-    #     cache_dir = "/tmp/pytorch_transformers_test/"
+    #     cache_dir = "/tmp/transformers_test/"
     #     for model_name in list(DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
     #         model = DistilBertModel.from_pretrained(model_name, cache_dir=cache_dir)
     #         shutil.rmtree(cache_dir)
diff --git a/pytorch_transformers/tests/modeling_gpt2_test.py b/transformers/tests/modeling_gpt2_test.py
similarity index 98%
rename from pytorch_transformers/tests/modeling_gpt2_test.py
rename to transformers/tests/modeling_gpt2_test.py
index e5accfa8cf..4263e51bc9 100644
--- a/pytorch_transformers/tests/modeling_gpt2_test.py
+++ b/transformers/tests/modeling_gpt2_test.py
@@ -20,10 +20,10 @@ import unittest
 import pytest
 import shutil
 
-from pytorch_transformers import is_torch_available
+from transformers import is_torch_available
 
 if is_torch_available():
-    from pytorch_transformers import (GPT2Config, GPT2Model, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
+    from transformers import (GPT2Config, GPT2Model, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
                                     GPT2LMHeadModel, GPT2DoubleHeadsModel)
 else:
     pytestmark = pytest.mark.skip("Require Torch")
@@ -237,7 +237,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
 
     @pytest.mark.slow
     def test_model_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_transformers_test/"
+        cache_dir = "/tmp/transformers_test/"
         for model_name in list(GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
             model = GPT2Model.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
diff --git a/pytorch_transformers/tests/modeling_openai_test.py b/transformers/tests/modeling_openai_test.py
similarity index 97%
rename from pytorch_transformers/tests/modeling_openai_test.py
rename to transformers/tests/modeling_openai_test.py
index 6df4406d03..33218288a0 100644
--- a/pytorch_transformers/tests/modeling_openai_test.py
+++ b/transformers/tests/modeling_openai_test.py
@@ -20,10 +20,10 @@ import unittest
 import pytest
 import shutil
 
-from pytorch_transformers import is_torch_available
+from transformers import is_torch_available
 
 if is_torch_available():
-    from pytorch_transformers import (OpenAIGPTConfig, OpenAIGPTModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
+    from transformers import (OpenAIGPTConfig, OpenAIGPTModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
                                     OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel)
 else:
     pytestmark = pytest.mark.skip("Require Torch")
@@ -205,7 +205,7 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
 
     @pytest.mark.slow
     def test_model_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_transformers_test/"
+        cache_dir = "/tmp/transformers_test/"
         for model_name in list(OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
             model = OpenAIGPTModel.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
diff --git a/pytorch_transformers/tests/modeling_roberta_test.py b/transformers/tests/modeling_roberta_test.py
similarity index 96%
rename from pytorch_transformers/tests/modeling_roberta_test.py
rename to transformers/tests/modeling_roberta_test.py
index 11f2893671..82e10da915 100644
--- a/pytorch_transformers/tests/modeling_roberta_test.py
+++ b/transformers/tests/modeling_roberta_test.py
@@ -20,12 +20,12 @@ import unittest
 import shutil
 import pytest
 
-from pytorch_transformers import is_torch_available
+from transformers import is_torch_available
 
 if is_torch_available():
     import torch
-    from pytorch_transformers import (RobertaConfig, RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification)
-    from pytorch_transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
+    from transformers import (RobertaConfig, RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification)
+    from transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
 else:
     pytestmark = pytest.mark.skip("Require Torch")
 
@@ -180,7 +180,7 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
 
     @pytest.mark.slow
     def test_model_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_transformers_test/"
+        cache_dir = "/tmp/transformers_test/"
         for model_name in list(ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
             model = RobertaModel.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
diff --git a/pytorch_transformers/tests/modeling_tf_auto_test.py b/transformers/tests/modeling_tf_auto_test.py
similarity index 95%
rename from pytorch_transformers/tests/modeling_tf_auto_test.py
rename to transformers/tests/modeling_tf_auto_test.py
index 7b080bafcd..2cda3abc1c 100644
--- a/pytorch_transformers/tests/modeling_tf_auto_test.py
+++ b/transformers/tests/modeling_tf_auto_test.py
@@ -21,15 +21,15 @@ import shutil
 import pytest
 import logging
 
-from pytorch_transformers import is_tf_available
+from transformers import is_tf_available
 
 if is_tf_available():
-    from pytorch_transformers import (AutoConfig, BertConfig,
+    from transformers import (AutoConfig, BertConfig,
                                       TFAutoModel, TFBertModel,
                                       TFAutoModelWithLMHead, TFBertForMaskedLM,
                                       TFAutoModelForSequenceClassification, TFBertForSequenceClassification,
                                       TFAutoModelForQuestionAnswering, TFBertForQuestionAnswering)
-    from pytorch_transformers.modeling_tf_bert import TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP
+    from transformers.modeling_tf_bert import TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP
 
     from .modeling_common_test import (CommonTestCases, ids_tensor)
     from .configuration_common_test import ConfigTester
diff --git a/pytorch_transformers/tests/modeling_tf_bert_test.py b/transformers/tests/modeling_tf_bert_test.py
similarity index 98%
rename from pytorch_transformers/tests/modeling_tf_bert_test.py
rename to transformers/tests/modeling_tf_bert_test.py
index 53499110f2..a1715d2568 100644
--- a/pytorch_transformers/tests/modeling_tf_bert_test.py
+++ b/transformers/tests/modeling_tf_bert_test.py
@@ -24,11 +24,11 @@ import sys
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
 
-from pytorch_transformers import BertConfig, is_tf_available
+from transformers import BertConfig, is_tf_available
 
 if is_tf_available():
     import tensorflow as tf
-    from pytorch_transformers.modeling_tf_bert import (TFBertModel, TFBertForMaskedLM,
+    from transformers.modeling_tf_bert import (TFBertModel, TFBertForMaskedLM,
                                                        TFBertForNextSentencePrediction,
                                                        TFBertForPreTraining,
                                                        TFBertForSequenceClassification,
@@ -315,7 +315,7 @@ class TFBertModelTest(TFCommonTestCases.TFCommonModelTester):
 
     @pytest.mark.slow
     def test_model_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_transformers_test/"
+        cache_dir = "/tmp/transformers_test/"
         # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
         for model_name in ['bert-base-uncased']:
             model = TFBertModel.from_pretrained(model_name, cache_dir=cache_dir)
diff --git a/pytorch_transformers/tests/modeling_tf_common_test.py b/transformers/tests/modeling_tf_common_test.py
similarity index 96%
rename from pytorch_transformers/tests/modeling_tf_common_test.py
rename to transformers/tests/modeling_tf_common_test.py
index 5e7d29cb7f..483f031b16 100644
--- a/pytorch_transformers/tests/modeling_tf_common_test.py
+++ b/transformers/tests/modeling_tf_common_test.py
@@ -26,13 +26,13 @@ import uuid
 import pytest
 import sys
 
-from pytorch_transformers import is_tf_available, is_torch_available
+from transformers import is_tf_available, is_torch_available
 
 if is_tf_available():
     import tensorflow as tf
     import numpy as np
-    from pytorch_transformers import TFPreTrainedModel
-    # from pytorch_transformers.modeling_bert import BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP
+    from transformers import TFPreTrainedModel
+    # from transformers.modeling_bert import BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP
 else:
     pytestmark = pytest.mark.skip("Require TensorFlow")
 
@@ -71,19 +71,19 @@ class TFCommonTestCases:
             if not is_torch_available():
                 return
 
-            import pytorch_transformers
+            import transformers
 
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
             for model_class in self.all_model_classes:
                 pt_model_class_name = model_class.__name__[2:]  # Skip the "TF" at the beggining
-                pt_model_class = getattr(pytorch_transformers, pt_model_class_name)
+                pt_model_class = getattr(transformers, pt_model_class_name)
 
                 tf_model = model_class(config)
                 pt_model = pt_model_class(config)
 
-                tf_model = pytorch_transformers.load_pytorch_model_in_tf2_model(tf_model, pt_model, tf_inputs=inputs_dict)
-                pt_model = pytorch_transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model)
+                tf_model = transformers.load_pytorch_model_in_tf2_model(tf_model, pt_model, tf_inputs=inputs_dict)
+                pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model)
 
 
         def test_keyword_and_dict_args(self):
diff --git a/pytorch_transformers/tests/modeling_tf_distilbert_test.py b/transformers/tests/modeling_tf_distilbert_test.py
similarity index 97%
rename from pytorch_transformers/tests/modeling_tf_distilbert_test.py
rename to transformers/tests/modeling_tf_distilbert_test.py
index 5d7f0f7a31..e6d3795914 100644
--- a/pytorch_transformers/tests/modeling_tf_distilbert_test.py
+++ b/transformers/tests/modeling_tf_distilbert_test.py
@@ -22,11 +22,11 @@ import pytest
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
 
-from pytorch_transformers import DistilBertConfig, is_tf_available
+from transformers import DistilBertConfig, is_tf_available
 
 if is_tf_available():
     import tensorflow as tf
-    from pytorch_transformers.modeling_tf_distilbert import (TFDistilBertModel,
+    from transformers.modeling_tf_distilbert import (TFDistilBertModel,
                                                              TFDistilBertForMaskedLM,
                                                              TFDistilBertForQuestionAnswering,
                                                              TFDistilBertForSequenceClassification)
@@ -212,7 +212,7 @@ class TFDistilBertModelTest(TFCommonTestCases.TFCommonModelTester):
 
     # @pytest.mark.slow
     # def test_model_from_pretrained(self):
-    #     cache_dir = "/tmp/pytorch_transformers_test/"
+    #     cache_dir = "/tmp/transformers_test/"
     #     for model_name in list(DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
     #         model = DistilBertModel.from_pretrained(model_name, cache_dir=cache_dir)
     #         shutil.rmtree(cache_dir)
diff --git a/pytorch_transformers/tests/modeling_tf_gpt2_test.py b/transformers/tests/modeling_tf_gpt2_test.py
similarity index 98%
rename from pytorch_transformers/tests/modeling_tf_gpt2_test.py
rename to transformers/tests/modeling_tf_gpt2_test.py
index 490d5c4e32..658456d15b 100644
--- a/pytorch_transformers/tests/modeling_tf_gpt2_test.py
+++ b/transformers/tests/modeling_tf_gpt2_test.py
@@ -24,11 +24,11 @@ import sys
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
 
-from pytorch_transformers import GPT2Config, is_tf_available
+from transformers import GPT2Config, is_tf_available
 
 if is_tf_available():
     import tensorflow as tf
-    from pytorch_transformers.modeling_tf_gpt2 import (TFGPT2Model, TFGPT2LMHeadModel,
+    from transformers.modeling_tf_gpt2 import (TFGPT2Model, TFGPT2LMHeadModel,
                                                        TFGPT2DoubleHeadsModel,
                                                        TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
 else:
@@ -221,7 +221,7 @@ class TFGPT2ModelTest(TFCommonTestCases.TFCommonModelTester):
 
     @pytest.mark.slow
     def test_model_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_transformers_test/"
+        cache_dir = "/tmp/transformers_test/"
         for model_name in list(TF_gpt2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
             model = TFGPT2Model.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
diff --git a/pytorch_transformers/tests/modeling_tf_openai_gpt_test.py b/transformers/tests/modeling_tf_openai_gpt_test.py
similarity index 97%
rename from pytorch_transformers/tests/modeling_tf_openai_gpt_test.py
rename to transformers/tests/modeling_tf_openai_gpt_test.py
index c553209db3..d470c8862d 100644
--- a/pytorch_transformers/tests/modeling_tf_openai_gpt_test.py
+++ b/transformers/tests/modeling_tf_openai_gpt_test.py
@@ -24,11 +24,11 @@ import sys
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
 
-from pytorch_transformers import OpenAIGPTConfig, is_tf_available
+from transformers import OpenAIGPTConfig, is_tf_available
 
 if is_tf_available():
     import tensorflow as tf
-    from pytorch_transformers.modeling_tf_openai import (TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel,
+    from transformers.modeling_tf_openai import (TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel,
                                                          TFOpenAIGPTDoubleHeadsModel,
                                                          TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP)
 else:
@@ -220,7 +220,7 @@ class TFOpenAIGPTModelTest(TFCommonTestCases.TFCommonModelTester):
 
     @pytest.mark.slow
     def test_model_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_transformers_test/"
+        cache_dir = "/tmp/transformers_test/"
         for model_name in list(TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
             model = TFOpenAIGPTModel.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
diff --git a/pytorch_transformers/tests/modeling_tf_roberta_test.py b/transformers/tests/modeling_tf_roberta_test.py
similarity index 97%
rename from pytorch_transformers/tests/modeling_tf_roberta_test.py
rename to transformers/tests/modeling_tf_roberta_test.py
index 1fbd5d0f5f..735c9aae27 100644
--- a/pytorch_transformers/tests/modeling_tf_roberta_test.py
+++ b/transformers/tests/modeling_tf_roberta_test.py
@@ -23,12 +23,12 @@ import pytest
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
 
-from pytorch_transformers import RobertaConfig, is_tf_available
+from transformers import RobertaConfig, is_tf_available
 
 if is_tf_available():
     import tensorflow as tf
     import numpy
-    from pytorch_transformers.modeling_tf_roberta import (TFRobertaModel, TFRobertaForMaskedLM,
+    from transformers.modeling_tf_roberta import (TFRobertaModel, TFRobertaForMaskedLM,
                                                           TFRobertaForSequenceClassification,
                                                           TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
 else:
@@ -178,7 +178,7 @@ class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester):
 
     @pytest.mark.slow
     def test_model_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_transformers_test/"
+        cache_dir = "/tmp/transformers_test/"
         for model_name in list(TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
             model = TFRobertaModel.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
diff --git a/pytorch_transformers/tests/modeling_tf_transfo_xl_test.py b/transformers/tests/modeling_tf_transfo_xl_test.py
similarity index 97%
rename from pytorch_transformers/tests/modeling_tf_transfo_xl_test.py
rename to transformers/tests/modeling_tf_transfo_xl_test.py
index 58a4343dfd..534fe39646 100644
--- a/pytorch_transformers/tests/modeling_tf_transfo_xl_test.py
+++ b/transformers/tests/modeling_tf_transfo_xl_test.py
@@ -24,11 +24,11 @@ import pytest
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
 
-from pytorch_transformers import TransfoXLConfig, is_tf_available
+from transformers import TransfoXLConfig, is_tf_available
 
 if is_tf_available():
     import tensorflow as tf
-    from pytorch_transformers.modeling_tf_transfo_xl import (TFTransfoXLModel,
+    from transformers.modeling_tf_transfo_xl import (TFTransfoXLModel,
                                                              TFTransfoXLLMHeadModel,
                                                              TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP)
 else:
@@ -206,7 +206,7 @@ class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester):
 
     @pytest.mark.slow
     def test_model_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_transformers_test/"
+        cache_dir = "/tmp/transformers_test/"
         for model_name in list(TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
             model = TFTransfoXLModel.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
diff --git a/pytorch_transformers/tests/modeling_tf_xlm_test.py b/transformers/tests/modeling_tf_xlm_test.py
similarity index 98%
rename from pytorch_transformers/tests/modeling_tf_xlm_test.py
rename to transformers/tests/modeling_tf_xlm_test.py
index 26329bebb6..1bd661bebf 100644
--- a/pytorch_transformers/tests/modeling_tf_xlm_test.py
+++ b/transformers/tests/modeling_tf_xlm_test.py
@@ -20,11 +20,11 @@ import unittest
 import shutil
 import pytest
 
-from pytorch_transformers import is_tf_available
+from transformers import is_tf_available
 
 if is_tf_available():
     import tensorflow as tf
-    from pytorch_transformers import (XLMConfig, TFXLMModel,
+    from transformers import (XLMConfig, TFXLMModel,
                                       TFXLMWithLMHeadModel,
                                       TFXLMForSequenceClassification,
                                       TFXLMForQuestionAnsweringSimple,
@@ -253,7 +253,7 @@ class TFXLMModelTest(TFCommonTestCases.TFCommonModelTester):
 
     @pytest.mark.slow
     def test_model_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_transformers_test/"
+        cache_dir = "/tmp/transformers_test/"
         for model_name in list(TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
             model = XLMModel.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
diff --git a/pytorch_transformers/tests/modeling_tf_xlnet_test.py b/transformers/tests/modeling_tf_xlnet_test.py
similarity index 98%
rename from pytorch_transformers/tests/modeling_tf_xlnet_test.py
rename to transformers/tests/modeling_tf_xlnet_test.py
index 01c4494664..6a0434938f 100644
--- a/pytorch_transformers/tests/modeling_tf_xlnet_test.py
+++ b/transformers/tests/modeling_tf_xlnet_test.py
@@ -23,12 +23,12 @@ import random
 import shutil
 import pytest
 
-from pytorch_transformers import XLNetConfig, is_tf_available
+from transformers import XLNetConfig, is_tf_available
 
 if is_tf_available():
     import tensorflow as tf
 
-    from pytorch_transformers.modeling_tf_xlnet import (TFXLNetModel, TFXLNetLMHeadModel,
+    from transformers.modeling_tf_xlnet import (TFXLNetModel, TFXLNetLMHeadModel,
                                                         TFXLNetForSequenceClassification,
                                                         TFXLNetForQuestionAnsweringSimple,
                                                         TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
@@ -291,7 +291,7 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
 
     @pytest.mark.slow
     def test_model_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_transformers_test/"
+        cache_dir = "/tmp/transformers_test/"
         for model_name in list(TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
             model = TFXLNetModel.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
diff --git a/pytorch_transformers/tests/modeling_transfo_xl_test.py b/transformers/tests/modeling_transfo_xl_test.py
similarity index 96%
rename from pytorch_transformers/tests/modeling_transfo_xl_test.py
rename to transformers/tests/modeling_transfo_xl_test.py
index 035f20991f..f7b913da5b 100644
--- a/pytorch_transformers/tests/modeling_transfo_xl_test.py
+++ b/transformers/tests/modeling_transfo_xl_test.py
@@ -21,12 +21,12 @@ import random
 import shutil
 import pytest
 
-from pytorch_transformers import is_torch_available
+from transformers import is_torch_available
 
 if is_torch_available():
     import torch
-    from pytorch_transformers import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel)
-    from pytorch_transformers.modeling_transfo_xl import TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
+    from transformers import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel)
+    from transformers.modeling_transfo_xl import TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
 else:
     pytestmark = pytest.mark.skip("Require Torch")
 
@@ -206,7 +206,7 @@ class TransfoXLModelTest(CommonTestCases.CommonModelTester):
 
     @pytest.mark.slow
     def test_model_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_transformers_test/"
+        cache_dir = "/tmp/transformers_test/"
         for model_name in list(TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
             model = TransfoXLModel.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
diff --git a/pytorch_transformers/tests/modeling_xlm_test.py b/transformers/tests/modeling_xlm_test.py
similarity index 97%
rename from pytorch_transformers/tests/modeling_xlm_test.py
rename to transformers/tests/modeling_xlm_test.py
index c3f75e2623..0133febb58 100644
--- a/pytorch_transformers/tests/modeling_xlm_test.py
+++ b/transformers/tests/modeling_xlm_test.py
@@ -20,12 +20,12 @@ import unittest
 import shutil
 import pytest
 
-from pytorch_transformers import is_torch_available
+from transformers import is_torch_available
 
 if is_torch_available():
-    from pytorch_transformers import (XLMConfig, XLMModel, XLMWithLMHeadModel, XLMForQuestionAnswering,
+    from transformers import (XLMConfig, XLMModel, XLMWithLMHeadModel, XLMForQuestionAnswering,
                                       XLMForSequenceClassification, XLMForQuestionAnsweringSimple)
-    from pytorch_transformers.modeling_xlm import XLM_PRETRAINED_MODEL_ARCHIVE_MAP
+    from transformers.modeling_xlm import XLM_PRETRAINED_MODEL_ARCHIVE_MAP
 else:
     pytestmark = pytest.mark.skip("Require Torch")
 
@@ -314,7 +314,7 @@ class XLMModelTest(CommonTestCases.CommonModelTester):
 
     @pytest.mark.slow
     def test_model_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_transformers_test/"
+        cache_dir = "/tmp/transformers_test/"
         for model_name in list(XLM_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
             model = XLMModel.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
diff --git a/pytorch_transformers/tests/modeling_xlnet_test.py b/transformers/tests/modeling_xlnet_test.py
similarity index 97%
rename from pytorch_transformers/tests/modeling_xlnet_test.py
rename to transformers/tests/modeling_xlnet_test.py
index 8c8b3f964f..10cbdaf37b 100644
--- a/pytorch_transformers/tests/modeling_xlnet_test.py
+++ b/transformers/tests/modeling_xlnet_test.py
@@ -23,13 +23,13 @@ import random
 import shutil
 import pytest
 
-from pytorch_transformers import is_torch_available
+from transformers import is_torch_available
 
 if is_torch_available():
     import torch
 
-    from pytorch_transformers import (XLNetConfig, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering)
-    from pytorch_transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_MAP
+    from transformers import (XLNetConfig, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering)
+    from transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_MAP
 else:
     pytestmark = pytest.mark.skip("Require Torch")
 
@@ -317,7 +317,7 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
 
     @pytest.mark.slow
     def test_model_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_transformers_test/"
+        cache_dir = "/tmp/transformers_test/"
         for model_name in list(XLNET_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
             model = XLNetModel.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
diff --git a/pytorch_transformers/tests/optimization_test.py b/transformers/tests/optimization_test.py
similarity index 97%
rename from pytorch_transformers/tests/optimization_test.py
rename to transformers/tests/optimization_test.py
index c1c6270f32..84dbaca52a 100644
--- a/pytorch_transformers/tests/optimization_test.py
+++ b/transformers/tests/optimization_test.py
@@ -20,12 +20,12 @@ import unittest
 import os
 import pytest
 
-from pytorch_transformers import is_torch_available
+from transformers import is_torch_available
 
 if is_torch_available():
     import torch
 
-    from pytorch_transformers import (AdamW, ConstantLRSchedule, WarmupConstantSchedule,
+    from transformers import (AdamW, ConstantLRSchedule, WarmupConstantSchedule,
                                     WarmupCosineSchedule, WarmupCosineWithHardRestartsSchedule, WarmupLinearSchedule)
 else:
     pytestmark = pytest.mark.skip("Require Torch")
diff --git a/pytorch_transformers/tests/tokenization_auto_test.py b/transformers/tests/tokenization_auto_test.py
similarity index 88%
rename from pytorch_transformers/tests/tokenization_auto_test.py
rename to transformers/tests/tokenization_auto_test.py
index 7cee7ebc28..0f49ec75fb 100644
--- a/pytorch_transformers/tests/tokenization_auto_test.py
+++ b/transformers/tests/tokenization_auto_test.py
@@ -21,8 +21,8 @@ import shutil
 import pytest
 import logging
 
-from pytorch_transformers import AutoTokenizer, BertTokenizer, AutoTokenizer, GPT2Tokenizer
-from pytorch_transformers import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
+from transformers import AutoTokenizer, BertTokenizer, AutoTokenizer, GPT2Tokenizer
+from transformers import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
 
 
 class AutoTokenizerTest(unittest.TestCase):
diff --git a/pytorch_transformers/tests/tokenization_bert_test.py b/transformers/tests/tokenization_bert_test.py
similarity index 98%
rename from pytorch_transformers/tests/tokenization_bert_test.py
rename to transformers/tests/tokenization_bert_test.py
index 4cfdfed136..b70941f884 100644
--- a/pytorch_transformers/tests/tokenization_bert_test.py
+++ b/transformers/tests/tokenization_bert_test.py
@@ -18,7 +18,7 @@ import os
 import unittest
 from io import open
 
-from pytorch_transformers.tokenization_bert import (BasicTokenizer,
+from transformers.tokenization_bert import (BasicTokenizer,
                                                     BertTokenizer,
                                                     WordpieceTokenizer,
                                                     _is_control, _is_punctuation,
diff --git a/pytorch_transformers/tests/tokenization_distilbert_test.py b/transformers/tests/tokenization_distilbert_test.py
similarity index 95%
rename from pytorch_transformers/tests/tokenization_distilbert_test.py
rename to transformers/tests/tokenization_distilbert_test.py
index 674c78d104..64a88df99f 100644
--- a/pytorch_transformers/tests/tokenization_distilbert_test.py
+++ b/transformers/tests/tokenization_distilbert_test.py
@@ -18,7 +18,7 @@ import os
 import unittest
 from io import open
 
-from pytorch_transformers.tokenization_distilbert import (DistilBertTokenizer)
+from transformers.tokenization_distilbert import (DistilBertTokenizer)
 
 from .tokenization_tests_commons import CommonTestCases
 from .tokenization_bert_test import BertTokenizationTest
diff --git a/pytorch_transformers/tests/tokenization_gpt2_test.py b/transformers/tests/tokenization_gpt2_test.py
similarity index 97%
rename from pytorch_transformers/tests/tokenization_gpt2_test.py
rename to transformers/tests/tokenization_gpt2_test.py
index ecc8c7da67..a77cc75ec2 100644
--- a/pytorch_transformers/tests/tokenization_gpt2_test.py
+++ b/transformers/tests/tokenization_gpt2_test.py
@@ -19,7 +19,7 @@ import unittest
 import json
 from io import open
 
-from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer, VOCAB_FILES_NAMES
+from transformers.tokenization_gpt2 import GPT2Tokenizer, VOCAB_FILES_NAMES
 
 from .tokenization_tests_commons import CommonTestCases
 
diff --git a/pytorch_transformers/tests/tokenization_openai_test.py b/transformers/tests/tokenization_openai_test.py
similarity index 96%
rename from pytorch_transformers/tests/tokenization_openai_test.py
rename to transformers/tests/tokenization_openai_test.py
index 6b86416d2d..56aa219ddc 100644
--- a/pytorch_transformers/tests/tokenization_openai_test.py
+++ b/transformers/tests/tokenization_openai_test.py
@@ -18,7 +18,7 @@ import os
 import unittest
 import json
 
-from pytorch_transformers.tokenization_openai import OpenAIGPTTokenizer, VOCAB_FILES_NAMES
+from transformers.tokenization_openai import OpenAIGPTTokenizer, VOCAB_FILES_NAMES
 
 from .tokenization_tests_commons import CommonTestCases
 
diff --git a/pytorch_transformers/tests/tokenization_roberta_test.py b/transformers/tests/tokenization_roberta_test.py
similarity index 97%
rename from pytorch_transformers/tests/tokenization_roberta_test.py
rename to transformers/tests/tokenization_roberta_test.py
index f7807d0c80..f14b26a2e4 100644
--- a/pytorch_transformers/tests/tokenization_roberta_test.py
+++ b/transformers/tests/tokenization_roberta_test.py
@@ -19,7 +19,7 @@ import json
 import unittest
 from io import open
 
-from pytorch_transformers.tokenization_roberta import RobertaTokenizer, VOCAB_FILES_NAMES
+from transformers.tokenization_roberta import RobertaTokenizer, VOCAB_FILES_NAMES
 from .tokenization_tests_commons import CommonTestCases
 
 
diff --git a/pytorch_transformers/tests/tokenization_tests_commons.py b/transformers/tests/tokenization_tests_commons.py
similarity index 100%
rename from pytorch_transformers/tests/tokenization_tests_commons.py
rename to transformers/tests/tokenization_tests_commons.py
diff --git a/pytorch_transformers/tests/tokenization_transfo_xl_test.py b/transformers/tests/tokenization_transfo_xl_test.py
similarity index 94%
rename from pytorch_transformers/tests/tokenization_transfo_xl_test.py
rename to transformers/tests/tokenization_transfo_xl_test.py
index 563871f690..4e99484b0c 100644
--- a/pytorch_transformers/tests/tokenization_transfo_xl_test.py
+++ b/transformers/tests/tokenization_transfo_xl_test.py
@@ -19,11 +19,11 @@ import unittest
 import pytest
 from io import open
 
-from pytorch_transformers import is_torch_available
+from transformers import is_torch_available
 
 if is_torch_available():
     import torch
-    from pytorch_transformers.tokenization_transfo_xl import TransfoXLTokenizer, VOCAB_FILES_NAMES
+    from transformers.tokenization_transfo_xl import TransfoXLTokenizer, VOCAB_FILES_NAMES
 else:
     pytestmark = pytest.mark.skip("Require Torch")  # TODO: untangle Transfo-XL tokenizer from torch.load and torch.save
 
diff --git a/pytorch_transformers/tests/tokenization_utils_test.py b/transformers/tests/tokenization_utils_test.py
similarity index 93%
rename from pytorch_transformers/tests/tokenization_utils_test.py
rename to transformers/tests/tokenization_utils_test.py
index 26ec2d7a39..cf55982c8f 100644
--- a/pytorch_transformers/tests/tokenization_utils_test.py
+++ b/transformers/tests/tokenization_utils_test.py
@@ -19,8 +19,8 @@ from __future__ import print_function
 import unittest
 import six
 
-from pytorch_transformers import PreTrainedTokenizer
-from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer
+from transformers import PreTrainedTokenizer
+from transformers.tokenization_gpt2 import GPT2Tokenizer
 
 class TokenizerUtilsTest(unittest.TestCase):
     def check_tokenizer_from_pretrained(self, tokenizer_class):
diff --git a/pytorch_transformers/tests/tokenization_xlm_test.py b/transformers/tests/tokenization_xlm_test.py
similarity index 97%
rename from pytorch_transformers/tests/tokenization_xlm_test.py
rename to transformers/tests/tokenization_xlm_test.py
index 13fdb4a8bb..b1a71ede59 100644
--- a/pytorch_transformers/tests/tokenization_xlm_test.py
+++ b/transformers/tests/tokenization_xlm_test.py
@@ -18,7 +18,7 @@ import os
 import unittest
 import json
 
-from pytorch_transformers.tokenization_xlm import XLMTokenizer, VOCAB_FILES_NAMES
+from transformers.tokenization_xlm import XLMTokenizer, VOCAB_FILES_NAMES
 
 from .tokenization_tests_commons import CommonTestCases
 
diff --git a/pytorch_transformers/tests/tokenization_xlnet_test.py b/transformers/tests/tokenization_xlnet_test.py
similarity index 98%
rename from pytorch_transformers/tests/tokenization_xlnet_test.py
rename to transformers/tests/tokenization_xlnet_test.py
index a6e9f23fe7..f4418c7fe5 100644
--- a/pytorch_transformers/tests/tokenization_xlnet_test.py
+++ b/transformers/tests/tokenization_xlnet_test.py
@@ -17,7 +17,7 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 import os
 import unittest
 
-from pytorch_transformers.tokenization_xlnet import (XLNetTokenizer, SPIECE_UNDERLINE)
+from transformers.tokenization_xlnet import (XLNetTokenizer, SPIECE_UNDERLINE)
 
 from .tokenization_tests_commons import CommonTestCases
 
diff --git a/pytorch_transformers/tokenization_auto.py b/transformers/tokenization_auto.py
similarity index 96%
rename from pytorch_transformers/tokenization_auto.py
rename to transformers/tokenization_auto.py
index 889774b36c..504727dcc8 100644
--- a/pytorch_transformers/tokenization_auto.py
+++ b/transformers/tokenization_auto.py
@@ -30,7 +30,7 @@ from .tokenization_distilbert import DistilBertTokenizer
 logger = logging.getLogger(__name__)
 
 class AutoTokenizer(object):
-    r""":class:`~pytorch_transformers.AutoTokenizer` is a generic tokenizer class
+    r""":class:`~transformers.AutoTokenizer` is a generic tokenizer class
         that will be instantiated as one of the tokenizer classes of the library
         when created with the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)`
         class method.
@@ -75,7 +75,7 @@ class AutoTokenizer(object):
             pretrained_model_name_or_path: either:
 
                 - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~pytorch_transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``.
+                - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``.
                 - (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``.
 
             cache_dir: (`optional`) string:
@@ -90,7 +90,7 @@ class AutoTokenizer(object):
 
             inputs: (`optional`) positional arguments: will be passed to the Tokenizer ``__init__`` method.
 
-            kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~pytorch_transformers.PreTrainedTokenizer` for details.
+            kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~transformers.PreTrainedTokenizer` for details.
 
         Examples::
 
diff --git a/pytorch_transformers/tokenization_bert.py b/transformers/tokenization_bert.py
similarity index 99%
rename from pytorch_transformers/tokenization_bert.py
rename to transformers/tokenization_bert.py
index 225152e065..42163cb8ec 100644
--- a/pytorch_transformers/tokenization_bert.py
+++ b/transformers/tokenization_bert.py
@@ -103,7 +103,7 @@ def whitespace_tokenize(text):
 class BertTokenizer(PreTrainedTokenizer):
     r"""
     Constructs a BertTokenizer.
-    :class:`~pytorch_transformers.BertTokenizer` runs end-to-end tokenization: punctuation splitting + wordpiece
+    :class:`~transformers.BertTokenizer` runs end-to-end tokenization: punctuation splitting + wordpiece
 
     Args:
         vocab_file: Path to a one-wordpiece-per-line vocabulary file
diff --git a/pytorch_transformers/tokenization_distilbert.py b/transformers/tokenization_distilbert.py
similarity index 93%
rename from pytorch_transformers/tokenization_distilbert.py
rename to transformers/tokenization_distilbert.py
index 5a6d02f98d..dfa02926d8 100644
--- a/pytorch_transformers/tokenization_distilbert.py
+++ b/transformers/tokenization_distilbert.py
@@ -45,7 +45,7 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
 class DistilBertTokenizer(BertTokenizer):
     r"""
     Constructs a DistilBertTokenizer.
-    :class:`~pytorch_transformers.DistilBertTokenizer` is identical to BertTokenizer and runs end-to-end tokenization: punctuation splitting + wordpiece
+    :class:`~transformers.DistilBertTokenizer` is identical to BertTokenizer and runs end-to-end tokenization: punctuation splitting + wordpiece
 
     Args:
         vocab_file: Path to a one-wordpiece-per-line vocabulary file
diff --git a/pytorch_transformers/tokenization_gpt2.py b/transformers/tokenization_gpt2.py
similarity index 100%
rename from pytorch_transformers/tokenization_gpt2.py
rename to transformers/tokenization_gpt2.py
diff --git a/pytorch_transformers/tokenization_openai.py b/transformers/tokenization_openai.py
similarity index 100%
rename from pytorch_transformers/tokenization_openai.py
rename to transformers/tokenization_openai.py
diff --git a/pytorch_transformers/tokenization_roberta.py b/transformers/tokenization_roberta.py
similarity index 100%
rename from pytorch_transformers/tokenization_roberta.py
rename to transformers/tokenization_roberta.py
diff --git a/pytorch_transformers/tokenization_transfo_xl.py b/transformers/tokenization_transfo_xl.py
similarity index 100%
rename from pytorch_transformers/tokenization_transfo_xl.py
rename to transformers/tokenization_transfo_xl.py
diff --git a/pytorch_transformers/tokenization_utils.py b/transformers/tokenization_utils.py
similarity index 98%
rename from pytorch_transformers/tokenization_utils.py
rename to transformers/tokenization_utils.py
index ec5de3b772..e8ffff3cb9 100644
--- a/pytorch_transformers/tokenization_utils.py
+++ b/transformers/tokenization_utils.py
@@ -236,13 +236,13 @@ class PreTrainedTokenizer(object):
     @classmethod
     def from_pretrained(cls, *inputs, **kwargs):
         r"""
-        Instantiate a :class:`~pytorch_transformers.PreTrainedTokenizer` (or a derived class) from a predefined tokenizer.
+        Instantiate a :class:`~transformers.PreTrainedTokenizer` (or a derived class) from a predefined tokenizer.
 
         Args:
             pretrained_model_name_or_path: either:
 
                 - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~pytorch_transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``.
+                - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``.
                 - (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``.
 
             cache_dir: (`optional`) string:
@@ -257,7 +257,7 @@ class PreTrainedTokenizer(object):
 
             inputs: (`optional`) positional arguments: will be passed to the Tokenizer ``__init__`` method.
 
-            kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~pytorch_transformers.PreTrainedTokenizer` for details.
+            kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~transformers.PreTrainedTokenizer` for details.
 
         Examples::
 
@@ -432,7 +432,7 @@ class PreTrainedTokenizer(object):
             This won't save modifications other than (added tokens and special token mapping) you may have
             applied to the tokenizer after the instantion (e.g. modifying tokenizer.do_lower_case after creation).
 
-            This method make sure the full tokenizer can then be re-loaded using the :func:`~pytorch_transformers.PreTrainedTokenizer.from_pretrained` class method.
+            This method make sure the full tokenizer can then be re-loaded using the :func:`~transformers.PreTrainedTokenizer.from_pretrained` class method.
         """
         if not os.path.isdir(save_directory):
             logger.error("Saving directory ({}) should be a directory".format(save_directory))
@@ -469,7 +469,7 @@ class PreTrainedTokenizer(object):
         """ Save the tokenizer vocabulary to a directory. This method does *NOT* save added tokens
             and special token mappings.
 
-            Please use :func:`~pytorch_transformers.PreTrainedTokenizer.save_pretrained` `()` to save the full Tokenizer state if you want to reload it using the :func:`~pytorch_transformers.PreTrainedTokenizer.from_pretrained` class method.
+            Please use :func:`~transformers.PreTrainedTokenizer.save_pretrained` `()` to save the full Tokenizer state if you want to reload it using the :func:`~transformers.PreTrainedTokenizer.from_pretrained` class method.
         """
         raise NotImplementedError
 
@@ -916,7 +916,7 @@ class PreTrainedTokenizer(object):
 
         # To avoid mixing byte-level and unicode for byte-level BPT
         # we need to build string separatly for added tokens and byte-level tokens
-        # cf. https://github.com/huggingface/pytorch-transformers/issues/1133
+        # cf. https://github.com/huggingface/transformers/issues/1133
         sub_texts = []
         current_sub_text = []
         for token in filtered_tokens:
diff --git a/pytorch_transformers/tokenization_xlm.py b/transformers/tokenization_xlm.py
similarity index 100%
rename from pytorch_transformers/tokenization_xlm.py
rename to transformers/tokenization_xlm.py
diff --git a/pytorch_transformers/tokenization_xlnet.py b/transformers/tokenization_xlnet.py
similarity index 100%
rename from pytorch_transformers/tokenization_xlnet.py
rename to transformers/tokenization_xlnet.py