diff --git a/.circleci/config.yml b/.circleci/config.yml
index 858ca001d6..ac23723f98 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -1,7 +1,7 @@
 version: 2
 jobs:
     build_py3:
-        working_directory: ~/pytorch-pretrained-BERT
+        working_directory: ~/pytorch-transformers
         docker:
             - image: circleci/python:3.5
         steps:
@@ -10,11 +10,11 @@ jobs:
             - run: sudo pip install pytest codecov pytest-cov
             - run: sudo pip install spacy ftfy==4.4.3
             - run: sudo python -m spacy download en
-            - run: python -m pytest -sv ./pytorch_pretrained_bert/tests/ --cov
+            - run: python -m pytest -sv ./pytorch_transformers/tests/ --cov
             - run: codecov
         parallelism: 4
     build_py2:
-        working_directory: ~/pytorch-pretrained-BERT
+        working_directory: ~/pytorch-transformers
         docker:
             - image: circleci/python:2.7
         steps:
@@ -23,7 +23,7 @@ jobs:
             - run: sudo pip install pytest codecov pytest-cov
             - run: sudo pip install spacy ftfy==4.4.3
             - run: sudo python -m spacy download en
-            - run: python -m pytest -sv ./pytorch_pretrained_bert/tests/ --cov
+            - run: python -m pytest -sv ./pytorch_transformers/tests/ --cov
             - run: codecov
         parallelism: 4
 workflows:
diff --git a/.coveragerc b/.coveragerc
index fe05dda9a8..9b8c40ecf1 100644
--- a/.coveragerc
+++ b/.coveragerc
@@ -1,5 +1,5 @@
 [run]
-source=pytorch_pretrained_bert
+source=pytorch_transformers
 [report]
 exclude_lines =
     pragma: no cover
diff --git a/README.md b/README.md
index a5234bd9ba..b1e80edc89 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # PyTorch Pretrained BERT: The Big & Extending Repository of pretrained Transformers
 
-[![CircleCI](https://circleci.com/gh/huggingface/pytorch-pretrained-BERT.svg?style=svg)](https://circleci.com/gh/huggingface/pytorch-pretrained-BERT)
+[![CircleCI](https://circleci.com/gh/huggingface/pytorch-transformers.svg?style=svg)](https://circleci.com/gh/huggingface/pytorch-transformers)
 
 This repository contains op-for-op PyTorch reimplementations, pre-trained models and fine-tuning examples for:
 
@@ -47,7 +47,7 @@ This repo was tested on Python 2.7 and 3.5+ (examples are tested only on python
 
 PyTorch pretrained bert can be installed by pip as follows:
 ```bash
-pip install pytorch-pretrained-bert
+pip install pytorch-transformers
 ```
 
 If you want to reproduce the original tokenization process of the `OpenAI GPT` paper, you will need to install `ftfy` (limit to version 4.4.3 if you are using Python 2) and `SpaCy` :
@@ -73,7 +73,7 @@ python -m spacy download en
 
 Again, if you don't install `ftfy` and `SpaCy`, the `OpenAI GPT` tokenizer will default to tokenize using BERT's `BasicTokenizer` followed by Byte-Pair Encoding (which should be fine for most usage).
 
-A series of tests is included in the [tests folder](https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/tests) and can be run using `pytest` (install pytest if needed: `pip install pytest`).
+A series of tests is included in the [tests folder](https://github.com/huggingface/pytorch-transformers/tree/master/tests) and can be run using `pytest` (install pytest if needed: `pip install pytest`).
 
 You can run the tests with the command:
 ```bash
@@ -84,51 +84,51 @@ python -m pytest -sv tests/
 
 This package comprises the following classes that can be imported in Python and are detailed in the [Doc](#doc) section of this readme:
 
-- Eight **Bert** PyTorch models (`torch.nn.Module`) with pre-trained weights (in the [`modeling.py`](./pytorch_pretrained_bert/modeling.py) file):
-  - [`BertModel`](./pytorch_pretrained_bert/modeling.py#L639) - raw BERT Transformer model (**fully pre-trained**),
-  - [`BertForMaskedLM`](./pytorch_pretrained_bert/modeling.py#L793) - BERT Transformer with the pre-trained masked language modeling head on top (**fully pre-trained**),
-  - [`BertForNextSentencePrediction`](./pytorch_pretrained_bert/modeling.py#L854) - BERT Transformer with the pre-trained next sentence prediction classifier on top  (**fully pre-trained**),
-  - [`BertForPreTraining`](./pytorch_pretrained_bert/modeling.py#L722) - BERT Transformer with masked language modeling head and next sentence prediction classifier on top (**fully pre-trained**),
-  - [`BertForSequenceClassification`](./pytorch_pretrained_bert/modeling.py#L916) - BERT Transformer with a sequence classification head on top (BERT Transformer is **pre-trained**, the sequence classification head **is only initialized and has to be trained**),
-  - [`BertForMultipleChoice`](./pytorch_pretrained_bert/modeling.py#L982) - BERT Transformer with a multiple choice head on top (used for task like Swag) (BERT Transformer is **pre-trained**, the multiple choice classification head **is only initialized and has to be trained**),
-  - [`BertForTokenClassification`](./pytorch_pretrained_bert/modeling.py#L1051) - BERT Transformer with a token classification head on top (BERT Transformer is **pre-trained**, the token classification head **is only initialized and has to be trained**),
-  - [`BertForQuestionAnswering`](./pytorch_pretrained_bert/modeling.py#L1124) - BERT Transformer with a token classification head on top (BERT Transformer is **pre-trained**, the token classification head **is only initialized and has to be trained**).
+- Eight **Bert** PyTorch models (`torch.nn.Module`) with pre-trained weights (in the [`modeling.py`](./pytorch_transformers/modeling.py) file):
+  - [`BertModel`](./pytorch_transformers/modeling.py#L639) - raw BERT Transformer model (**fully pre-trained**),
+  - [`BertForMaskedLM`](./pytorch_transformers/modeling.py#L793) - BERT Transformer with the pre-trained masked language modeling head on top (**fully pre-trained**),
+  - [`BertForNextSentencePrediction`](./pytorch_transformers/modeling.py#L854) - BERT Transformer with the pre-trained next sentence prediction classifier on top  (**fully pre-trained**),
+  - [`BertForPreTraining`](./pytorch_transformers/modeling.py#L722) - BERT Transformer with masked language modeling head and next sentence prediction classifier on top (**fully pre-trained**),
+  - [`BertForSequenceClassification`](./pytorch_transformers/modeling.py#L916) - BERT Transformer with a sequence classification head on top (BERT Transformer is **pre-trained**, the sequence classification head **is only initialized and has to be trained**),
+  - [`BertForMultipleChoice`](./pytorch_transformers/modeling.py#L982) - BERT Transformer with a multiple choice head on top (used for task like Swag) (BERT Transformer is **pre-trained**, the multiple choice classification head **is only initialized and has to be trained**),
+  - [`BertForTokenClassification`](./pytorch_transformers/modeling.py#L1051) - BERT Transformer with a token classification head on top (BERT Transformer is **pre-trained**, the token classification head **is only initialized and has to be trained**),
+  - [`BertForQuestionAnswering`](./pytorch_transformers/modeling.py#L1124) - BERT Transformer with a token classification head on top (BERT Transformer is **pre-trained**, the token classification head **is only initialized and has to be trained**).
 
-- Three **OpenAI GPT** PyTorch models (`torch.nn.Module`) with pre-trained weights (in the [`modeling_openai.py`](./pytorch_pretrained_bert/modeling_openai.py) file):
-  - [`OpenAIGPTModel`](./pytorch_pretrained_bert/modeling_openai.py#L536) - raw OpenAI GPT Transformer model (**fully pre-trained**),
-  - [`OpenAIGPTLMHeadModel`](./pytorch_pretrained_bert/modeling_openai.py#L643) - OpenAI GPT Transformer with the tied language modeling head on top (**fully pre-trained**),
-  - [`OpenAIGPTDoubleHeadsModel`](./pytorch_pretrained_bert/modeling_openai.py#L722) - OpenAI GPT Transformer with the tied language modeling head and a multiple choice classification head on top (OpenAI GPT Transformer is **pre-trained**, the multiple choice classification head **is only initialized and has to be trained**),
+- Three **OpenAI GPT** PyTorch models (`torch.nn.Module`) with pre-trained weights (in the [`modeling_openai.py`](./pytorch_transformers/modeling_openai.py) file):
+  - [`OpenAIGPTModel`](./pytorch_transformers/modeling_openai.py#L536) - raw OpenAI GPT Transformer model (**fully pre-trained**),
+  - [`OpenAIGPTLMHeadModel`](./pytorch_transformers/modeling_openai.py#L643) - OpenAI GPT Transformer with the tied language modeling head on top (**fully pre-trained**),
+  - [`OpenAIGPTDoubleHeadsModel`](./pytorch_transformers/modeling_openai.py#L722) - OpenAI GPT Transformer with the tied language modeling head and a multiple choice classification head on top (OpenAI GPT Transformer is **pre-trained**, the multiple choice classification head **is only initialized and has to be trained**),
 
-- Two **Transformer-XL** PyTorch models (`torch.nn.Module`) with pre-trained weights (in the [`modeling_transfo_xl.py`](./pytorch_pretrained_bert/modeling_transfo_xl.py) file):
-  - [`TransfoXLModel`](./pytorch_pretrained_bert/modeling_transfo_xl.py#L983) - Transformer-XL model which outputs the last hidden state and memory cells (**fully pre-trained**),
-  - [`TransfoXLLMHeadModel`](./pytorch_pretrained_bert/modeling_transfo_xl.py#L1260) - Transformer-XL with the tied adaptive softmax head on top for language modeling which outputs the logits/loss and memory cells (**fully pre-trained**),
+- Two **Transformer-XL** PyTorch models (`torch.nn.Module`) with pre-trained weights (in the [`modeling_transfo_xl.py`](./pytorch_transformers/modeling_transfo_xl.py) file):
+  - [`TransfoXLModel`](./pytorch_transformers/modeling_transfo_xl.py#L983) - Transformer-XL model which outputs the last hidden state and memory cells (**fully pre-trained**),
+  - [`TransfoXLLMHeadModel`](./pytorch_transformers/modeling_transfo_xl.py#L1260) - Transformer-XL with the tied adaptive softmax head on top for language modeling which outputs the logits/loss and memory cells (**fully pre-trained**),
 
-- Three **OpenAI GPT-2** PyTorch models (`torch.nn.Module`) with pre-trained weights (in the [`modeling_gpt2.py`](./pytorch_pretrained_bert/modeling_gpt2.py) file):
-  - [`GPT2Model`](./pytorch_pretrained_bert/modeling_gpt2.py#L479) - raw OpenAI GPT-2 Transformer model (**fully pre-trained**),
-  - [`GPT2LMHeadModel`](./pytorch_pretrained_bert/modeling_gpt2.py#L559) - OpenAI GPT-2 Transformer with the tied language modeling head on top (**fully pre-trained**),
-  - [`GPT2DoubleHeadsModel`](./pytorch_pretrained_bert/modeling_gpt2.py#L624) - OpenAI GPT-2 Transformer with the tied language modeling head and a multiple choice classification head on top (OpenAI GPT-2 Transformer is **pre-trained**, the multiple choice classification head **is only initialized and has to be trained**),
+- Three **OpenAI GPT-2** PyTorch models (`torch.nn.Module`) with pre-trained weights (in the [`modeling_gpt2.py`](./pytorch_transformers/modeling_gpt2.py) file):
+  - [`GPT2Model`](./pytorch_transformers/modeling_gpt2.py#L479) - raw OpenAI GPT-2 Transformer model (**fully pre-trained**),
+  - [`GPT2LMHeadModel`](./pytorch_transformers/modeling_gpt2.py#L559) - OpenAI GPT-2 Transformer with the tied language modeling head on top (**fully pre-trained**),
+  - [`GPT2DoubleHeadsModel`](./pytorch_transformers/modeling_gpt2.py#L624) - OpenAI GPT-2 Transformer with the tied language modeling head and a multiple choice classification head on top (OpenAI GPT-2 Transformer is **pre-trained**, the multiple choice classification head **is only initialized and has to be trained**),
 
-- Tokenizers for **BERT** (using word-piece) (in the [`tokenization.py`](./pytorch_pretrained_bert/tokenization.py) file):
+- Tokenizers for **BERT** (using word-piece) (in the [`tokenization.py`](./pytorch_transformers/tokenization.py) file):
   - `BasicTokenizer` - basic tokenization (punctuation splitting, lower casing, etc.),
   - `WordpieceTokenizer` - WordPiece tokenization,
   - `BertTokenizer` - perform end-to-end tokenization, i.e. basic tokenization followed by WordPiece tokenization.
 
-- Tokenizer for **OpenAI GPT** (using Byte-Pair-Encoding) (in the [`tokenization_openai.py`](./pytorch_pretrained_bert/tokenization_openai.py) file):
+- Tokenizer for **OpenAI GPT** (using Byte-Pair-Encoding) (in the [`tokenization_openai.py`](./pytorch_transformers/tokenization_openai.py) file):
   - `OpenAIGPTTokenizer` - perform Byte-Pair-Encoding (BPE) tokenization.
 
-- Tokenizer for **Transformer-XL** (word tokens ordered by frequency for adaptive softmax) (in the [`tokenization_transfo_xl.py`](./pytorch_pretrained_bert/tokenization_transfo_xl.py) file):
+- Tokenizer for **Transformer-XL** (word tokens ordered by frequency for adaptive softmax) (in the [`tokenization_transfo_xl.py`](./pytorch_transformers/tokenization_transfo_xl.py) file):
   - `OpenAIGPTTokenizer` - perform word tokenization and can order words by frequency in a corpus for use in an adaptive softmax.
 
-- Tokenizer for **OpenAI GPT-2** (using byte-level Byte-Pair-Encoding) (in the [`tokenization_gpt2.py`](./pytorch_pretrained_bert/tokenization_gpt2.py) file):
+- Tokenizer for **OpenAI GPT-2** (using byte-level Byte-Pair-Encoding) (in the [`tokenization_gpt2.py`](./pytorch_transformers/tokenization_gpt2.py) file):
   - `GPT2Tokenizer` - perform byte-level Byte-Pair-Encoding (BPE) tokenization.
 
-- Optimizer for **BERT** (in the [`optimization.py`](./pytorch_pretrained_bert/optimization.py) file):
+- Optimizer for **BERT** (in the [`optimization.py`](./pytorch_transformers/optimization.py) file):
   - `BertAdam` - Bert version of Adam algorithm with weight decay fix, warmup and linear decay of the learning rate.
 
-- Optimizer for **OpenAI GPT** (in the [`optimization_openai.py`](./pytorch_pretrained_bert/optimization_openai.py) file):
+- Optimizer for **OpenAI GPT** (in the [`optimization_openai.py`](./pytorch_transformers/optimization_openai.py) file):
   - `OpenAIAdam` - OpenAI GPT version of Adam algorithm with weight decay fix, warmup and linear decay of the learning rate.
 
-- Configuration classes for BERT, OpenAI GPT and Transformer-XL (in the respective [`modeling.py`](./pytorch_pretrained_bert/modeling.py), [`modeling_openai.py`](./pytorch_pretrained_bert/modeling_openai.py), [`modeling_transfo_xl.py`](./pytorch_pretrained_bert/modeling_transfo_xl.py) files):
+- Configuration classes for BERT, OpenAI GPT and Transformer-XL (in the respective [`modeling.py`](./pytorch_transformers/modeling.py), [`modeling_openai.py`](./pytorch_transformers/modeling_openai.py), [`modeling_transfo_xl.py`](./pytorch_transformers/modeling_transfo_xl.py) files):
   - `BertConfig` - Configuration class to store the configuration of a `BertModel` with utilities to read and write from JSON configuration files.
   - `OpenAIGPTConfig` - Configuration class to store the configuration of a `OpenAIGPTModel` with utilities to read and write from JSON configuration files.
   - `GPT2Config` - Configuration class to store the configuration of a `GPT2Model` with utilities to read and write from JSON configuration files.
@@ -175,7 +175,7 @@ First let's prepare a tokenized input with `BertTokenizer`
 
 ```python
 import torch
-from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
+from pytorch_transformers import BertTokenizer, BertModel, BertForMaskedLM
 
 # OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
 import logging
@@ -252,7 +252,7 @@ First let's prepare a tokenized input with `OpenAIGPTTokenizer`
 
 ```python
 import torch
-from pytorch_pretrained_bert import OpenAIGPTTokenizer, OpenAIGPTModel, OpenAIGPTLMHeadModel
+from pytorch_transformers import OpenAIGPTTokenizer, OpenAIGPTModel, OpenAIGPTLMHeadModel
 
 # OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
 import logging
@@ -339,7 +339,7 @@ First let's prepare a tokenized input with `TransfoXLTokenizer`
 
 ```python
 import torch
-from pytorch_pretrained_bert import TransfoXLTokenizer, TransfoXLModel, TransfoXLLMHeadModel
+from pytorch_transformers import TransfoXLTokenizer, TransfoXLModel, TransfoXLLMHeadModel
 
 # OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
 import logging
@@ -414,7 +414,7 @@ First let's prepare a tokenized input with `GPT2Tokenizer`
 
 ```python
 import torch
-from pytorch_pretrained_bert import GPT2Tokenizer, GPT2Model, GPT2LMHeadModel
+from pytorch_transformers import GPT2Tokenizer, GPT2Model, GPT2LMHeadModel
 
 # OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
 import logging
@@ -552,7 +552,7 @@ where
     - `bert_config.json` or `openai_gpt_config.json` a configuration file for the model, and
     - `pytorch_model.bin` a PyTorch dump of a pre-trained instance of `BertForPreTraining`, `OpenAIGPTModel`, `TransfoXLModel`, `GPT2LMHeadModel` (saved with the usual `torch.save()`)
 
-  If `PRE_TRAINED_MODEL_NAME_OR_PATH` is a shortcut name, the pre-trained weights will be downloaded from AWS S3 (see the links [here](pytorch_pretrained_bert/modeling.py)) and stored in a cache folder to avoid future download (the cache folder can be found at `~/.pytorch_pretrained_bert/`).
+  If `PRE_TRAINED_MODEL_NAME_OR_PATH` is a shortcut name, the pre-trained weights will be downloaded from AWS S3 (see the links [here](pytorch_transformers/modeling.py)) and stored in a cache folder to avoid future download (the cache folder can be found at `~/.pytorch_transformers/`).
 
 - `cache_dir` can be an optional path to a specific directory to download and cache the pre-trained model weights. This option is useful in particular when you are using distributed training: to avoid concurrent access to the same weights you can set for example `cache_dir='./pretrained_model_{}'.format(args.local_rank)` (see the section on distributed training for more information).
 - `from_tf`: should we load the weights from a locally saved TensorFlow checkpoint
@@ -586,19 +586,19 @@ model = GPT2Model.from_pretrained('gpt2')
 
 #### Cache directory
 
-`pytorch_pretrained_bert` save the pretrained weights in a cache directory which is located at (in this order of priority):
+`pytorch_transformers` save the pretrained weights in a cache directory which is located at (in this order of priority):
 
 - `cache_dir` optional arguments to the `from_pretrained()` method (see above),
 - shell environment variable `PYTORCH_PRETRAINED_BERT_CACHE`,
-- PyTorch cache home + `/pytorch_pretrained_bert/`
+- PyTorch cache home + `/pytorch_transformers/`
   where PyTorch cache home is defined by (in this order):
   - shell environment variable `ENV_TORCH_HOME`
   - shell environment variable `ENV_XDG_CACHE_HOME` + `/torch/`)
   - default: `~/.cache/torch/`
 
-Usually, if you don't set any specific environment variable, `pytorch_pretrained_bert` cache will be at `~/.cache/torch/pytorch_pretrained_bert/`.
+Usually, if you don't set any specific environment variable, `pytorch_transformers` cache will be at `~/.cache/torch/pytorch_transformers/`.
 
-You can alsways safely delete `pytorch_pretrained_bert` cache but the pretrained model weights and vocabulary files wil have to be re-downloaded from our S3.
+You can alsways safely delete `pytorch_transformers` cache but the pretrained model weights and vocabulary files wil have to be re-downloaded from our S3.
 
 ### Serialization best-practices
 
@@ -621,7 +621,7 @@ The *default filenames* of these files are as follow:
 Here is the recommended way of saving the model, configuration and vocabulary to an `output_dir` directory and reloading the model and tokenizer afterwards:
 
 ```python
-from pytorch_pretrained_bert import WEIGHTS_NAME, CONFIG_NAME
+from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
 
 output_dir = "./models/"
 
@@ -719,7 +719,7 @@ The model can be instantiated with the following arguments:
 The inputs and output are **identical to the TensorFlow model inputs and outputs**.
 
 We detail them here. This model takes as *inputs*:
-[`modeling.py`](./pytorch_pretrained_bert/modeling.py)
+[`modeling.py`](./pytorch_transformers/modeling.py)
 - `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts [`run_bert_extract_features.py`](./examples/run_bert_extract_features.py), [`run_bert_classifier.py`](./examples/run_bert_classifier.py) and [`run_bert_squad.py`](./examples/run_bert_squad.py)), and
 - `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to a `sentence B` token (see BERT paper for more details).
 - `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [0, 1]. It's a mask to be used if some input sequence lengths are smaller than the max input sequence length of the current batch. It's the mask that we typically use for attention when a batch has varying length sentences.
@@ -852,7 +852,7 @@ The model can be instantiated with the following arguments:
 The inputs and output are **identical to the TensorFlow model inputs and outputs**.
 
 We detail them here. This model takes as *inputs*:
-[`modeling_openai.py`](./pytorch_pretrained_bert/modeling_openai.py)
+[`modeling_openai.py`](./pytorch_transformers/modeling_openai.py)
 - `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length] were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, total_tokens_embeddings[
 - `position_ids`: an optional torch.LongTensor with the same shape as input_ids
     with the position indices (selected in the range [0, config.n_positions - 1[.
@@ -905,7 +905,7 @@ Transformer XL use a relative positioning with sinusiodal patterns and adaptive
 - the tokens in the vocabulary have to be sorted to decreasing frequency.
 
 This model takes as *inputs*:
-[`modeling_transfo_xl.py`](./pytorch_pretrained_bert/modeling_transfo_xl.py)
+[`modeling_transfo_xl.py`](./pytorch_transformers/modeling_transfo_xl.py)
 - `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] with the token indices selected in the range [0, self.config.n_token[
 - `mems`: an optional memory of hidden states from previous forward passes as a list (num layers) of hidden states at the entry of each layer. Each hidden states has shape [self.config.mem_len, bsz, self.config.d_model]. Note that the first two dimensions are transposed in `mems` with regards to `input_ids`.
 
@@ -952,7 +952,7 @@ The model can be instantiated with the following arguments:
 The inputs and output are **identical to the TensorFlow model inputs and outputs**.
 
 We detail them here. This model takes as *inputs*:
-[`modeling_gpt2.py`](./pytorch_pretrained_bert/modeling_gpt2.py)
+[`modeling_gpt2.py`](./pytorch_transformers/modeling_gpt2.py)
 - `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length] were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, vocab_size[
 - `position_ids`: an optional torch.LongTensor with the same shape as input_ids
     with the position indices (selected in the range [0, config.n_positions - 1[.
@@ -1020,7 +1020,7 @@ and three methods:
 - `convert_ids_to_tokens(tokens)`: convert a list of `int` indices in a list of `str` tokens in the vocabulary.
 - `save_vocabulary(directory_path)`: save the vocabulary file to `directory_path`. Return the path to the saved vocabulary file: `vocab_file_path`. The vocabulary can be reloaded with `BertTokenizer.from_pretrained('vocab_file_path')` or `BertTokenizer.from_pretrained('directory_path')`.
 
-Please refer to the doc strings and code in [`tokenization.py`](./pytorch_pretrained_bert/tokenization.py) for the details of the `BasicTokenizer` and `WordpieceTokenizer` classes. In general it is recommended to use `BertTokenizer` unless you know what you are doing.
+Please refer to the doc strings and code in [`tokenization.py`](./pytorch_transformers/tokenization.py) for the details of the `BasicTokenizer` and `WordpieceTokenizer` classes. In general it is recommended to use `BertTokenizer` unless you know what you are doing.
 
 #### `OpenAIGPTTokenizer`
 
@@ -1043,7 +1043,7 @@ and five methods:
 - `decode(ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)`: decode a list of `int` indices in a string and do some post-processing if needed: (i) remove special tokens from the output and (ii) clean up tokenization spaces.
 - `save_vocabulary(directory_path)`: save the vocabulary, merge and special tokens files to `directory_path`. Return the path to the three files: `vocab_file_path`, `merge_file_path`, `special_tokens_file_path`. The vocabulary can be reloaded with `OpenAIGPTTokenizer.from_pretrained('directory_path')`.
 
-Please refer to the doc strings and code in [`tokenization_openai.py`](./pytorch_pretrained_bert/tokenization_openai.py) for the details of the `OpenAIGPTTokenizer`.
+Please refer to the doc strings and code in [`tokenization_openai.py`](./pytorch_transformers/tokenization_openai.py) for the details of the `OpenAIGPTTokenizer`.
 
 #### `TransfoXLTokenizer`
 
@@ -1051,7 +1051,7 @@ Please refer to the doc strings and code in [`tokenization_openai.py`](./pytorch
 
 The API is similar to the API of `BertTokenizer` (see above).
 
-Please refer to the doc strings and code in [`tokenization_transfo_xl.py`](./pytorch_pretrained_bert/tokenization_transfo_xl.py) for the details of these additional methods in `TransfoXLTokenizer`.
+Please refer to the doc strings and code in [`tokenization_transfo_xl.py`](./pytorch_transformers/tokenization_transfo_xl.py) for the details of these additional methods in `TransfoXLTokenizer`.
 
 #### `GPT2Tokenizer`
 
@@ -1073,7 +1073,7 @@ and two methods:
 - `decode(tokens)`: convert back a list of `int` tokens in a `str`.
 - `save_vocabulary(directory_path)`: save the vocabulary, merge and special tokens files to `directory_path`. Return the path to the three files: `vocab_file_path`, `merge_file_path`, `special_tokens_file_path`. The vocabulary can be reloaded with `OpenAIGPTTokenizer.from_pretrained('directory_path')`.
 
-Please refer to [`tokenization_gpt2.py`](./pytorch_pretrained_bert/tokenization_gpt2.py) for more details on the `GPT2Tokenizer`.
+Please refer to [`tokenization_gpt2.py`](./pytorch_transformers/tokenization_gpt2.py) for more details on the `GPT2Tokenizer`.
 
 ### Optimizers
 
@@ -1155,7 +1155,7 @@ Here is how to use these techniques in our scripts:
 - **Distributed training**: Distributed training can be activated by supplying an integer greater or equal to 0 to the `--local_rank` argument (see below).
 - **16-bits training**: 16-bits training, also called mixed-precision training, can reduce the memory requirement of your model on the GPU by using half-precision training, basically allowing to double the batch size. If you have a recent GPU (starting from NVIDIA Volta architecture) you should see no decrease in speed. A good introduction to Mixed precision training can be found [here](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) and a full documentation is [here](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html). In our scripts, this option can be activated by setting the `--fp16` flag and you can play with loss scaling using the `--loss_scale` flag (see the previously linked documentation for details on loss scaling). The loss scale can be zero in which case the scale is dynamically adjusted or a positive power of two in which case the scaling is static.
 
-To use 16-bits training and distributed training, you need to install NVIDIA's apex extension [as detailed here](https://github.com/nvidia/apex). You will find more information regarding the internals of `apex` and how to use `apex` in [the doc and the associated repository](https://github.com/nvidia/apex). The results of the tests performed on pytorch-BERT by the NVIDIA team (and my trials at reproducing them) can be consulted in [the relevant PR of the present repository](https://github.com/huggingface/pytorch-pretrained-BERT/pull/116).
+To use 16-bits training and distributed training, you need to install NVIDIA's apex extension [as detailed here](https://github.com/nvidia/apex). You will find more information regarding the internals of `apex` and how to use `apex` in [the doc and the associated repository](https://github.com/nvidia/apex). The results of the tests performed on pytorch-BERT by the NVIDIA team (and my trials at reproducing them) can be consulted in [the relevant PR of the present repository](https://github.com/huggingface/pytorch-transformers/pull/116).
 
 Note: To use *Distributed Training*, you will need to run one training script on each of your machines. This can be done for example by running the following command on each server (see [the above mentioned blog post]((https://medium.com/huggingface/training-larger-batches-practical-tips-on-1-gpu-multi-gpu-distributed-setups-ec88c3e51255)) for more details):
 ```bash
@@ -1660,7 +1660,7 @@ To help you understand and use these features, we have added a specific example
 
 ## Notebooks
 
-We include [three Jupyter Notebooks](https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/notebooks) that can be used to check that the predictions of the PyTorch model are identical to the predictions of the original TensorFlow model.
+We include [three Jupyter Notebooks](https://github.com/huggingface/pytorch-transformers/tree/master/notebooks) that can be used to check that the predictions of the PyTorch model are identical to the predictions of the original TensorFlow model.
 
 - The first NoteBook ([Comparing-TF-and-PT-models.ipynb](./notebooks/Comparing-TF-and-PT-models.ipynb)) extracts the hidden states of a full sequence on each layers of the TensorFlow and the PyTorch models and computes the standard deviation between them. In the given example, we get a standard deviation of 1.5e-7 to 9e-7 on the various hidden state of the models.
 
@@ -1676,7 +1676,7 @@ A command-line interface is provided to convert a TensorFlow checkpoint in a PyT
 
 ### BERT
 
-You can convert any TensorFlow checkpoint for BERT (in particular [the pre-trained models released by Google](https://github.com/google-research/bert#pre-trained-models)) in a PyTorch save file by using the [`convert_tf_checkpoint_to_pytorch.py`](./pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py ) script.
+You can convert any TensorFlow checkpoint for BERT (in particular [the pre-trained models released by Google](https://github.com/google-research/bert#pre-trained-models)) in a PyTorch save file by using the [`convert_tf_checkpoint_to_pytorch.py`](./pytorch_transformers/convert_tf_checkpoint_to_pytorch.py ) script.
 
 This CLI takes as input a TensorFlow checkpoint (three files starting with `bert_model.ckpt`) and the associated configuration file (`bert_config.json`), and creates a PyTorch model for this configuration, loads the weights from the TensorFlow checkpoint in the PyTorch model and saves the resulting model in a standard PyTorch save file that can be imported using `torch.load()` (see examples in [`run_bert_extract_features.py`](./examples/run_bert_extract_features.py), [`run_bert_classifier.py`](./examples/run_bert_classifier.py) and [`run_bert_squad.py`](./examples/run_bert_squad.py)).
 
@@ -1689,7 +1689,7 @@ Here is an example of the conversion process for a pre-trained `BERT-Base Uncase
 ```shell
 export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12
 
-pytorch_pretrained_bert bert \
+pytorch_transformers bert \
   $BERT_BASE_DIR/bert_model.ckpt \
   $BERT_BASE_DIR/bert_config.json \
   $BERT_BASE_DIR/pytorch_model.bin
@@ -1704,7 +1704,7 @@ Here is an example of the conversion process for a pre-trained OpenAI GPT model,
 ```shell
 export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights
 
-pytorch_pretrained_bert gpt \
+pytorch_transformers gpt \
   $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \
   $PYTORCH_DUMP_OUTPUT \
   [OPENAI_GPT_CONFIG]
@@ -1717,7 +1717,7 @@ Here is an example of the conversion process for a pre-trained Transformer-XL mo
 ```shell
 export TRANSFO_XL_CHECKPOINT_FOLDER_PATH=/path/to/transfo/xl/checkpoint
 
-pytorch_pretrained_bert transfo_xl \
+pytorch_transformers transfo_xl \
   $TRANSFO_XL_CHECKPOINT_FOLDER_PATH \
   $PYTORCH_DUMP_OUTPUT \
   [TRANSFO_XL_CONFIG]
@@ -1730,7 +1730,7 @@ Here is an example of the conversion process for a pre-trained OpenAI's GPT-2 mo
 ```shell
 export GPT2_DIR=/path/to/gpt2/checkpoint
 
-pytorch_pretrained_bert gpt2 \
+pytorch_transformers gpt2 \
   $GPT2_DIR/model.ckpt \
   $PYTORCH_DUMP_OUTPUT \
   [GPT2_CONFIG]
@@ -1744,7 +1744,7 @@ Here is an example of the conversion process for a pre-trained XLNet model, fine
 export TRANSFO_XL_CHECKPOINT_PATH=/path/to/xlnet/checkpoint
 export TRANSFO_XL_CONFIG_PATH=/path/to/xlnet/config
 
-pytorch_pretrained_bert xlnet \
+pytorch_transformers xlnet \
   $TRANSFO_XL_CHECKPOINT_PATH \
   $TRANSFO_XL_CONFIG_PATH \
   $PYTORCH_DUMP_OUTPUT \
diff --git a/docker/Dockerfile b/docker/Dockerfile
index e47eb548f9..1a6c6f06f9 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -2,6 +2,6 @@ FROM pytorch/pytorch:latest
 
 RUN git clone https://github.com/NVIDIA/apex.git && cd apex && python setup.py install --cuda_ext --cpp_ext
 
-RUN pip install pytorch-pretrained-bert
+RUN pip install pytorch_transformers
 
 WORKDIR /workspace
\ No newline at end of file
diff --git a/examples/bertology.py b/examples/bertology.py
index 6f7f7c9592..096b1b44fc 100644
--- a/examples/bertology.py
+++ b/examples/bertology.py
@@ -12,7 +12,7 @@ from torch.utils.data import DataLoader, SequentialSampler, TensorDataset, Subse
 from torch.utils.data.distributed import DistributedSampler
 from torch.nn import CrossEntropyLoss, MSELoss
 
-from pytorch_pretrained_bert import BertForSequenceClassification, BertTokenizer
+from pytorch_transformers import BertForSequenceClassification, BertTokenizer
 
 from utils_glue import processors, output_modes, convert_examples_to_features, compute_metrics
 
diff --git a/examples/generation_xlnet.py b/examples/generation_xlnet.py
index e54f6a365f..fe3610cfd1 100644
--- a/examples/generation_xlnet.py
+++ b/examples/generation_xlnet.py
@@ -1,6 +1,6 @@
 import torch
 from torch.nn import functional as F
-from pytorch_pretrained_bert import XLNetModel, XLNetLMHeadModel, XLNetTokenizer
+from pytorch_transformers import XLNetModel, XLNetLMHeadModel, XLNetTokenizer
 
 import logging
 logging.basicConfig(level=logging.INFO)
diff --git a/examples/lm_finetuning/finetune_on_pregenerated.py b/examples/lm_finetuning/finetune_on_pregenerated.py
index 8eda2aa5c5..505cd466f6 100644
--- a/examples/lm_finetuning/finetune_on_pregenerated.py
+++ b/examples/lm_finetuning/finetune_on_pregenerated.py
@@ -13,10 +13,10 @@ from torch.utils.data import DataLoader, Dataset, RandomSampler
 from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm
 
-from pytorch_pretrained_bert import WEIGHTS_NAME, CONFIG_NAME
-from pytorch_pretrained_bert.modeling_bert import BertForPreTraining
-from pytorch_pretrained_bert.tokenization_bert import BertTokenizer
-from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
+from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
+from pytorch_transformers.modeling_bert import BertForPreTraining
+from pytorch_transformers.tokenization_bert import BertTokenizer
+from pytorch_transformers.optimization import BertAdam, WarmupLinearSchedule
 
 InputFeatures = namedtuple("InputFeatures", "input_ids input_mask segment_ids lm_label_ids is_next")
 
diff --git a/examples/lm_finetuning/pregenerate_training_data.py b/examples/lm_finetuning/pregenerate_training_data.py
index c2211c88e6..b79257fd4b 100644
--- a/examples/lm_finetuning/pregenerate_training_data.py
+++ b/examples/lm_finetuning/pregenerate_training_data.py
@@ -5,7 +5,7 @@ from tempfile import TemporaryDirectory
 import shelve
 
 from random import random, randrange, randint, shuffle, choice
-from pytorch_pretrained_bert.tokenization_bert import BertTokenizer
+from pytorch_transformers.tokenization_bert import BertTokenizer
 import numpy as np
 import json
 import collections
diff --git a/examples/lm_finetuning/simple_lm_finetuning.py b/examples/lm_finetuning/simple_lm_finetuning.py
index bcfd138442..3008787cd1 100644
--- a/examples/lm_finetuning/simple_lm_finetuning.py
+++ b/examples/lm_finetuning/simple_lm_finetuning.py
@@ -29,10 +29,10 @@ from torch.utils.data import DataLoader, Dataset, RandomSampler
 from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm, trange
 
-from pytorch_pretrained_bert import WEIGHTS_NAME, CONFIG_NAME
-from pytorch_pretrained_bert.modeling_bert import BertForPreTraining
-from pytorch_pretrained_bert.tokenization_bert import BertTokenizer
-from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
+from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
+from pytorch_transformers.modeling_bert import BertForPreTraining
+from pytorch_transformers.tokenization_bert import BertTokenizer
+from pytorch_transformers.optimization import BertAdam, WarmupLinearSchedule
 
 logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                     datefmt='%m/%d/%Y %H:%M:%S',
diff --git a/examples/run_bert_classifier.py b/examples/run_bert_classifier.py
index 233a7ee5d1..506aecc5b1 100644
--- a/examples/run_bert_classifier.py
+++ b/examples/run_bert_classifier.py
@@ -34,10 +34,10 @@ from torch.nn import CrossEntropyLoss, MSELoss
 
 from tensorboardX import SummaryWriter
 
-from pytorch_pretrained_bert import WEIGHTS_NAME, CONFIG_NAME
-from pytorch_pretrained_bert.modeling_bert import BertForSequenceClassification
-from pytorch_pretrained_bert.tokenization_bert import BertTokenizer
-from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
+from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
+from pytorch_transformers.modeling_bert import BertForSequenceClassification
+from pytorch_transformers.tokenization_bert import BertTokenizer
+from pytorch_transformers.optimization import BertAdam, WarmupLinearSchedule
 
 from utils_glue import processors, output_modes, convert_examples_to_features, compute_metrics
 
diff --git a/examples/run_bert_extract_features.py b/examples/run_bert_extract_features.py
index 2a550c431a..cc7dedd6af 100644
--- a/examples/run_bert_extract_features.py
+++ b/examples/run_bert_extract_features.py
@@ -28,8 +28,8 @@ import torch
 from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
 from torch.utils.data.distributed import DistributedSampler
 
-from pytorch_pretrained_bert.tokenization_bert import BertTokenizer
-from pytorch_pretrained_bert.modeling_bert import BertModel
+from pytorch_transformers.tokenization_bert import BertTokenizer
+from pytorch_transformers.modeling_bert import BertModel
 
 logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s', 
                     datefmt = '%m/%d/%Y %H:%M:%S',
diff --git a/examples/run_bert_squad.py b/examples/run_bert_squad.py
index f8eee9c8eb..c3fdb06316 100644
--- a/examples/run_bert_squad.py
+++ b/examples/run_bert_squad.py
@@ -33,10 +33,10 @@ from tqdm import tqdm, trange
 
 from tensorboardX import SummaryWriter
 
-from pytorch_pretrained_bert import WEIGHTS_NAME, CONFIG_NAME
-from pytorch_pretrained_bert.modeling_bert import BertForQuestionAnswering
-from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
-from pytorch_pretrained_bert.tokenization_bert import BertTokenizer
+from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
+from pytorch_transformers.modeling_bert import BertForQuestionAnswering
+from pytorch_transformers.optimization import BertAdam, WarmupLinearSchedule
+from pytorch_transformers.tokenization_bert import BertTokenizer
 
 from utils_squad import read_squad_examples, convert_examples_to_features, RawResult, write_predictions
 
diff --git a/examples/run_bert_swag.py b/examples/run_bert_swag.py
index 3e45225891..00cd3a7840 100644
--- a/examples/run_bert_swag.py
+++ b/examples/run_bert_swag.py
@@ -32,10 +32,10 @@ from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
 from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm, trange
 
-from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME
-from pytorch_pretrained_bert.modeling_bert import BertForMultipleChoice, BertConfig
-from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
-from pytorch_pretrained_bert.tokenization_bert import BertTokenizer
+from pytorch_transformers.file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME
+from pytorch_transformers.modeling_bert import BertForMultipleChoice, BertConfig
+from pytorch_transformers.optimization import BertAdam, WarmupLinearSchedule
+from pytorch_transformers.tokenization_bert import BertTokenizer
 
 logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                     datefmt = '%m/%d/%Y %H:%M:%S',
diff --git a/examples/run_gpt2.py b/examples/run_gpt2.py
index 8f8208bbcd..a759e449f9 100644
--- a/examples/run_gpt2.py
+++ b/examples/run_gpt2.py
@@ -8,7 +8,7 @@ import torch
 import torch.nn.functional as F
 import numpy as np
 
-from pytorch_pretrained_bert import GPT2LMHeadModel, GPT2Tokenizer
+from pytorch_transformers import GPT2LMHeadModel, GPT2Tokenizer
 
 logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                     datefmt = '%m/%d/%Y %H:%M:%S',
diff --git a/examples/run_openai_gpt.py b/examples/run_openai_gpt.py
index ac5c474491..02b86b3a22 100644
--- a/examples/run_openai_gpt.py
+++ b/examples/run_openai_gpt.py
@@ -39,7 +39,7 @@ import torch
 from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                               TensorDataset)
 
-from pytorch_pretrained_bert import (OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer,
+from pytorch_transformers import (OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer,
                                      OpenAIAdam, cached_path, WEIGHTS_NAME, CONFIG_NAME)
 
 ROCSTORIES_URL = "https://s3.amazonaws.com/datasets.huggingface.co/ROCStories.tar.gz"
diff --git a/examples/run_transfo_xl.py b/examples/run_transfo_xl.py
index 0ea7b32053..fda0d8dc28 100644
--- a/examples/run_transfo_xl.py
+++ b/examples/run_transfo_xl.py
@@ -28,7 +28,7 @@ import math
 
 import torch
 
-from pytorch_pretrained_bert import TransfoXLLMHeadModel, TransfoXLCorpus, TransfoXLTokenizer
+from pytorch_transformers import TransfoXLLMHeadModel, TransfoXLCorpus, TransfoXLTokenizer
 
 logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                     datefmt = '%m/%d/%Y %H:%M:%S',
diff --git a/examples/run_xlnet_classifier.py b/examples/run_xlnet_classifier.py
index e30cad773b..7cf8a8d877 100644
--- a/examples/run_xlnet_classifier.py
+++ b/examples/run_xlnet_classifier.py
@@ -34,10 +34,10 @@ from torch.nn import CrossEntropyLoss, MSELoss
 
 from tensorboardX import SummaryWriter
 
-from pytorch_pretrained_bert import WEIGHTS_NAME, CONFIG_NAME
-from pytorch_pretrained_bert.modeling_xlnet import XLNetForSequenceClassification
-from pytorch_pretrained_bert.tokenization_xlnet import XLNetTokenizer
-from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
+from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
+from pytorch_transformers.modeling_xlnet import XLNetForSequenceClassification
+from pytorch_transformers.tokenization_xlnet import XLNetTokenizer
+from pytorch_transformers.optimization import BertAdam, WarmupLinearSchedule
 
 from utils_glue import processors, output_modes, convert_examples_to_features, compute_metrics
 
diff --git a/examples/run_xlnet_squad.py b/examples/run_xlnet_squad.py
index c299358b79..393fa98abd 100644
--- a/examples/run_xlnet_squad.py
+++ b/examples/run_xlnet_squad.py
@@ -33,10 +33,10 @@ from tqdm import tqdm, trange
 
 from tensorboardX import SummaryWriter
 
-from pytorch_pretrained_bert import WEIGHTS_NAME, CONFIG_NAME
-from pytorch_pretrained_bert.modeling_xlnet import BertForQuestionAnswering
-from pytorch_pretrained_bert.tokenization_xlnet import XLNetTokenizer
-from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
+from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
+from pytorch_transformers.modeling_xlnet import BertForQuestionAnswering
+from pytorch_transformers.tokenization_xlnet import XLNetTokenizer
+from pytorch_transformers.optimization import BertAdam, WarmupLinearSchedule
 
 from utils_squad import read_squad_examples, convert_examples_to_features, RawResult, write_predictions
 
diff --git a/examples/tests/examples_tests.py b/examples/tests/examples_tests.py
new file mode 100644
index 0000000000..120df35f82
--- /dev/null
+++ b/examples/tests/examples_tests.py
@@ -0,0 +1,50 @@
+# coding=utf-8
+# Copyright 2018 HuggingFace Inc..
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import unittest
+import json
+import random
+import shutil
+import pytest
+
+import torch
+
+from pytorch_transformers import PretrainedConfig, PreTrainedModel
+from pytorch_transformers.modeling_bert import BertModel, BertConfig, PRETRAINED_MODEL_ARCHIVE_MAP, PRETRAINED_CONFIG_ARCHIVE_MAP
+
+
+class ModelUtilsTest(unittest.TestCase):
+    def test_model_from_pretrained(self):
+        for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+            config = BertConfig.from_pretrained(model_name)
+            self.assertIsNotNone(config)
+            self.assertIsInstance(config, PretrainedConfig)
+
+            model = BertModel.from_pretrained(model_name)
+            self.assertIsNotNone(model)
+            self.assertIsInstance(model, PreTrainedModel)
+
+            config = BertConfig.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
+            model = BertModel.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
+            self.assertEqual(model.config.output_attentions, True)
+            self.assertEqual(model.config.output_hidden_states, True)
+            self.assertEqual(model.config, config)
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/examples/utils_squad.py b/examples/utils_squad.py
index 0dfecd202c..c858776183 100644
--- a/examples/utils_squad.py
+++ b/examples/utils_squad.py
@@ -24,7 +24,7 @@ import math
 import collections
 from io import open
 
-from pytorch_pretrained_bert.tokenization_bert import BasicTokenizer, whitespace_tokenize
+from pytorch_transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize
 
 logger = logging.getLogger(__name__)
 
diff --git a/hubconfs/bert_hubconf.py b/hubconfs/bert_hubconf.py
index 94c7a18a30..0ee0df6697 100644
--- a/hubconfs/bert_hubconf.py
+++ b/hubconfs/bert_hubconf.py
@@ -1,5 +1,5 @@
-from pytorch_pretrained_bert.tokenization_bert import BertTokenizer
-from pytorch_pretrained_bert.modeling_bert import (
+from pytorch_transformers.tokenization_bert import BertTokenizer
+from pytorch_transformers.modeling_bert import (
         BertModel,
         BertForNextSentencePrediction,
         BertForMaskedLM,
@@ -86,7 +86,7 @@ def bertTokenizer(*args, **kwargs):
     Example:
         >>> import torch
         >>> sentence = 'Hello, World!'
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
         >>> toks = tokenizer.tokenize(sentence)
         ['Hello', '##,', 'World', '##!']
         >>> ids = tokenizer.convert_tokens_to_ids(toks)
@@ -106,7 +106,7 @@ def bertModel(*args, **kwargs):
     Example:
         # Load the tokenizer
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
         #  Prepare tokenized input
         >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
         >>> tokenized_text = tokenizer.tokenize(text)
@@ -115,7 +115,7 @@ def bertModel(*args, **kwargs):
         >>> tokens_tensor = torch.tensor([indexed_tokens])
         >>> segments_tensors = torch.tensor([segments_ids])
         # Load bertModel
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertModel', 'bert-base-cased')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'bertModel', 'bert-base-cased')
         >>> model.eval()
         # Predict hidden states features for each layer
         >>> with torch.no_grad():
@@ -135,7 +135,7 @@ def bertForNextSentencePrediction(*args, **kwargs):
     Example:
         # Load the tokenizer
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
         #  Prepare tokenized input
         >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
         >>> tokenized_text = tokenizer.tokenize(text)
@@ -144,7 +144,7 @@ def bertForNextSentencePrediction(*args, **kwargs):
         >>> tokens_tensor = torch.tensor([indexed_tokens])
         >>> segments_tensors = torch.tensor([segments_ids])
         # Load bertForNextSentencePrediction
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForNextSentencePrediction', 'bert-base-cased')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'bertForNextSentencePrediction', 'bert-base-cased')
         >>> model.eval()
         # Predict the next sentence classification logits
         >>> with torch.no_grad():
@@ -165,7 +165,7 @@ def bertForPreTraining(*args, **kwargs):
     Example:
         # Load the tokenizer
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
         #  Prepare tokenized input
         >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
         >>> tokenized_text = tokenizer.tokenize(text)
@@ -173,7 +173,7 @@ def bertForPreTraining(*args, **kwargs):
         >>> tokens_tensor = torch.tensor([indexed_tokens])
         >>> segments_tensors = torch.tensor([segments_ids])
         # Load bertForPreTraining
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForPreTraining', 'bert-base-cased')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'bertForPreTraining', 'bert-base-cased')
         >>> masked_lm_logits_scores, seq_relationship_logits = model(tokens_tensor, segments_tensors)
     """
     model = BertForPreTraining.from_pretrained(*args, **kwargs)
@@ -189,7 +189,7 @@ def bertForMaskedLM(*args, **kwargs):
     Example:
         # Load the tokenizer
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
         #  Prepare tokenized input
         >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
         >>> tokenized_text = tokenizer.tokenize(text)
@@ -200,7 +200,7 @@ def bertForMaskedLM(*args, **kwargs):
         >>> tokens_tensor = torch.tensor([indexed_tokens])
         >>> segments_tensors = torch.tensor([segments_ids])
         # Load bertForMaskedLM
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForMaskedLM', 'bert-base-cased')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'bertForMaskedLM', 'bert-base-cased')
         >>> model.eval()
         # Predict all tokens
         >>> with torch.no_grad():
@@ -231,7 +231,7 @@ def bertForSequenceClassification(*args, **kwargs):
     Example:
         # Load the tokenizer
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
         #  Prepare tokenized input
         >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
         >>> tokenized_text = tokenizer.tokenize(text)
@@ -240,7 +240,7 @@ def bertForSequenceClassification(*args, **kwargs):
         >>> tokens_tensor = torch.tensor([indexed_tokens])
         >>> segments_tensors = torch.tensor([segments_ids])
         # Load bertForSequenceClassification
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForSequenceClassification', 'bert-base-cased', num_labels=2)
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'bertForSequenceClassification', 'bert-base-cased', num_labels=2)
         >>> model.eval()
         # Predict the sequence classification logits
         >>> with torch.no_grad():
@@ -266,7 +266,7 @@ def bertForMultipleChoice(*args, **kwargs):
     Example:
         # Load the tokenizer
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
         #  Prepare tokenized input
         >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
         >>> tokenized_text = tokenizer.tokenize(text)
@@ -275,7 +275,7 @@ def bertForMultipleChoice(*args, **kwargs):
         >>> tokens_tensor = torch.tensor([indexed_tokens, indexed_tokens]).unsqueeze(0)
         >>> segments_tensors = torch.tensor([segments_ids, segments_ids]).unsqueeze(0)
         # Load bertForMultipleChoice
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForMultipleChoice', 'bert-base-cased', num_choices=2)
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'bertForMultipleChoice', 'bert-base-cased', num_choices=2)
         >>> model.eval()
         # Predict the multiple choice logits
         >>> with torch.no_grad():
@@ -299,7 +299,7 @@ def bertForQuestionAnswering(*args, **kwargs):
     Example:
         # Load the tokenizer
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
         #  Prepare tokenized input
         >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
         >>> tokenized_text = tokenizer.tokenize(text)
@@ -308,7 +308,7 @@ def bertForQuestionAnswering(*args, **kwargs):
         >>> tokens_tensor = torch.tensor([indexed_tokens])
         >>> segments_tensors = torch.tensor([segments_ids])
         # Load bertForQuestionAnswering
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForQuestionAnswering', 'bert-base-cased')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'bertForQuestionAnswering', 'bert-base-cased')
         >>> model.eval()
         # Predict the start and end positions logits
         >>> with torch.no_grad():
@@ -338,7 +338,7 @@ def bertForTokenClassification(*args, **kwargs):
     Example:
         # Load the tokenizer
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
         #  Prepare tokenized input
         >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
         >>> tokenized_text = tokenizer.tokenize(text)
@@ -347,7 +347,7 @@ def bertForTokenClassification(*args, **kwargs):
         >>> tokens_tensor = torch.tensor([indexed_tokens])
         >>> segments_tensors = torch.tensor([segments_ids])
         # Load bertForTokenClassification
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForTokenClassification', 'bert-base-cased', num_labels=2)
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'bertForTokenClassification', 'bert-base-cased', num_labels=2)
         >>> model.eval()
         # Predict the token classification logits
         >>> with torch.no_grad():
diff --git a/hubconfs/gpt2_hubconf.py b/hubconfs/gpt2_hubconf.py
index 3ac8bc72ab..dbaa2cd612 100644
--- a/hubconfs/gpt2_hubconf.py
+++ b/hubconfs/gpt2_hubconf.py
@@ -1,5 +1,5 @@
-from pytorch_pretrained_bert.tokenization_gpt2 import GPT2Tokenizer
-from pytorch_pretrained_bert.modeling_gpt2 import (
+from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer
+from pytorch_transformers.modeling_gpt2 import (
     GPT2Model,
     GPT2LMHeadModel,
     GPT2DoubleHeadsModel
@@ -53,7 +53,7 @@ def gpt2Tokenizer(*args, **kwargs):
 
     Example:
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2Tokenizer', 'gpt2')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')
 
         >>> text = "Who was Jim Henson ?"
         >>> indexed_tokens = tokenizer.encode(tokenized_text)
@@ -72,7 +72,7 @@ def gpt2Model(*args, **kwargs):
     Example:
         # Load the tokenizer
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2Tokenizer', 'gpt2')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')
 
         #  Prepare tokenized input
         >>> text_1 = "Who was Jim Henson ?"
@@ -83,7 +83,7 @@ def gpt2Model(*args, **kwargs):
         >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])
 
         # Load gpt2Model
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2Model', 'gpt2')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Model', 'gpt2')
         >>> model.eval()
 
         # Predict hidden states features for each layer
@@ -105,7 +105,7 @@ def gpt2LMHeadModel(*args, **kwargs):
     Example:
         # Load the tokenizer
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2Tokenizer', 'gpt2')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')
 
         #  Prepare tokenized input
         >>> text_1 = "Who was Jim Henson ?"
@@ -116,7 +116,7 @@ def gpt2LMHeadModel(*args, **kwargs):
         >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])
 
         # Load gpt2LMHeadModel
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2LMHeadModel', 'gpt2')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'gpt2LMHeadModel', 'gpt2')
         >>> model.eval()
 
         # Predict hidden states features for each layer
@@ -144,7 +144,7 @@ def gpt2DoubleHeadsModel(*args, **kwargs):
     Example:
         # Load the tokenizer
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2Tokenizer', 'gpt2')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')
 
         #  Prepare tokenized input
         >>> text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
@@ -157,7 +157,7 @@ def gpt2DoubleHeadsModel(*args, **kwargs):
         >>> mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
 
         # Load gpt2DoubleHeadsModel
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2DoubleHeadsModel', 'gpt2')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'gpt2DoubleHeadsModel', 'gpt2')
         >>> model.eval()
 
         # Predict hidden states features for each layer
diff --git a/hubconfs/gpt_hubconf.py b/hubconfs/gpt_hubconf.py
index f3d03888ae..1683c881fa 100644
--- a/hubconfs/gpt_hubconf.py
+++ b/hubconfs/gpt_hubconf.py
@@ -1,5 +1,5 @@
-from pytorch_pretrained_bert.tokenization_openai import OpenAIGPTTokenizer
-from pytorch_pretrained_bert.modeling_openai import (
+from pytorch_transformers.tokenization_openai import OpenAIGPTTokenizer
+from pytorch_transformers.modeling_openai import (
 	OpenAIGPTModel,
 	OpenAIGPTLMHeadModel,
 	OpenAIGPTDoubleHeadsModel
@@ -77,7 +77,7 @@ def openAIGPTTokenizer(*args, **kwargs):
 
     Example:
 		>>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTTokenizer', 'openai-gpt')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt')
 		
 		>>> text = "Who was Jim Henson ? Jim Henson was a puppeteer"
         >>> tokenized_text = tokenizer.tokenize(text)
@@ -98,7 +98,7 @@ def openAIGPTModel(*args, **kwargs):
     Example:
         # Load the tokenizer
 		>>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTTokenizer', 'openai-gpt')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt')
 
         #  Prepare tokenized input
         >>> text = "Who was Jim Henson ? Jim Henson was a puppeteer"
@@ -107,7 +107,7 @@ def openAIGPTModel(*args, **kwargs):
         >>> tokens_tensor = torch.tensor([indexed_tokens])
 
         # Load openAIGPTModel
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTModel', 'openai-gpt')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTModel', 'openai-gpt')
         >>> model.eval()
 
         # Predict hidden states features for each layer
@@ -127,7 +127,7 @@ def openAIGPTLMHeadModel(*args, **kwargs):
 	Example:
         # Load the tokenizer
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTTokenizer', 'openai-gpt')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt')
 
         #  Prepare tokenized input
         >>> text = "Who was Jim Henson ? Jim Henson was a puppeteer"
@@ -136,7 +136,7 @@ def openAIGPTLMHeadModel(*args, **kwargs):
         >>> tokens_tensor = torch.tensor([indexed_tokens])
 
         # Load openAIGPTLMHeadModel
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTLMHeadModel', 'openai-gpt')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTLMHeadModel', 'openai-gpt')
         >>> model.eval()
 
         # Predict hidden states features for each layer
@@ -162,7 +162,7 @@ def openAIGPTDoubleHeadsModel(*args, **kwargs):
 	Example:
         # Load the tokenizer
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTTokenizer', 'openai-gpt')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt')
 
         #  Prepare tokenized input
         >>> text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
@@ -175,7 +175,7 @@ def openAIGPTDoubleHeadsModel(*args, **kwargs):
         >>> mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
 
         # Load openAIGPTDoubleHeadsModel
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTDoubleHeadsModel', 'openai-gpt')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTDoubleHeadsModel', 'openai-gpt')
         >>> model.eval()
 
         # Predict hidden states features for each layer
diff --git a/hubconfs/transformer_xl_hubconf.py b/hubconfs/transformer_xl_hubconf.py
index d5c697547e..d89db894ad 100644
--- a/hubconfs/transformer_xl_hubconf.py
+++ b/hubconfs/transformer_xl_hubconf.py
@@ -1,5 +1,5 @@
-from pytorch_pretrained_bert.tokenization_transfo_xl import TransfoXLTokenizer
-from pytorch_pretrained_bert.modeling_transfo_xl import (
+from pytorch_transformers.tokenization_transfo_xl import TransfoXLTokenizer
+from pytorch_transformers.modeling_transfo_xl import (
     TransfoXLModel,
     TransfoXLLMHeadModel
 )
@@ -46,7 +46,7 @@ def transformerXLTokenizer(*args, **kwargs):
 
     Example:
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'transformerXLTokenizer', 'transfo-xl-wt103')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLTokenizer', 'transfo-xl-wt103')
         
         >>> text = "Who was Jim Henson ?"
         >>> tokenized_text = tokenizer.tokenize(tokenized_text)
@@ -64,7 +64,7 @@ def transformerXLModel(*args, **kwargs):
     Example:
         # Load the tokenizer
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'transformerXLTokenizer', 'transfo-xl-wt103')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLTokenizer', 'transfo-xl-wt103')
 
         #  Prepare tokenized input
         >>> text_1 = "Who was Jim Henson ?"
@@ -77,7 +77,7 @@ def transformerXLModel(*args, **kwargs):
         >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])
 
         # Load transformerXLModel
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'transformerXLModel', 'transfo-xl-wt103')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLModel', 'transfo-xl-wt103')
         >>> model.eval()
 
         # Predict hidden states features for each layer
@@ -99,7 +99,7 @@ def transformerXLLMHeadModel(*args, **kwargs):
     Example:
         # Load the tokenizer
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'transformerXLTokenizer', 'transfo-xl-wt103')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLTokenizer', 'transfo-xl-wt103')
 
         #  Prepare tokenized input
         >>> text_1 = "Who was Jim Henson ?"
@@ -112,7 +112,7 @@ def transformerXLLMHeadModel(*args, **kwargs):
         >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])
 
         # Load transformerXLLMHeadModel
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'transformerXLLMHeadModel', 'transfo-xl-wt103')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLLMHeadModel', 'transfo-xl-wt103')
         >>> model.eval()
 
         # Predict hidden states features for each layer
diff --git a/hubconfs/xlm_hubconf.py b/hubconfs/xlm_hubconf.py
index 154f875bfb..4f6fd93c24 100644
--- a/hubconfs/xlm_hubconf.py
+++ b/hubconfs/xlm_hubconf.py
@@ -1,5 +1,5 @@
-from pytorch_pretrained_bert.tokenization_xlm import XLMTokenizer
-from pytorch_pretrained_bert.modeling_xlm import (
+from pytorch_transformers.tokenization_xlm import XLMTokenizer
+from pytorch_transformers.modeling_xlm import (
     XLMConfig,
     XLMModel,
     XLMWithLMHeadModel,
@@ -18,7 +18,7 @@ xlm_start_docstring = """
     Example:
         # Load the tokenizer
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlmTokenizer', 'xlm-mlm-en-2048')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlmTokenizer', 'xlm-mlm-en-2048')
 
         #  Prepare tokenized input
         >>> text_1 = "Who was Jim Henson ?"
@@ -77,7 +77,7 @@ def xlmTokenizer(*args, **kwargs):
 
     Example:
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlmTokenizer', 'xlm-mlm-en-2048')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlmTokenizer', 'xlm-mlm-en-2048')
 
         >>> text = "Who was Jim Henson ?"
         >>> indexed_tokens = tokenizer.encode(tokenized_text)
@@ -91,7 +91,7 @@ def xlmTokenizer(*args, **kwargs):
 def xlmModel(*args, **kwargs):
     """
         # Load xlmModel
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlmModel', 'xlm-mlm-en-2048')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'xlmModel', 'xlm-mlm-en-2048')
         >>> model.eval()
 
         # Predict hidden states features for each layer
@@ -116,7 +116,7 @@ def xlmLMHeadModel(*args, **kwargs):
         >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])
 
         # Load xlnetLMHeadModel
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetLMHeadModel', 'xlm-mlm-en-2048')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetLMHeadModel', 'xlm-mlm-en-2048')
         >>> model.eval()
 
         # Predict hidden states features for each layer
@@ -143,7 +143,7 @@ def xlmLMHeadModel(*args, **kwargs):
 #     Example:
 #         # Load the tokenizer
 #         >>> import torch
-#         >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetTokenizer', 'xlm-mlm-en-2048')
+#         >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlm-mlm-en-2048')
 
 #         #  Prepare tokenized input
 #         >>> text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
@@ -156,7 +156,7 @@ def xlmLMHeadModel(*args, **kwargs):
 #         >>> mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
 
 #         # Load xlnetForSequenceClassification
-#         >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetForSequenceClassification', 'xlm-mlm-en-2048')
+#         >>> model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetForSequenceClassification', 'xlm-mlm-en-2048')
 #         >>> model.eval()
 
 #         # Predict sequence classes logits
diff --git a/hubconfs/xlnet_hubconf.1.py b/hubconfs/xlnet_hubconf.1.py
index d3766d04e0..4c5105a241 100644
--- a/hubconfs/xlnet_hubconf.1.py
+++ b/hubconfs/xlnet_hubconf.1.py
@@ -1,5 +1,5 @@
-from pytorch_pretrained_bert.tokenization_xlnet import XLNetTokenizer
-from pytorch_pretrained_bert.modeling_xlnet import (
+from pytorch_transformers.tokenization_xlnet import XLNetTokenizer
+from pytorch_transformers.modeling_xlnet import (
     XLNetConfig,
     XLNetModel,
     XLNetLMHeadModel,
@@ -54,7 +54,7 @@ def xlnetTokenizer(*args, **kwargs):
 
     Example:
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetTokenizer', 'xlnet-large-cased')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
 
         >>> text = "Who was Jim Henson ?"
         >>> indexed_tokens = tokenizer.encode(tokenized_text)
@@ -73,7 +73,7 @@ def xlnetModel(*args, **kwargs):
     Example:
         # Load the tokenizer
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetTokenizer', 'xlnet-large-cased')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
 
         #  Prepare tokenized input
         >>> text_1 = "Who was Jim Henson ?"
@@ -84,7 +84,7 @@ def xlnetModel(*args, **kwargs):
         >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])
 
         # Load xlnetModel
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetModel', 'xlnet-large-cased')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetModel', 'xlnet-large-cased')
         >>> model.eval()
 
         # Predict hidden states features for each layer
@@ -107,7 +107,7 @@ def xlnetLMHeadModel(*args, **kwargs):
     Example:
         # Load the tokenizer
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetTokenizer', 'xlnet-large-cased')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
 
         #  Prepare tokenized input
         >>> text_1 = "Who was Jim Henson ?"
@@ -118,7 +118,7 @@ def xlnetLMHeadModel(*args, **kwargs):
         >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])
 
         # Load xlnetLMHeadModel
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetLMHeadModel', 'xlnet-large-cased')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetLMHeadModel', 'xlnet-large-cased')
         >>> model.eval()
 
         # Predict hidden states features for each layer
@@ -145,7 +145,7 @@ def xlnetLMHeadModel(*args, **kwargs):
 #     Example:
 #         # Load the tokenizer
 #         >>> import torch
-#         >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetTokenizer', 'xlnet-large-cased')
+#         >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
 
 #         #  Prepare tokenized input
 #         >>> text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
@@ -158,7 +158,7 @@ def xlnetLMHeadModel(*args, **kwargs):
 #         >>> mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
 
 #         # Load xlnetForSequenceClassification
-#         >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetForSequenceClassification', 'xlnet-large-cased')
+#         >>> model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetForSequenceClassification', 'xlnet-large-cased')
 #         >>> model.eval()
 
 #         # Predict sequence classes logits
diff --git a/notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb b/notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb
index ea7271df96..809f6ea6e0 100644
--- a/notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb
+++ b/notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb
@@ -78,7 +78,7 @@
     "import importlib.util\n",
     "import sys\n",
     "import tensorflow as tf\n",
-    "import pytorch_pretrained_bert as ppb\n",
+    "import pytorch_transformers as ppb\n",
     "\n",
     "def del_all_flags(FLAGS):\n",
     "    flags_dict = FLAGS._flags()    \n",
@@ -3997,9 +3997,9 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "11/16/2018 11:03:05 - INFO - pytorch_pretrained_bert.modeling_bert -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at /Users/thomaswolf/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba\n",
-      "11/16/2018 11:03:05 - INFO - pytorch_pretrained_bert.modeling_bert -   extracting archive file /Users/thomaswolf/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmpaqgsm566\n",
-      "11/16/2018 11:03:08 - INFO - pytorch_pretrained_bert.modeling_bert -   Model config {\n",
+      "11/16/2018 11:03:05 - INFO - pytorch_transformers.modeling_bert -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at /Users/thomaswolf/.pytorch_transformers/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba\n",
+      "11/16/2018 11:03:05 - INFO - pytorch_transformers.modeling_bert -   extracting archive file /Users/thomaswolf/.pytorch_transformers/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmpaqgsm566\n",
+      "11/16/2018 11:03:08 - INFO - pytorch_transformers.modeling_bert -   Model config {\n",
       "  \"attention_probs_dropout_prob\": 0.1,\n",
       "  \"hidden_act\": \"gelu\",\n",
       "  \"hidden_dropout_prob\": 0.1,\n",
diff --git a/notebooks/Comparing-TF-and-PT-models.ipynb b/notebooks/Comparing-TF-and-PT-models.ipynb
index 3e438e2f55..b7382e4652 100644
--- a/notebooks/Comparing-TF-and-PT-models.ipynb
+++ b/notebooks/Comparing-TF-and-PT-models.ipynb
@@ -342,7 +342,7 @@
    "outputs": [],
    "source": [
     "import extract_features\n",
-    "import pytorch_pretrained_bert as ppb\n",
+    "import pytorch_transformers as ppb\n",
     "from extract_features import *"
    ]
   },
@@ -375,8 +375,8 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "11/15/2018 16:21:18 - INFO - pytorch_pretrained_bert.modeling_bert -   loading archive file ../../google_models/uncased_L-12_H-768_A-12/\n",
-      "11/15/2018 16:21:18 - INFO - pytorch_pretrained_bert.modeling_bert -   Model config {\n",
+      "11/15/2018 16:21:18 - INFO - pytorch_transformers.modeling_bert -   loading archive file ../../google_models/uncased_L-12_H-768_A-12/\n",
+      "11/15/2018 16:21:18 - INFO - pytorch_transformers.modeling_bert -   Model config {\n",
       "  \"attention_probs_dropout_prob\": 0.1,\n",
       "  \"hidden_act\": \"gelu\",\n",
       "  \"hidden_dropout_prob\": 0.1,\n",
diff --git a/pytorch_pretrained_bert/__init__.py b/pytorch_transformers/__init__.py
similarity index 98%
rename from pytorch_pretrained_bert/__init__.py
rename to pytorch_transformers/__init__.py
index 23346967ba..cbd007f872 100644
--- a/pytorch_pretrained_bert/__init__.py
+++ b/pytorch_transformers/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.6.2"
+__version__ = "0.7.0"
 from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer
 from .tokenization_openai import OpenAIGPTTokenizer
 from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus)
diff --git a/pytorch_pretrained_bert/__main__.py b/pytorch_transformers/__main__.py
similarity index 72%
rename from pytorch_pretrained_bert/__main__.py
rename to pytorch_transformers/__main__.py
index bb9534a830..95504c1493 100644
--- a/pytorch_pretrained_bert/__main__.py
+++ b/pytorch_transformers/__main__.py
@@ -4,24 +4,24 @@ def main():
     if (len(sys.argv) < 4 or len(sys.argv) > 6) or sys.argv[1] not in ["bert", "gpt", "transfo_xl", "gpt2", "xlnet"]:
         print(
         "Should be used as one of: \n"
-        ">> `pytorch_pretrained_bert bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`, \n"
-        ">> `pytorch_pretrained_bert gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`, \n"
-        ">> `pytorch_pretrained_bert transfo_xl TF_CHECKPOINT_OR_DATASET PYTORCH_DUMP_OUTPUT [TF_CONFIG]` or \n"
-        ">> `pytorch_pretrained_bert gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [GPT2_CONFIG]` or \n"
-        ">> `pytorch_pretrained_bert xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME]`")
+        ">> `pytorch_transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`, \n"
+        ">> `pytorch_transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`, \n"
+        ">> `pytorch_transformers transfo_xl TF_CHECKPOINT_OR_DATASET PYTORCH_DUMP_OUTPUT [TF_CONFIG]` or \n"
+        ">> `pytorch_transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [GPT2_CONFIG]` or \n"
+        ">> `pytorch_transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME]`")
     else:
         if sys.argv[1] == "bert":
             try:
                 from .convert_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch
             except ImportError:
-                print("pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, "
+                print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
                     "In that case, it requires TensorFlow to be installed. Please see "
                     "https://www.tensorflow.org/install/ for installation instructions.")
                 raise
 
             if len(sys.argv) != 5:
                 # pylint: disable=line-too-long
-                print("Should be used as `pytorch_pretrained_bert bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`")
+                print("Should be used as `pytorch_transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`")
             else:
                 PYTORCH_DUMP_OUTPUT = sys.argv.pop()
                 TF_CONFIG = sys.argv.pop()
@@ -31,7 +31,7 @@ def main():
             from .convert_openai_checkpoint_to_pytorch import convert_openai_checkpoint_to_pytorch
             if len(sys.argv) < 4 or len(sys.argv) > 5:
                 # pylint: disable=line-too-long
-                print("Should be used as `pytorch_pretrained_bert gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`")
+                print("Should be used as `pytorch_transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`")
             else:
                 OPENAI_GPT_CHECKPOINT_FOLDER_PATH = sys.argv[2]
                 PYTORCH_DUMP_OUTPUT = sys.argv[3]
@@ -46,13 +46,13 @@ def main():
             try:
                 from .convert_transfo_xl_checkpoint_to_pytorch import convert_transfo_xl_checkpoint_to_pytorch
             except ImportError:
-                print("pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, "
+                print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
                     "In that case, it requires TensorFlow to be installed. Please see "
                     "https://www.tensorflow.org/install/ for installation instructions.")
                 raise
             if len(sys.argv) < 4 or len(sys.argv) > 5:
                 # pylint: disable=line-too-long
-                print("Should be used as `pytorch_pretrained_bert transfo_xl TF_CHECKPOINT/TF_DATASET_FILE PYTORCH_DUMP_OUTPUT [TF_CONFIG]`")
+                print("Should be used as `pytorch_transformers transfo_xl TF_CHECKPOINT/TF_DATASET_FILE PYTORCH_DUMP_OUTPUT [TF_CONFIG]`")
             else:
                 if 'ckpt' in sys.argv[2].lower():
                     TF_CHECKPOINT = sys.argv[2]
@@ -70,14 +70,14 @@ def main():
             try:
                 from .convert_gpt2_checkpoint_to_pytorch import convert_gpt2_checkpoint_to_pytorch
             except ImportError:
-                print("pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, "
+                print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
                     "In that case, it requires TensorFlow to be installed. Please see "
                     "https://www.tensorflow.org/install/ for installation instructions.")
                 raise
 
             if len(sys.argv) < 4 or len(sys.argv) > 5:
                 # pylint: disable=line-too-long
-                print("Should be used as `pytorch_pretrained_bert gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [TF_CONFIG]`")
+                print("Should be used as `pytorch_transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [TF_CONFIG]`")
             else:
                 TF_CHECKPOINT = sys.argv[2]
                 PYTORCH_DUMP_OUTPUT = sys.argv[3]
@@ -90,14 +90,14 @@ def main():
             try:
                 from .convert_xlnet_checkpoint_to_pytorch import convert_xlnet_checkpoint_to_pytorch
             except ImportError:
-                print("pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, "
+                print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
                     "In that case, it requires TensorFlow to be installed. Please see "
                     "https://www.tensorflow.org/install/ for installation instructions.")
                 raise
 
             if len(sys.argv) < 5 or len(sys.argv) > 6:
                 # pylint: disable=line-too-long
-                print("Should be used as `pytorch_pretrained_bert xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME]`")
+                print("Should be used as `pytorch_transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME]`")
             else:
                 TF_CHECKPOINT = sys.argv[2]
                 TF_CONFIG = sys.argv[3]
diff --git a/pytorch_pretrained_bert/convert_gpt2_checkpoint_to_pytorch.py b/pytorch_transformers/convert_gpt2_checkpoint_to_pytorch.py
similarity index 97%
rename from pytorch_pretrained_bert/convert_gpt2_checkpoint_to_pytorch.py
rename to pytorch_transformers/convert_gpt2_checkpoint_to_pytorch.py
index 51d52a6694..86c8264cb5 100755
--- a/pytorch_pretrained_bert/convert_gpt2_checkpoint_to_pytorch.py
+++ b/pytorch_transformers/convert_gpt2_checkpoint_to_pytorch.py
@@ -21,7 +21,7 @@ from io import open
 
 import torch
 
-from pytorch_pretrained_bert.modeling_gpt2 import (CONFIG_NAME, WEIGHTS_NAME,
+from pytorch_transformers.modeling_gpt2 import (CONFIG_NAME, WEIGHTS_NAME,
                                                      GPT2Config,
                                                      GPT2Model,
                                                      load_tf_weights_in_gpt2)
diff --git a/pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py b/pytorch_transformers/convert_openai_checkpoint_to_pytorch.py
similarity index 97%
rename from pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py
rename to pytorch_transformers/convert_openai_checkpoint_to_pytorch.py
index 566008aaa0..68e9dea624 100755
--- a/pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py
+++ b/pytorch_transformers/convert_openai_checkpoint_to_pytorch.py
@@ -21,7 +21,7 @@ from io import open
 
 import torch
 
-from pytorch_pretrained_bert.modeling_openai import (CONFIG_NAME, WEIGHTS_NAME,
+from pytorch_transformers.modeling_openai import (CONFIG_NAME, WEIGHTS_NAME,
                                                      OpenAIGPTConfig,
                                                      OpenAIGPTModel,
                                                      load_tf_weights_in_openai_gpt)
diff --git a/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py b/pytorch_transformers/convert_tf_checkpoint_to_pytorch.py
similarity index 95%
rename from pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py
rename to pytorch_transformers/convert_tf_checkpoint_to_pytorch.py
index 42f7380969..7530d7e12d 100755
--- a/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py
+++ b/pytorch_transformers/convert_tf_checkpoint_to_pytorch.py
@@ -25,7 +25,7 @@ import tensorflow as tf
 import torch
 import numpy as np
 
-from pytorch_pretrained_bert.modeling_bert import BertConfig, BertForPreTraining, load_tf_weights_in_bert
+from pytorch_transformers.modeling_bert import BertConfig, BertForPreTraining, load_tf_weights_in_bert
 
 def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):
     # Initialise PyTorch model
diff --git a/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py b/pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py
similarity index 96%
rename from pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py
rename to pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py
index 8d6b9651c7..2d666a1f03 100755
--- a/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py
+++ b/pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py
@@ -23,13 +23,13 @@ from io import open
 
 import torch
 
-import pytorch_pretrained_bert.tokenization_transfo_xl as data_utils
-from pytorch_pretrained_bert.modeling_transfo_xl import (CONFIG_NAME,
+import pytorch_transformers.tokenization_transfo_xl as data_utils
+from pytorch_transformers.modeling_transfo_xl import (CONFIG_NAME,
                                                          WEIGHTS_NAME,
                                                          TransfoXLConfig,
                                                          TransfoXLLMHeadModel,
                                                          load_tf_weights_in_transfo_xl)
-from pytorch_pretrained_bert.tokenization_transfo_xl import (CORPUS_NAME,
+from pytorch_transformers.tokenization_transfo_xl import (CORPUS_NAME,
                                                              VOCAB_NAME)
 
 if sys.version_info[0] == 2:
diff --git a/pytorch_pretrained_bert/convert_xlm_checkpoint_to_pytorch.py b/pytorch_transformers/convert_xlm_checkpoint_to_pytorch.py
similarity index 93%
rename from pytorch_pretrained_bert/convert_xlm_checkpoint_to_pytorch.py
rename to pytorch_transformers/convert_xlm_checkpoint_to_pytorch.py
index 44a40174b4..0cbe962cea 100755
--- a/pytorch_pretrained_bert/convert_xlm_checkpoint_to_pytorch.py
+++ b/pytorch_transformers/convert_xlm_checkpoint_to_pytorch.py
@@ -23,8 +23,8 @@ from io import open
 import torch
 import numpy
 
-from pytorch_pretrained_bert.modeling_xlm import (CONFIG_NAME, WEIGHTS_NAME, XLMConfig, XLMModel)
-from pytorch_pretrained_bert.tokenization_xlm import MERGES_NAME, VOCAB_NAME
+from pytorch_transformers.modeling_xlm import (CONFIG_NAME, WEIGHTS_NAME, XLMConfig, XLMModel)
+from pytorch_transformers.tokenization_xlm import MERGES_NAME, VOCAB_NAME
 
 
 def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_path):
diff --git a/pytorch_pretrained_bert/convert_xlnet_checkpoint_to_pytorch.py b/pytorch_transformers/convert_xlnet_checkpoint_to_pytorch.py
similarity index 98%
rename from pytorch_pretrained_bert/convert_xlnet_checkpoint_to_pytorch.py
rename to pytorch_transformers/convert_xlnet_checkpoint_to_pytorch.py
index ce4fcc7810..f41db87124 100755
--- a/pytorch_pretrained_bert/convert_xlnet_checkpoint_to_pytorch.py
+++ b/pytorch_transformers/convert_xlnet_checkpoint_to_pytorch.py
@@ -22,7 +22,7 @@ import os
 import argparse
 import torch
 
-from pytorch_pretrained_bert.modeling_xlnet import (CONFIG_NAME, WEIGHTS_NAME,
+from pytorch_transformers.modeling_xlnet import (CONFIG_NAME, WEIGHTS_NAME,
                                                     XLNetConfig,
                                                     XLNetLMHeadModel, XLNetForQuestionAnswering,
                                                     XLNetForSequenceClassification,
diff --git a/pytorch_pretrained_bert/file_utils.py b/pytorch_transformers/file_utils.py
similarity index 99%
rename from pytorch_pretrained_bert/file_utils.py
rename to pytorch_transformers/file_utils.py
index 994f47d57c..1397bd416b 100644
--- a/pytorch_pretrained_bert/file_utils.py
+++ b/pytorch_transformers/file_utils.py
@@ -29,7 +29,7 @@ except ImportError:
     torch_cache_home = os.path.expanduser(
         os.getenv('TORCH_HOME', os.path.join(
             os.getenv('XDG_CACHE_HOME', '~/.cache'), 'torch')))
-default_cache_path = os.path.join(torch_cache_home, 'pytorch_pretrained_bert')
+default_cache_path = os.path.join(torch_cache_home, 'pytorch_transformers')
 
 try:
     from urllib.parse import urlparse
diff --git a/pytorch_pretrained_bert/model_utils.py b/pytorch_transformers/model_utils.py
similarity index 100%
rename from pytorch_pretrained_bert/model_utils.py
rename to pytorch_transformers/model_utils.py
diff --git a/pytorch_pretrained_bert/modeling_bert.py b/pytorch_transformers/modeling_bert.py
similarity index 100%
rename from pytorch_pretrained_bert/modeling_bert.py
rename to pytorch_transformers/modeling_bert.py
diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py
similarity index 100%
rename from pytorch_pretrained_bert/modeling_gpt2.py
rename to pytorch_transformers/modeling_gpt2.py
diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_transformers/modeling_openai.py
similarity index 100%
rename from pytorch_pretrained_bert/modeling_openai.py
rename to pytorch_transformers/modeling_openai.py
diff --git a/pytorch_pretrained_bert/modeling_transfo_xl.py b/pytorch_transformers/modeling_transfo_xl.py
similarity index 100%
rename from pytorch_pretrained_bert/modeling_transfo_xl.py
rename to pytorch_transformers/modeling_transfo_xl.py
diff --git a/pytorch_pretrained_bert/modeling_transfo_xl_utilities.py b/pytorch_transformers/modeling_transfo_xl_utilities.py
similarity index 100%
rename from pytorch_pretrained_bert/modeling_transfo_xl_utilities.py
rename to pytorch_transformers/modeling_transfo_xl_utilities.py
diff --git a/pytorch_pretrained_bert/modeling_xlm.py b/pytorch_transformers/modeling_xlm.py
similarity index 99%
rename from pytorch_pretrained_bert/modeling_xlm.py
rename to pytorch_transformers/modeling_xlm.py
index 9d1775161d..6decba3cce 100644
--- a/pytorch_pretrained_bert/modeling_xlm.py
+++ b/pytorch_transformers/modeling_xlm.py
@@ -204,7 +204,7 @@ def gelu(x):
     GELU activation
     https://arxiv.org/abs/1606.08415
     https://github.com/huggingface/pytorch-openai-transformer-lm/blob/master/model_pytorch.py#L14
-    https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/modeling.py
+    https://github.com/huggingface/pytorch-transformers/blob/master/modeling.py
     """
     # return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
     return 0.5 * x * (1.0 + torch.erf(x / math.sqrt(2.0)))
diff --git a/pytorch_pretrained_bert/modeling_xlnet.py b/pytorch_transformers/modeling_xlnet.py
similarity index 100%
rename from pytorch_pretrained_bert/modeling_xlnet.py
rename to pytorch_transformers/modeling_xlnet.py
diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_transformers/optimization.py
similarity index 100%
rename from pytorch_pretrained_bert/optimization.py
rename to pytorch_transformers/optimization.py
diff --git a/pytorch_pretrained_bert/optimization_openai.py b/pytorch_transformers/optimization_openai.py
similarity index 100%
rename from pytorch_pretrained_bert/optimization_openai.py
rename to pytorch_transformers/optimization_openai.py
diff --git a/pytorch_pretrained_bert/tests/__init__.py b/pytorch_transformers/tests/__init__.py
similarity index 100%
rename from pytorch_pretrained_bert/tests/__init__.py
rename to pytorch_transformers/tests/__init__.py
diff --git a/pytorch_pretrained_bert/tests/conftest.py b/pytorch_transformers/tests/conftest.py
similarity index 100%
rename from pytorch_pretrained_bert/tests/conftest.py
rename to pytorch_transformers/tests/conftest.py
diff --git a/pytorch_pretrained_bert/tests/fixtures/input.txt b/pytorch_transformers/tests/fixtures/input.txt
similarity index 100%
rename from pytorch_pretrained_bert/tests/fixtures/input.txt
rename to pytorch_transformers/tests/fixtures/input.txt
diff --git a/pytorch_pretrained_bert/tests/fixtures/sample_text.txt b/pytorch_transformers/tests/fixtures/sample_text.txt
similarity index 100%
rename from pytorch_pretrained_bert/tests/fixtures/sample_text.txt
rename to pytorch_transformers/tests/fixtures/sample_text.txt
diff --git a/pytorch_pretrained_bert/tests/fixtures/test_sentencepiece.model b/pytorch_transformers/tests/fixtures/test_sentencepiece.model
similarity index 100%
rename from pytorch_pretrained_bert/tests/fixtures/test_sentencepiece.model
rename to pytorch_transformers/tests/fixtures/test_sentencepiece.model
diff --git a/pytorch_pretrained_bert/tests/model_tests_commons.py b/pytorch_transformers/tests/model_tests_commons.py
similarity index 99%
rename from pytorch_pretrained_bert/tests/model_tests_commons.py
rename to pytorch_transformers/tests/model_tests_commons.py
index e7c97a0787..b831f85552 100644
--- a/pytorch_pretrained_bert/tests/model_tests_commons.py
+++ b/pytorch_transformers/tests/model_tests_commons.py
@@ -412,7 +412,7 @@ class GPTModelTester(object):
             [[], []])
 
     def create_and_check_model_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        cache_dir = "/tmp/pytorch_transformers_test/"
         for model_name in list(self.base_model_class.PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
             model = self.base_model_class.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
diff --git a/pytorch_pretrained_bert/tests/model_utils_test.py b/pytorch_transformers/tests/model_utils_test.py
similarity index 89%
rename from pytorch_pretrained_bert/tests/model_utils_test.py
rename to pytorch_transformers/tests/model_utils_test.py
index 59f076fa00..120df35f82 100644
--- a/pytorch_pretrained_bert/tests/model_utils_test.py
+++ b/pytorch_transformers/tests/model_utils_test.py
@@ -25,8 +25,8 @@ import pytest
 
 import torch
 
-from pytorch_pretrained_bert import PretrainedConfig, PreTrainedModel
-from pytorch_pretrained_bert.modeling_bert import BertModel, BertConfig, PRETRAINED_MODEL_ARCHIVE_MAP, PRETRAINED_CONFIG_ARCHIVE_MAP
+from pytorch_transformers import PretrainedConfig, PreTrainedModel
+from pytorch_transformers.modeling_bert import BertModel, BertConfig, PRETRAINED_MODEL_ARCHIVE_MAP, PRETRAINED_CONFIG_ARCHIVE_MAP
 
 
 class ModelUtilsTest(unittest.TestCase):
diff --git a/pytorch_pretrained_bert/tests/modeling_bert_test.py b/pytorch_transformers/tests/modeling_bert_test.py
similarity index 98%
rename from pytorch_pretrained_bert/tests/modeling_bert_test.py
rename to pytorch_transformers/tests/modeling_bert_test.py
index 7a9d49fde7..b140f5e647 100644
--- a/pytorch_pretrained_bert/tests/modeling_bert_test.py
+++ b/pytorch_transformers/tests/modeling_bert_test.py
@@ -20,11 +20,11 @@ import unittest
 import shutil
 import pytest
 
-from pytorch_pretrained_bert import (BertConfig, BertModel, BertForMaskedLM,
+from pytorch_transformers import (BertConfig, BertModel, BertForMaskedLM,
                                      BertForNextSentencePrediction, BertForPreTraining,
                                      BertForQuestionAnswering, BertForSequenceClassification,
                                      BertForTokenClassification, BertForMultipleChoice)
-from pytorch_pretrained_bert.modeling_bert import PRETRAINED_MODEL_ARCHIVE_MAP
+from pytorch_transformers.modeling_bert import PRETRAINED_MODEL_ARCHIVE_MAP
 
 from .model_tests_commons import (create_and_check_commons, ConfigTester, ids_tensor)
 
@@ -266,7 +266,7 @@ class BertModelTest(unittest.TestCase):
 
     @pytest.mark.slow
     def test_model_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        cache_dir = "/tmp/pytorch_transformers_test/"
         for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
             model = BertModel.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
diff --git a/pytorch_pretrained_bert/tests/modeling_gpt2_test.py b/pytorch_transformers/tests/modeling_gpt2_test.py
similarity index 96%
rename from pytorch_pretrained_bert/tests/modeling_gpt2_test.py
rename to pytorch_transformers/tests/modeling_gpt2_test.py
index 122cdf3c7b..4ace52571a 100644
--- a/pytorch_pretrained_bert/tests/modeling_gpt2_test.py
+++ b/pytorch_transformers/tests/modeling_gpt2_test.py
@@ -25,7 +25,7 @@ import pytest
 
 import torch
 
-from pytorch_pretrained_bert import (GPT2Config, GPT2Model,
+from pytorch_transformers import (GPT2Config, GPT2Model,
                                      GPT2LMHeadModel, GPT2DoubleHeadsModel)
 
 from .model_tests_commons import (create_and_check_commons, ConfigTester, GPTModelTester)
diff --git a/pytorch_pretrained_bert/tests/modeling_openai_test.py b/pytorch_transformers/tests/modeling_openai_test.py
similarity index 96%
rename from pytorch_pretrained_bert/tests/modeling_openai_test.py
rename to pytorch_transformers/tests/modeling_openai_test.py
index 627bc564de..fe81157023 100644
--- a/pytorch_pretrained_bert/tests/modeling_openai_test.py
+++ b/pytorch_transformers/tests/modeling_openai_test.py
@@ -21,7 +21,7 @@ import pytest
 
 import torch
 
-from pytorch_pretrained_bert import (OpenAIGPTConfig, OpenAIGPTModel,
+from pytorch_transformers import (OpenAIGPTConfig, OpenAIGPTModel,
                                      OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel)
 
 from .model_tests_commons import (create_and_check_commons, ConfigTester, GPTModelTester)
diff --git a/pytorch_pretrained_bert/tests/modeling_transfo_xl_test.py b/pytorch_transformers/tests/modeling_transfo_xl_test.py
similarity index 97%
rename from pytorch_pretrained_bert/tests/modeling_transfo_xl_test.py
rename to pytorch_transformers/tests/modeling_transfo_xl_test.py
index caeb25b412..d15a19eb64 100644
--- a/pytorch_pretrained_bert/tests/modeling_transfo_xl_test.py
+++ b/pytorch_transformers/tests/modeling_transfo_xl_test.py
@@ -25,8 +25,8 @@ import pytest
 
 import torch
 
-from pytorch_pretrained_bert import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel)
-from pytorch_pretrained_bert.modeling_transfo_xl import PRETRAINED_MODEL_ARCHIVE_MAP
+from pytorch_transformers import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel)
+from pytorch_transformers.modeling_transfo_xl import PRETRAINED_MODEL_ARCHIVE_MAP
 
 from .model_tests_commons import ConfigTester, create_and_check_commons, ids_tensor
 
@@ -184,7 +184,7 @@ class TransfoXLModelTest(unittest.TestCase):
 
     @pytest.mark.slow
     def test_model_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        cache_dir = "/tmp/pytorch_transformers_test/"
         for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
             model = TransfoXLModel.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
diff --git a/pytorch_pretrained_bert/tests/modeling_xlm_test.py b/pytorch_transformers/tests/modeling_xlm_test.py
similarity index 97%
rename from pytorch_pretrained_bert/tests/modeling_xlm_test.py
rename to pytorch_transformers/tests/modeling_xlm_test.py
index 3e442a09fb..8a8905cc31 100644
--- a/pytorch_pretrained_bert/tests/modeling_xlm_test.py
+++ b/pytorch_transformers/tests/modeling_xlm_test.py
@@ -20,8 +20,8 @@ import unittest
 import shutil
 import pytest
 
-from pytorch_pretrained_bert import (XLMConfig, XLMModel, XLMWithLMHeadModel, XLMForQuestionAnswering, XLMForSequenceClassification)
-from pytorch_pretrained_bert.modeling_xlm import PRETRAINED_MODEL_ARCHIVE_MAP
+from pytorch_transformers import (XLMConfig, XLMModel, XLMWithLMHeadModel, XLMForQuestionAnswering, XLMForSequenceClassification)
+from pytorch_transformers.modeling_xlm import PRETRAINED_MODEL_ARCHIVE_MAP
 
 from .model_tests_commons import (create_and_check_commons, ConfigTester, ids_tensor)
 
@@ -250,7 +250,7 @@ class XLMModelTest(unittest.TestCase):
 
     @pytest.mark.slow
     def test_model_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        cache_dir = "/tmp/pytorch_transformers_test/"
         for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
             model = XLMModel.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
diff --git a/pytorch_pretrained_bert/tests/modeling_xlnet_test.py b/pytorch_transformers/tests/modeling_xlnet_test.py
similarity index 97%
rename from pytorch_pretrained_bert/tests/modeling_xlnet_test.py
rename to pytorch_transformers/tests/modeling_xlnet_test.py
index 58617cf7b9..b9d55a26c7 100644
--- a/pytorch_pretrained_bert/tests/modeling_xlnet_test.py
+++ b/pytorch_transformers/tests/modeling_xlnet_test.py
@@ -25,8 +25,8 @@ import pytest
 
 import torch
 
-from pytorch_pretrained_bert import (XLNetConfig, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering)
-from pytorch_pretrained_bert.modeling_xlnet import PRETRAINED_MODEL_ARCHIVE_MAP
+from pytorch_transformers import (XLNetConfig, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering)
+from pytorch_transformers.modeling_xlnet import PRETRAINED_MODEL_ARCHIVE_MAP
 
 from .model_tests_commons import ConfigTester, create_and_check_commons, ids_tensor
 
@@ -278,7 +278,7 @@ class XLNetModelTest(unittest.TestCase):
 
     @pytest.mark.slow
     def test_model_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        cache_dir = "/tmp/pytorch_transformers_test/"
         for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
             model = XLNetModel.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
diff --git a/pytorch_pretrained_bert/tests/optimization_test.py b/pytorch_transformers/tests/optimization_test.py
similarity index 94%
rename from pytorch_pretrained_bert/tests/optimization_test.py
rename to pytorch_transformers/tests/optimization_test.py
index c6924bd4bc..dfbbd44b6e 100644
--- a/pytorch_pretrained_bert/tests/optimization_test.py
+++ b/pytorch_transformers/tests/optimization_test.py
@@ -20,9 +20,9 @@ import unittest
 
 import torch
 
-from pytorch_pretrained_bert import BertAdam
-from pytorch_pretrained_bert import OpenAIAdam
-from pytorch_pretrained_bert.optimization import ConstantLR, WarmupLinearSchedule, WarmupConstantSchedule, \
+from pytorch_transformers import BertAdam
+from pytorch_transformers import OpenAIAdam
+from pytorch_transformers.optimization import ConstantLR, WarmupLinearSchedule, WarmupConstantSchedule, \
     WarmupCosineWithWarmupRestartsSchedule, WarmupCosineWithHardRestartsSchedule, WarmupCosineSchedule
 import numpy as np
 
diff --git a/pytorch_pretrained_bert/tests/tokenization_bert_test.py b/pytorch_transformers/tests/tokenization_bert_test.py
similarity index 97%
rename from pytorch_pretrained_bert/tests/tokenization_bert_test.py
rename to pytorch_transformers/tests/tokenization_bert_test.py
index 3d0b4323b2..59a87a4cb9 100644
--- a/pytorch_pretrained_bert/tests/tokenization_bert_test.py
+++ b/pytorch_transformers/tests/tokenization_bert_test.py
@@ -20,7 +20,7 @@ from io import open
 import shutil
 import pytest
 
-from pytorch_pretrained_bert.tokenization_bert import (BasicTokenizer,
+from pytorch_transformers.tokenization_bert import (BasicTokenizer,
                                                   BertTokenizer,
                                                   WordpieceTokenizer,
                                                   _is_control, _is_punctuation,
@@ -51,7 +51,7 @@ class TokenizationTest(unittest.TestCase):
 
     @pytest.mark.slow
     def test_tokenizer_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        cache_dir = "/tmp/pytorch_transformers_test/"
         for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:
             tokenizer = BertTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
diff --git a/pytorch_pretrained_bert/tests/tokenization_gpt2_test.py b/pytorch_transformers/tests/tokenization_gpt2_test.py
similarity index 94%
rename from pytorch_pretrained_bert/tests/tokenization_gpt2_test.py
rename to pytorch_transformers/tests/tokenization_gpt2_test.py
index 70f69a1f23..c6d926bdd4 100644
--- a/pytorch_pretrained_bert/tests/tokenization_gpt2_test.py
+++ b/pytorch_transformers/tests/tokenization_gpt2_test.py
@@ -20,7 +20,7 @@ import json
 import shutil
 import pytest
 
-from pytorch_pretrained_bert.tokenization_gpt2 import GPT2Tokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP
+from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP
 
 from .tokenization_tests_commons import create_and_check_tokenizer_commons
 
@@ -58,7 +58,7 @@ class GPT2TokenizationTest(unittest.TestCase):
 
     # @pytest.mark.slow
     def test_tokenizer_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        cache_dir = "/tmp/pytorch_transformers_test/"
         for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:
             tokenizer = GPT2Tokenizer.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
diff --git a/pytorch_pretrained_bert/tests/tokenization_openai_test.py b/pytorch_transformers/tests/tokenization_openai_test.py
similarity index 94%
rename from pytorch_pretrained_bert/tests/tokenization_openai_test.py
rename to pytorch_transformers/tests/tokenization_openai_test.py
index 6ae72858a7..38315f927b 100644
--- a/pytorch_pretrained_bert/tests/tokenization_openai_test.py
+++ b/pytorch_transformers/tests/tokenization_openai_test.py
@@ -20,7 +20,7 @@ import json
 import shutil
 import pytest
 
-from pytorch_pretrained_bert.tokenization_openai import OpenAIGPTTokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP
+from pytorch_transformers.tokenization_openai import OpenAIGPTTokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP
 
 from.tokenization_tests_commons import create_and_check_tokenizer_commons
 
@@ -60,7 +60,7 @@ class OpenAIGPTTokenizationTest(unittest.TestCase):
 
     @pytest.mark.slow
     def test_tokenizer_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        cache_dir = "/tmp/pytorch_transformers_test/"
         for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:
             tokenizer = OpenAIGPTTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
diff --git a/pytorch_pretrained_bert/tests/tokenization_tests_commons.py b/pytorch_transformers/tests/tokenization_tests_commons.py
similarity index 100%
rename from pytorch_pretrained_bert/tests/tokenization_tests_commons.py
rename to pytorch_transformers/tests/tokenization_tests_commons.py
diff --git a/pytorch_pretrained_bert/tests/tokenization_transfo_xl_test.py b/pytorch_transformers/tests/tokenization_transfo_xl_test.py
similarity index 93%
rename from pytorch_pretrained_bert/tests/tokenization_transfo_xl_test.py
rename to pytorch_transformers/tests/tokenization_transfo_xl_test.py
index a5ff30ab6e..f744e319c8 100644
--- a/pytorch_pretrained_bert/tests/tokenization_transfo_xl_test.py
+++ b/pytorch_transformers/tests/tokenization_transfo_xl_test.py
@@ -20,7 +20,7 @@ from io import open
 import shutil
 import pytest
 
-from pytorch_pretrained_bert.tokenization_transfo_xl import TransfoXLTokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP
+from pytorch_transformers.tokenization_transfo_xl import TransfoXLTokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP
 
 from.tokenization_tests_commons import create_and_check_tokenizer_commons
 
@@ -61,7 +61,7 @@ class TransfoXLTokenizationTest(unittest.TestCase):
 
     @pytest.mark.slow
     def test_tokenizer_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        cache_dir = "/tmp/pytorch_transformers_test/"
         for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:
             tokenizer = TransfoXLTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
diff --git a/pytorch_pretrained_bert/tests/tokenization_xlm_test.py b/pytorch_transformers/tests/tokenization_xlm_test.py
similarity index 94%
rename from pytorch_pretrained_bert/tests/tokenization_xlm_test.py
rename to pytorch_transformers/tests/tokenization_xlm_test.py
index 3b2db8ea1f..9cc18f3d60 100644
--- a/pytorch_pretrained_bert/tests/tokenization_xlm_test.py
+++ b/pytorch_transformers/tests/tokenization_xlm_test.py
@@ -20,7 +20,7 @@ import json
 import shutil
 import pytest
 
-from pytorch_pretrained_bert.tokenization_xlm import XLMTokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP
+from pytorch_transformers.tokenization_xlm import XLMTokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP
 
 from.tokenization_tests_commons import create_and_check_tokenizer_commons
 
@@ -59,7 +59,7 @@ class XLMTokenizationTest(unittest.TestCase):
 
     @pytest.mark.slow
     def test_tokenizer_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        cache_dir = "/tmp/pytorch_transformers_test/"
         for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:
             tokenizer = XLMTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
diff --git a/pytorch_pretrained_bert/tests/tokenization_xlnet_test.py b/pytorch_transformers/tests/tokenization_xlnet_test.py
similarity index 97%
rename from pytorch_pretrained_bert/tests/tokenization_xlnet_test.py
rename to pytorch_transformers/tests/tokenization_xlnet_test.py
index 9b6dd5a6c4..4dd76e114b 100644
--- a/pytorch_pretrained_bert/tests/tokenization_xlnet_test.py
+++ b/pytorch_transformers/tests/tokenization_xlnet_test.py
@@ -19,7 +19,7 @@ import unittest
 import shutil
 import pytest
 
-from pytorch_pretrained_bert.tokenization_xlnet import (XLNetTokenizer,
+from pytorch_transformers.tokenization_xlnet import (XLNetTokenizer,
                                                         PRETRAINED_VOCAB_ARCHIVE_MAP,
                                                         SPIECE_UNDERLINE)
 
@@ -62,7 +62,7 @@ class XLNetTokenizationTest(unittest.TestCase):
 
     @pytest.mark.slow
     def test_tokenizer_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        cache_dir = "/tmp/pytorch_transformers_test/"
         for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:
             tokenizer = XLNetTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
diff --git a/pytorch_pretrained_bert/tokenization_bert.py b/pytorch_transformers/tokenization_bert.py
similarity index 100%
rename from pytorch_pretrained_bert/tokenization_bert.py
rename to pytorch_transformers/tokenization_bert.py
diff --git a/pytorch_pretrained_bert/tokenization_gpt2.py b/pytorch_transformers/tokenization_gpt2.py
similarity index 100%
rename from pytorch_pretrained_bert/tokenization_gpt2.py
rename to pytorch_transformers/tokenization_gpt2.py
diff --git a/pytorch_pretrained_bert/tokenization_openai.py b/pytorch_transformers/tokenization_openai.py
similarity index 100%
rename from pytorch_pretrained_bert/tokenization_openai.py
rename to pytorch_transformers/tokenization_openai.py
diff --git a/pytorch_pretrained_bert/tokenization_transfo_xl.py b/pytorch_transformers/tokenization_transfo_xl.py
similarity index 100%
rename from pytorch_pretrained_bert/tokenization_transfo_xl.py
rename to pytorch_transformers/tokenization_transfo_xl.py
diff --git a/pytorch_pretrained_bert/tokenization_xlm.py b/pytorch_transformers/tokenization_xlm.py
similarity index 100%
rename from pytorch_pretrained_bert/tokenization_xlm.py
rename to pytorch_transformers/tokenization_xlm.py
diff --git a/pytorch_pretrained_bert/tokenization_xlnet.py b/pytorch_transformers/tokenization_xlnet.py
similarity index 100%
rename from pytorch_pretrained_bert/tokenization_xlnet.py
rename to pytorch_transformers/tokenization_xlnet.py
diff --git a/setup.py b/setup.py
index 28e85a0068..09b8c01ad5 100644
--- a/setup.py
+++ b/setup.py
@@ -37,16 +37,16 @@ from io import open
 from setuptools import find_packages, setup
 
 setup(
-    name="pytorch_pretrained_bert",
-    version="0.6.2",
-    author="Thomas Wolf, Victor Sanh, Tim Rault, Google AI Language Team Authors, Open AI team Authors",
+    name="pytorch_transformers",
+    version="0.7.0",
+    author="Thomas Wolf, Lysandre Debut, Victor Sanh, Tim Rault, Google AI Language Team Authors, Open AI team Authors",
     author_email="thomas@huggingface.co",
     description="PyTorch version of Google AI BERT model with script to load Google pre-trained models",
     long_description=open("README.md", "r", encoding='utf-8').read(),
     long_description_content_type="text/markdown",
     keywords='BERT NLP deep learning google',
     license='Apache',
-    url="https://github.com/huggingface/pytorch-pretrained-BERT",
+    url="https://github.com/huggingface/pytorch-transformers",
     packages=find_packages(exclude=["*.tests", "*.tests.*",
                                     "tests.*", "tests"]),
     install_requires=['torch>=0.4.1',
@@ -58,7 +58,7 @@ setup(
                       'sentencepiece'],
     entry_points={
       'console_scripts': [
-        "pytorch_pretrained_bert=pytorch_pretrained_bert.__main__:main",
+        "pytorch_transformers=pytorch_transformers.__main__:main",
       ]
     },
     # python_requires='>=3.5.0',