From 0bab55d5d52e4d538888980d05d73acc6da6274a Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 5 Jul 2019 11:55:36 +0200
Subject: [PATCH 01/17] [BIG] name change

---
 .circleci/config.yml                          |   8 +-
 .coveragerc                                   |   2 +-
 README.md                                     | 112 +++++++++---------
 docker/Dockerfile                             |   2 +-
 examples/bertology.py                         |   2 +-
 examples/generation_xlnet.py                  |   2 +-
 .../lm_finetuning/finetune_on_pregenerated.py |   8 +-
 .../pregenerate_training_data.py              |   2 +-
 .../lm_finetuning/simple_lm_finetuning.py     |   8 +-
 examples/run_bert_classifier.py               |   8 +-
 examples/run_bert_extract_features.py         |   4 +-
 examples/run_bert_squad.py                    |   8 +-
 examples/run_bert_swag.py                     |   8 +-
 examples/run_gpt2.py                          |   2 +-
 examples/run_openai_gpt.py                    |   2 +-
 examples/run_transfo_xl.py                    |   2 +-
 examples/run_xlnet_classifier.py              |   8 +-
 examples/run_xlnet_squad.py                   |   8 +-
 examples/tests/examples_tests.py              |  50 ++++++++
 examples/utils_squad.py                       |   2 +-
 hubconfs/bert_hubconf.py                      |  38 +++---
 hubconfs/gpt2_hubconf.py                      |  18 +--
 hubconfs/gpt_hubconf.py                       |  18 +--
 hubconfs/transformer_xl_hubconf.py            |  14 +--
 hubconfs/xlm_hubconf.py                       |  16 +--
 hubconfs/xlnet_hubconf.1.py                   |  18 +--
 .../Comparing-TF-and-PT-models-MLM-NSP.ipynb  |   8 +-
 notebooks/Comparing-TF-and-PT-models.ipynb    |   6 +-
 .../__init__.py                               |   2 +-
 .../__main__.py                               |  28 ++---
 .../convert_gpt2_checkpoint_to_pytorch.py     |   2 +-
 .../convert_openai_checkpoint_to_pytorch.py   |   2 +-
 .../convert_tf_checkpoint_to_pytorch.py       |   2 +-
 ...onvert_transfo_xl_checkpoint_to_pytorch.py |   6 +-
 .../convert_xlm_checkpoint_to_pytorch.py      |   4 +-
 .../convert_xlnet_checkpoint_to_pytorch.py    |   2 +-
 .../file_utils.py                             |   2 +-
 .../model_utils.py                            |   0
 .../modeling_bert.py                          |   0
 .../modeling_gpt2.py                          |   0
 .../modeling_openai.py                        |   0
 .../modeling_transfo_xl.py                    |   0
 .../modeling_transfo_xl_utilities.py          |   0
 .../modeling_xlm.py                           |   2 +-
 .../modeling_xlnet.py                         |   0
 .../optimization.py                           |   0
 .../optimization_openai.py                    |   0
 .../tests/__init__.py                         |   0
 .../tests/conftest.py                         |   0
 .../tests/fixtures/input.txt                  |   0
 .../tests/fixtures/sample_text.txt            |   0
 .../tests/fixtures/test_sentencepiece.model   | Bin
 .../tests/model_tests_commons.py              |   2 +-
 .../tests/model_utils_test.py                 |   4 +-
 .../tests/modeling_bert_test.py               |   6 +-
 .../tests/modeling_gpt2_test.py               |   2 +-
 .../tests/modeling_openai_test.py             |   2 +-
 .../tests/modeling_transfo_xl_test.py         |   6 +-
 .../tests/modeling_xlm_test.py                |   6 +-
 .../tests/modeling_xlnet_test.py              |   6 +-
 .../tests/optimization_test.py                |   6 +-
 .../tests/tokenization_bert_test.py           |   4 +-
 .../tests/tokenization_gpt2_test.py           |   4 +-
 .../tests/tokenization_openai_test.py         |   4 +-
 .../tests/tokenization_tests_commons.py       |   0
 .../tests/tokenization_transfo_xl_test.py     |   4 +-
 .../tests/tokenization_xlm_test.py            |   4 +-
 .../tests/tokenization_xlnet_test.py          |   4 +-
 .../tokenization_bert.py                      |   0
 .../tokenization_gpt2.py                      |   0
 .../tokenization_openai.py                    |   0
 .../tokenization_transfo_xl.py                |   0
 .../tokenization_xlm.py                       |   0
 .../tokenization_xlnet.py                     |   0
 setup.py                                      |  10 +-
 75 files changed, 280 insertions(+), 230 deletions(-)
 create mode 100644 examples/tests/examples_tests.py
 rename {pytorch_pretrained_bert => pytorch_transformers}/__init__.py (98%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/__main__.py (72%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/convert_gpt2_checkpoint_to_pytorch.py (97%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/convert_openai_checkpoint_to_pytorch.py (97%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/convert_tf_checkpoint_to_pytorch.py (95%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/convert_transfo_xl_checkpoint_to_pytorch.py (96%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/convert_xlm_checkpoint_to_pytorch.py (93%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/convert_xlnet_checkpoint_to_pytorch.py (98%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/file_utils.py (99%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/model_utils.py (100%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/modeling_bert.py (100%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/modeling_gpt2.py (100%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/modeling_openai.py (100%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/modeling_transfo_xl.py (100%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/modeling_transfo_xl_utilities.py (100%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/modeling_xlm.py (99%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/modeling_xlnet.py (100%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/optimization.py (100%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/optimization_openai.py (100%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/tests/__init__.py (100%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/tests/conftest.py (100%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/tests/fixtures/input.txt (100%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/tests/fixtures/sample_text.txt (100%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/tests/fixtures/test_sentencepiece.model (100%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/tests/model_tests_commons.py (99%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/tests/model_utils_test.py (89%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/tests/modeling_bert_test.py (98%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/tests/modeling_gpt2_test.py (96%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/tests/modeling_openai_test.py (96%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/tests/modeling_transfo_xl_test.py (97%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/tests/modeling_xlm_test.py (97%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/tests/modeling_xlnet_test.py (97%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/tests/optimization_test.py (94%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/tests/tokenization_bert_test.py (97%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/tests/tokenization_gpt2_test.py (94%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/tests/tokenization_openai_test.py (94%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/tests/tokenization_tests_commons.py (100%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/tests/tokenization_transfo_xl_test.py (93%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/tests/tokenization_xlm_test.py (94%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/tests/tokenization_xlnet_test.py (97%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/tokenization_bert.py (100%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/tokenization_gpt2.py (100%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/tokenization_openai.py (100%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/tokenization_transfo_xl.py (100%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/tokenization_xlm.py (100%)
 rename {pytorch_pretrained_bert => pytorch_transformers}/tokenization_xlnet.py (100%)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 858ca001d6..ac23723f98 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -1,7 +1,7 @@
 version: 2
 jobs:
     build_py3:
-        working_directory: ~/pytorch-pretrained-BERT
+        working_directory: ~/pytorch-transformers
         docker:
             - image: circleci/python:3.5
         steps:
@@ -10,11 +10,11 @@ jobs:
             - run: sudo pip install pytest codecov pytest-cov
             - run: sudo pip install spacy ftfy==4.4.3
             - run: sudo python -m spacy download en
-            - run: python -m pytest -sv ./pytorch_pretrained_bert/tests/ --cov
+            - run: python -m pytest -sv ./pytorch_transformers/tests/ --cov
             - run: codecov
         parallelism: 4
     build_py2:
-        working_directory: ~/pytorch-pretrained-BERT
+        working_directory: ~/pytorch-transformers
         docker:
             - image: circleci/python:2.7
         steps:
@@ -23,7 +23,7 @@ jobs:
             - run: sudo pip install pytest codecov pytest-cov
             - run: sudo pip install spacy ftfy==4.4.3
             - run: sudo python -m spacy download en
-            - run: python -m pytest -sv ./pytorch_pretrained_bert/tests/ --cov
+            - run: python -m pytest -sv ./pytorch_transformers/tests/ --cov
             - run: codecov
         parallelism: 4
 workflows:
diff --git a/.coveragerc b/.coveragerc
index fe05dda9a8..9b8c40ecf1 100644
--- a/.coveragerc
+++ b/.coveragerc
@@ -1,5 +1,5 @@
 [run]
-source=pytorch_pretrained_bert
+source=pytorch_transformers
 [report]
 exclude_lines =
     pragma: no cover
diff --git a/README.md b/README.md
index a5234bd9ba..b1e80edc89 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # PyTorch Pretrained BERT: The Big & Extending Repository of pretrained Transformers
 
-[![CircleCI](https://circleci.com/gh/huggingface/pytorch-pretrained-BERT.svg?style=svg)](https://circleci.com/gh/huggingface/pytorch-pretrained-BERT)
+[![CircleCI](https://circleci.com/gh/huggingface/pytorch-transformers.svg?style=svg)](https://circleci.com/gh/huggingface/pytorch-transformers)
 
 This repository contains op-for-op PyTorch reimplementations, pre-trained models and fine-tuning examples for:
 
@@ -47,7 +47,7 @@ This repo was tested on Python 2.7 and 3.5+ (examples are tested only on python
 
 PyTorch pretrained bert can be installed by pip as follows:
 ```bash
-pip install pytorch-pretrained-bert
+pip install pytorch-transformers
 ```
 
 If you want to reproduce the original tokenization process of the `OpenAI GPT` paper, you will need to install `ftfy` (limit to version 4.4.3 if you are using Python 2) and `SpaCy` :
@@ -73,7 +73,7 @@ python -m spacy download en
 
 Again, if you don't install `ftfy` and `SpaCy`, the `OpenAI GPT` tokenizer will default to tokenize using BERT's `BasicTokenizer` followed by Byte-Pair Encoding (which should be fine for most usage).
 
-A series of tests is included in the [tests folder](https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/tests) and can be run using `pytest` (install pytest if needed: `pip install pytest`).
+A series of tests is included in the [tests folder](https://github.com/huggingface/pytorch-transformers/tree/master/tests) and can be run using `pytest` (install pytest if needed: `pip install pytest`).
 
 You can run the tests with the command:
 ```bash
@@ -84,51 +84,51 @@ python -m pytest -sv tests/
 
 This package comprises the following classes that can be imported in Python and are detailed in the [Doc](#doc) section of this readme:
 
-- Eight **Bert** PyTorch models (`torch.nn.Module`) with pre-trained weights (in the [`modeling.py`](./pytorch_pretrained_bert/modeling.py) file):
-  - [`BertModel`](./pytorch_pretrained_bert/modeling.py#L639) - raw BERT Transformer model (**fully pre-trained**),
-  - [`BertForMaskedLM`](./pytorch_pretrained_bert/modeling.py#L793) - BERT Transformer with the pre-trained masked language modeling head on top (**fully pre-trained**),
-  - [`BertForNextSentencePrediction`](./pytorch_pretrained_bert/modeling.py#L854) - BERT Transformer with the pre-trained next sentence prediction classifier on top  (**fully pre-trained**),
-  - [`BertForPreTraining`](./pytorch_pretrained_bert/modeling.py#L722) - BERT Transformer with masked language modeling head and next sentence prediction classifier on top (**fully pre-trained**),
-  - [`BertForSequenceClassification`](./pytorch_pretrained_bert/modeling.py#L916) - BERT Transformer with a sequence classification head on top (BERT Transformer is **pre-trained**, the sequence classification head **is only initialized and has to be trained**),
-  - [`BertForMultipleChoice`](./pytorch_pretrained_bert/modeling.py#L982) - BERT Transformer with a multiple choice head on top (used for task like Swag) (BERT Transformer is **pre-trained**, the multiple choice classification head **is only initialized and has to be trained**),
-  - [`BertForTokenClassification`](./pytorch_pretrained_bert/modeling.py#L1051) - BERT Transformer with a token classification head on top (BERT Transformer is **pre-trained**, the token classification head **is only initialized and has to be trained**),
-  - [`BertForQuestionAnswering`](./pytorch_pretrained_bert/modeling.py#L1124) - BERT Transformer with a token classification head on top (BERT Transformer is **pre-trained**, the token classification head **is only initialized and has to be trained**).
+- Eight **Bert** PyTorch models (`torch.nn.Module`) with pre-trained weights (in the [`modeling.py`](./pytorch_transformers/modeling.py) file):
+  - [`BertModel`](./pytorch_transformers/modeling.py#L639) - raw BERT Transformer model (**fully pre-trained**),
+  - [`BertForMaskedLM`](./pytorch_transformers/modeling.py#L793) - BERT Transformer with the pre-trained masked language modeling head on top (**fully pre-trained**),
+  - [`BertForNextSentencePrediction`](./pytorch_transformers/modeling.py#L854) - BERT Transformer with the pre-trained next sentence prediction classifier on top  (**fully pre-trained**),
+  - [`BertForPreTraining`](./pytorch_transformers/modeling.py#L722) - BERT Transformer with masked language modeling head and next sentence prediction classifier on top (**fully pre-trained**),
+  - [`BertForSequenceClassification`](./pytorch_transformers/modeling.py#L916) - BERT Transformer with a sequence classification head on top (BERT Transformer is **pre-trained**, the sequence classification head **is only initialized and has to be trained**),
+  - [`BertForMultipleChoice`](./pytorch_transformers/modeling.py#L982) - BERT Transformer with a multiple choice head on top (used for task like Swag) (BERT Transformer is **pre-trained**, the multiple choice classification head **is only initialized and has to be trained**),
+  - [`BertForTokenClassification`](./pytorch_transformers/modeling.py#L1051) - BERT Transformer with a token classification head on top (BERT Transformer is **pre-trained**, the token classification head **is only initialized and has to be trained**),
+  - [`BertForQuestionAnswering`](./pytorch_transformers/modeling.py#L1124) - BERT Transformer with a token classification head on top (BERT Transformer is **pre-trained**, the token classification head **is only initialized and has to be trained**).
 
-- Three **OpenAI GPT** PyTorch models (`torch.nn.Module`) with pre-trained weights (in the [`modeling_openai.py`](./pytorch_pretrained_bert/modeling_openai.py) file):
-  - [`OpenAIGPTModel`](./pytorch_pretrained_bert/modeling_openai.py#L536) - raw OpenAI GPT Transformer model (**fully pre-trained**),
-  - [`OpenAIGPTLMHeadModel`](./pytorch_pretrained_bert/modeling_openai.py#L643) - OpenAI GPT Transformer with the tied language modeling head on top (**fully pre-trained**),
-  - [`OpenAIGPTDoubleHeadsModel`](./pytorch_pretrained_bert/modeling_openai.py#L722) - OpenAI GPT Transformer with the tied language modeling head and a multiple choice classification head on top (OpenAI GPT Transformer is **pre-trained**, the multiple choice classification head **is only initialized and has to be trained**),
+- Three **OpenAI GPT** PyTorch models (`torch.nn.Module`) with pre-trained weights (in the [`modeling_openai.py`](./pytorch_transformers/modeling_openai.py) file):
+  - [`OpenAIGPTModel`](./pytorch_transformers/modeling_openai.py#L536) - raw OpenAI GPT Transformer model (**fully pre-trained**),
+  - [`OpenAIGPTLMHeadModel`](./pytorch_transformers/modeling_openai.py#L643) - OpenAI GPT Transformer with the tied language modeling head on top (**fully pre-trained**),
+  - [`OpenAIGPTDoubleHeadsModel`](./pytorch_transformers/modeling_openai.py#L722) - OpenAI GPT Transformer with the tied language modeling head and a multiple choice classification head on top (OpenAI GPT Transformer is **pre-trained**, the multiple choice classification head **is only initialized and has to be trained**),
 
-- Two **Transformer-XL** PyTorch models (`torch.nn.Module`) with pre-trained weights (in the [`modeling_transfo_xl.py`](./pytorch_pretrained_bert/modeling_transfo_xl.py) file):
-  - [`TransfoXLModel`](./pytorch_pretrained_bert/modeling_transfo_xl.py#L983) - Transformer-XL model which outputs the last hidden state and memory cells (**fully pre-trained**),
-  - [`TransfoXLLMHeadModel`](./pytorch_pretrained_bert/modeling_transfo_xl.py#L1260) - Transformer-XL with the tied adaptive softmax head on top for language modeling which outputs the logits/loss and memory cells (**fully pre-trained**),
+- Two **Transformer-XL** PyTorch models (`torch.nn.Module`) with pre-trained weights (in the [`modeling_transfo_xl.py`](./pytorch_transformers/modeling_transfo_xl.py) file):
+  - [`TransfoXLModel`](./pytorch_transformers/modeling_transfo_xl.py#L983) - Transformer-XL model which outputs the last hidden state and memory cells (**fully pre-trained**),
+  - [`TransfoXLLMHeadModel`](./pytorch_transformers/modeling_transfo_xl.py#L1260) - Transformer-XL with the tied adaptive softmax head on top for language modeling which outputs the logits/loss and memory cells (**fully pre-trained**),
 
-- Three **OpenAI GPT-2** PyTorch models (`torch.nn.Module`) with pre-trained weights (in the [`modeling_gpt2.py`](./pytorch_pretrained_bert/modeling_gpt2.py) file):
-  - [`GPT2Model`](./pytorch_pretrained_bert/modeling_gpt2.py#L479) - raw OpenAI GPT-2 Transformer model (**fully pre-trained**),
-  - [`GPT2LMHeadModel`](./pytorch_pretrained_bert/modeling_gpt2.py#L559) - OpenAI GPT-2 Transformer with the tied language modeling head on top (**fully pre-trained**),
-  - [`GPT2DoubleHeadsModel`](./pytorch_pretrained_bert/modeling_gpt2.py#L624) - OpenAI GPT-2 Transformer with the tied language modeling head and a multiple choice classification head on top (OpenAI GPT-2 Transformer is **pre-trained**, the multiple choice classification head **is only initialized and has to be trained**),
+- Three **OpenAI GPT-2** PyTorch models (`torch.nn.Module`) with pre-trained weights (in the [`modeling_gpt2.py`](./pytorch_transformers/modeling_gpt2.py) file):
+  - [`GPT2Model`](./pytorch_transformers/modeling_gpt2.py#L479) - raw OpenAI GPT-2 Transformer model (**fully pre-trained**),
+  - [`GPT2LMHeadModel`](./pytorch_transformers/modeling_gpt2.py#L559) - OpenAI GPT-2 Transformer with the tied language modeling head on top (**fully pre-trained**),
+  - [`GPT2DoubleHeadsModel`](./pytorch_transformers/modeling_gpt2.py#L624) - OpenAI GPT-2 Transformer with the tied language modeling head and a multiple choice classification head on top (OpenAI GPT-2 Transformer is **pre-trained**, the multiple choice classification head **is only initialized and has to be trained**),
 
-- Tokenizers for **BERT** (using word-piece) (in the [`tokenization.py`](./pytorch_pretrained_bert/tokenization.py) file):
+- Tokenizers for **BERT** (using word-piece) (in the [`tokenization.py`](./pytorch_transformers/tokenization.py) file):
   - `BasicTokenizer` - basic tokenization (punctuation splitting, lower casing, etc.),
   - `WordpieceTokenizer` - WordPiece tokenization,
   - `BertTokenizer` - perform end-to-end tokenization, i.e. basic tokenization followed by WordPiece tokenization.
 
-- Tokenizer for **OpenAI GPT** (using Byte-Pair-Encoding) (in the [`tokenization_openai.py`](./pytorch_pretrained_bert/tokenization_openai.py) file):
+- Tokenizer for **OpenAI GPT** (using Byte-Pair-Encoding) (in the [`tokenization_openai.py`](./pytorch_transformers/tokenization_openai.py) file):
   - `OpenAIGPTTokenizer` - perform Byte-Pair-Encoding (BPE) tokenization.
 
-- Tokenizer for **Transformer-XL** (word tokens ordered by frequency for adaptive softmax) (in the [`tokenization_transfo_xl.py`](./pytorch_pretrained_bert/tokenization_transfo_xl.py) file):
+- Tokenizer for **Transformer-XL** (word tokens ordered by frequency for adaptive softmax) (in the [`tokenization_transfo_xl.py`](./pytorch_transformers/tokenization_transfo_xl.py) file):
   - `OpenAIGPTTokenizer` - perform word tokenization and can order words by frequency in a corpus for use in an adaptive softmax.
 
-- Tokenizer for **OpenAI GPT-2** (using byte-level Byte-Pair-Encoding) (in the [`tokenization_gpt2.py`](./pytorch_pretrained_bert/tokenization_gpt2.py) file):
+- Tokenizer for **OpenAI GPT-2** (using byte-level Byte-Pair-Encoding) (in the [`tokenization_gpt2.py`](./pytorch_transformers/tokenization_gpt2.py) file):
   - `GPT2Tokenizer` - perform byte-level Byte-Pair-Encoding (BPE) tokenization.
 
-- Optimizer for **BERT** (in the [`optimization.py`](./pytorch_pretrained_bert/optimization.py) file):
+- Optimizer for **BERT** (in the [`optimization.py`](./pytorch_transformers/optimization.py) file):
   - `BertAdam` - Bert version of Adam algorithm with weight decay fix, warmup and linear decay of the learning rate.
 
-- Optimizer for **OpenAI GPT** (in the [`optimization_openai.py`](./pytorch_pretrained_bert/optimization_openai.py) file):
+- Optimizer for **OpenAI GPT** (in the [`optimization_openai.py`](./pytorch_transformers/optimization_openai.py) file):
   - `OpenAIAdam` - OpenAI GPT version of Adam algorithm with weight decay fix, warmup and linear decay of the learning rate.
 
-- Configuration classes for BERT, OpenAI GPT and Transformer-XL (in the respective [`modeling.py`](./pytorch_pretrained_bert/modeling.py), [`modeling_openai.py`](./pytorch_pretrained_bert/modeling_openai.py), [`modeling_transfo_xl.py`](./pytorch_pretrained_bert/modeling_transfo_xl.py) files):
+- Configuration classes for BERT, OpenAI GPT and Transformer-XL (in the respective [`modeling.py`](./pytorch_transformers/modeling.py), [`modeling_openai.py`](./pytorch_transformers/modeling_openai.py), [`modeling_transfo_xl.py`](./pytorch_transformers/modeling_transfo_xl.py) files):
   - `BertConfig` - Configuration class to store the configuration of a `BertModel` with utilities to read and write from JSON configuration files.
   - `OpenAIGPTConfig` - Configuration class to store the configuration of a `OpenAIGPTModel` with utilities to read and write from JSON configuration files.
   - `GPT2Config` - Configuration class to store the configuration of a `GPT2Model` with utilities to read and write from JSON configuration files.
@@ -175,7 +175,7 @@ First let's prepare a tokenized input with `BertTokenizer`
 
 ```python
 import torch
-from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
+from pytorch_transformers import BertTokenizer, BertModel, BertForMaskedLM
 
 # OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
 import logging
@@ -252,7 +252,7 @@ First let's prepare a tokenized input with `OpenAIGPTTokenizer`
 
 ```python
 import torch
-from pytorch_pretrained_bert import OpenAIGPTTokenizer, OpenAIGPTModel, OpenAIGPTLMHeadModel
+from pytorch_transformers import OpenAIGPTTokenizer, OpenAIGPTModel, OpenAIGPTLMHeadModel
 
 # OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
 import logging
@@ -339,7 +339,7 @@ First let's prepare a tokenized input with `TransfoXLTokenizer`
 
 ```python
 import torch
-from pytorch_pretrained_bert import TransfoXLTokenizer, TransfoXLModel, TransfoXLLMHeadModel
+from pytorch_transformers import TransfoXLTokenizer, TransfoXLModel, TransfoXLLMHeadModel
 
 # OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
 import logging
@@ -414,7 +414,7 @@ First let's prepare a tokenized input with `GPT2Tokenizer`
 
 ```python
 import torch
-from pytorch_pretrained_bert import GPT2Tokenizer, GPT2Model, GPT2LMHeadModel
+from pytorch_transformers import GPT2Tokenizer, GPT2Model, GPT2LMHeadModel
 
 # OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
 import logging
@@ -552,7 +552,7 @@ where
     - `bert_config.json` or `openai_gpt_config.json` a configuration file for the model, and
     - `pytorch_model.bin` a PyTorch dump of a pre-trained instance of `BertForPreTraining`, `OpenAIGPTModel`, `TransfoXLModel`, `GPT2LMHeadModel` (saved with the usual `torch.save()`)
 
-  If `PRE_TRAINED_MODEL_NAME_OR_PATH` is a shortcut name, the pre-trained weights will be downloaded from AWS S3 (see the links [here](pytorch_pretrained_bert/modeling.py)) and stored in a cache folder to avoid future download (the cache folder can be found at `~/.pytorch_pretrained_bert/`).
+  If `PRE_TRAINED_MODEL_NAME_OR_PATH` is a shortcut name, the pre-trained weights will be downloaded from AWS S3 (see the links [here](pytorch_transformers/modeling.py)) and stored in a cache folder to avoid future download (the cache folder can be found at `~/.pytorch_transformers/`).
 
 - `cache_dir` can be an optional path to a specific directory to download and cache the pre-trained model weights. This option is useful in particular when you are using distributed training: to avoid concurrent access to the same weights you can set for example `cache_dir='./pretrained_model_{}'.format(args.local_rank)` (see the section on distributed training for more information).
 - `from_tf`: should we load the weights from a locally saved TensorFlow checkpoint
@@ -586,19 +586,19 @@ model = GPT2Model.from_pretrained('gpt2')
 
 #### Cache directory
 
-`pytorch_pretrained_bert` save the pretrained weights in a cache directory which is located at (in this order of priority):
+`pytorch_transformers` save the pretrained weights in a cache directory which is located at (in this order of priority):
 
 - `cache_dir` optional arguments to the `from_pretrained()` method (see above),
 - shell environment variable `PYTORCH_PRETRAINED_BERT_CACHE`,
-- PyTorch cache home + `/pytorch_pretrained_bert/`
+- PyTorch cache home + `/pytorch_transformers/`
   where PyTorch cache home is defined by (in this order):
   - shell environment variable `ENV_TORCH_HOME`
   - shell environment variable `ENV_XDG_CACHE_HOME` + `/torch/`)
   - default: `~/.cache/torch/`
 
-Usually, if you don't set any specific environment variable, `pytorch_pretrained_bert` cache will be at `~/.cache/torch/pytorch_pretrained_bert/`.
+Usually, if you don't set any specific environment variable, `pytorch_transformers` cache will be at `~/.cache/torch/pytorch_transformers/`.
 
-You can alsways safely delete `pytorch_pretrained_bert` cache but the pretrained model weights and vocabulary files wil have to be re-downloaded from our S3.
+You can alsways safely delete `pytorch_transformers` cache but the pretrained model weights and vocabulary files wil have to be re-downloaded from our S3.
 
 ### Serialization best-practices
 
@@ -621,7 +621,7 @@ The *default filenames* of these files are as follow:
 Here is the recommended way of saving the model, configuration and vocabulary to an `output_dir` directory and reloading the model and tokenizer afterwards:
 
 ```python
-from pytorch_pretrained_bert import WEIGHTS_NAME, CONFIG_NAME
+from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
 
 output_dir = "./models/"
 
@@ -719,7 +719,7 @@ The model can be instantiated with the following arguments:
 The inputs and output are **identical to the TensorFlow model inputs and outputs**.
 
 We detail them here. This model takes as *inputs*:
-[`modeling.py`](./pytorch_pretrained_bert/modeling.py)
+[`modeling.py`](./pytorch_transformers/modeling.py)
 - `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts [`run_bert_extract_features.py`](./examples/run_bert_extract_features.py), [`run_bert_classifier.py`](./examples/run_bert_classifier.py) and [`run_bert_squad.py`](./examples/run_bert_squad.py)), and
 - `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to a `sentence B` token (see BERT paper for more details).
 - `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [0, 1]. It's a mask to be used if some input sequence lengths are smaller than the max input sequence length of the current batch. It's the mask that we typically use for attention when a batch has varying length sentences.
@@ -852,7 +852,7 @@ The model can be instantiated with the following arguments:
 The inputs and output are **identical to the TensorFlow model inputs and outputs**.
 
 We detail them here. This model takes as *inputs*:
-[`modeling_openai.py`](./pytorch_pretrained_bert/modeling_openai.py)
+[`modeling_openai.py`](./pytorch_transformers/modeling_openai.py)
 - `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length] were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, total_tokens_embeddings[
 - `position_ids`: an optional torch.LongTensor with the same shape as input_ids
     with the position indices (selected in the range [0, config.n_positions - 1[.
@@ -905,7 +905,7 @@ Transformer XL use a relative positioning with sinusiodal patterns and adaptive
 - the tokens in the vocabulary have to be sorted to decreasing frequency.
 
 This model takes as *inputs*:
-[`modeling_transfo_xl.py`](./pytorch_pretrained_bert/modeling_transfo_xl.py)
+[`modeling_transfo_xl.py`](./pytorch_transformers/modeling_transfo_xl.py)
 - `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] with the token indices selected in the range [0, self.config.n_token[
 - `mems`: an optional memory of hidden states from previous forward passes as a list (num layers) of hidden states at the entry of each layer. Each hidden states has shape [self.config.mem_len, bsz, self.config.d_model]. Note that the first two dimensions are transposed in `mems` with regards to `input_ids`.
 
@@ -952,7 +952,7 @@ The model can be instantiated with the following arguments:
 The inputs and output are **identical to the TensorFlow model inputs and outputs**.
 
 We detail them here. This model takes as *inputs*:
-[`modeling_gpt2.py`](./pytorch_pretrained_bert/modeling_gpt2.py)
+[`modeling_gpt2.py`](./pytorch_transformers/modeling_gpt2.py)
 - `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length] were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, vocab_size[
 - `position_ids`: an optional torch.LongTensor with the same shape as input_ids
     with the position indices (selected in the range [0, config.n_positions - 1[.
@@ -1020,7 +1020,7 @@ and three methods:
 - `convert_ids_to_tokens(tokens)`: convert a list of `int` indices in a list of `str` tokens in the vocabulary.
 - `save_vocabulary(directory_path)`: save the vocabulary file to `directory_path`. Return the path to the saved vocabulary file: `vocab_file_path`. The vocabulary can be reloaded with `BertTokenizer.from_pretrained('vocab_file_path')` or `BertTokenizer.from_pretrained('directory_path')`.
 
-Please refer to the doc strings and code in [`tokenization.py`](./pytorch_pretrained_bert/tokenization.py) for the details of the `BasicTokenizer` and `WordpieceTokenizer` classes. In general it is recommended to use `BertTokenizer` unless you know what you are doing.
+Please refer to the doc strings and code in [`tokenization.py`](./pytorch_transformers/tokenization.py) for the details of the `BasicTokenizer` and `WordpieceTokenizer` classes. In general it is recommended to use `BertTokenizer` unless you know what you are doing.
 
 #### `OpenAIGPTTokenizer`
 
@@ -1043,7 +1043,7 @@ and five methods:
 - `decode(ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)`: decode a list of `int` indices in a string and do some post-processing if needed: (i) remove special tokens from the output and (ii) clean up tokenization spaces.
 - `save_vocabulary(directory_path)`: save the vocabulary, merge and special tokens files to `directory_path`. Return the path to the three files: `vocab_file_path`, `merge_file_path`, `special_tokens_file_path`. The vocabulary can be reloaded with `OpenAIGPTTokenizer.from_pretrained('directory_path')`.
 
-Please refer to the doc strings and code in [`tokenization_openai.py`](./pytorch_pretrained_bert/tokenization_openai.py) for the details of the `OpenAIGPTTokenizer`.
+Please refer to the doc strings and code in [`tokenization_openai.py`](./pytorch_transformers/tokenization_openai.py) for the details of the `OpenAIGPTTokenizer`.
 
 #### `TransfoXLTokenizer`
 
@@ -1051,7 +1051,7 @@ Please refer to the doc strings and code in [`tokenization_openai.py`](./pytorch
 
 The API is similar to the API of `BertTokenizer` (see above).
 
-Please refer to the doc strings and code in [`tokenization_transfo_xl.py`](./pytorch_pretrained_bert/tokenization_transfo_xl.py) for the details of these additional methods in `TransfoXLTokenizer`.
+Please refer to the doc strings and code in [`tokenization_transfo_xl.py`](./pytorch_transformers/tokenization_transfo_xl.py) for the details of these additional methods in `TransfoXLTokenizer`.
 
 #### `GPT2Tokenizer`
 
@@ -1073,7 +1073,7 @@ and two methods:
 - `decode(tokens)`: convert back a list of `int` tokens in a `str`.
 - `save_vocabulary(directory_path)`: save the vocabulary, merge and special tokens files to `directory_path`. Return the path to the three files: `vocab_file_path`, `merge_file_path`, `special_tokens_file_path`. The vocabulary can be reloaded with `OpenAIGPTTokenizer.from_pretrained('directory_path')`.
 
-Please refer to [`tokenization_gpt2.py`](./pytorch_pretrained_bert/tokenization_gpt2.py) for more details on the `GPT2Tokenizer`.
+Please refer to [`tokenization_gpt2.py`](./pytorch_transformers/tokenization_gpt2.py) for more details on the `GPT2Tokenizer`.
 
 ### Optimizers
 
@@ -1155,7 +1155,7 @@ Here is how to use these techniques in our scripts:
 - **Distributed training**: Distributed training can be activated by supplying an integer greater or equal to 0 to the `--local_rank` argument (see below).
 - **16-bits training**: 16-bits training, also called mixed-precision training, can reduce the memory requirement of your model on the GPU by using half-precision training, basically allowing to double the batch size. If you have a recent GPU (starting from NVIDIA Volta architecture) you should see no decrease in speed. A good introduction to Mixed precision training can be found [here](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) and a full documentation is [here](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html). In our scripts, this option can be activated by setting the `--fp16` flag and you can play with loss scaling using the `--loss_scale` flag (see the previously linked documentation for details on loss scaling). The loss scale can be zero in which case the scale is dynamically adjusted or a positive power of two in which case the scaling is static.
 
-To use 16-bits training and distributed training, you need to install NVIDIA's apex extension [as detailed here](https://github.com/nvidia/apex). You will find more information regarding the internals of `apex` and how to use `apex` in [the doc and the associated repository](https://github.com/nvidia/apex). The results of the tests performed on pytorch-BERT by the NVIDIA team (and my trials at reproducing them) can be consulted in [the relevant PR of the present repository](https://github.com/huggingface/pytorch-pretrained-BERT/pull/116).
+To use 16-bits training and distributed training, you need to install NVIDIA's apex extension [as detailed here](https://github.com/nvidia/apex). You will find more information regarding the internals of `apex` and how to use `apex` in [the doc and the associated repository](https://github.com/nvidia/apex). The results of the tests performed on pytorch-BERT by the NVIDIA team (and my trials at reproducing them) can be consulted in [the relevant PR of the present repository](https://github.com/huggingface/pytorch-transformers/pull/116).
 
 Note: To use *Distributed Training*, you will need to run one training script on each of your machines. This can be done for example by running the following command on each server (see [the above mentioned blog post]((https://medium.com/huggingface/training-larger-batches-practical-tips-on-1-gpu-multi-gpu-distributed-setups-ec88c3e51255)) for more details):
 ```bash
@@ -1660,7 +1660,7 @@ To help you understand and use these features, we have added a specific example
 
 ## Notebooks
 
-We include [three Jupyter Notebooks](https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/notebooks) that can be used to check that the predictions of the PyTorch model are identical to the predictions of the original TensorFlow model.
+We include [three Jupyter Notebooks](https://github.com/huggingface/pytorch-transformers/tree/master/notebooks) that can be used to check that the predictions of the PyTorch model are identical to the predictions of the original TensorFlow model.
 
 - The first NoteBook ([Comparing-TF-and-PT-models.ipynb](./notebooks/Comparing-TF-and-PT-models.ipynb)) extracts the hidden states of a full sequence on each layers of the TensorFlow and the PyTorch models and computes the standard deviation between them. In the given example, we get a standard deviation of 1.5e-7 to 9e-7 on the various hidden state of the models.
 
@@ -1676,7 +1676,7 @@ A command-line interface is provided to convert a TensorFlow checkpoint in a PyT
 
 ### BERT
 
-You can convert any TensorFlow checkpoint for BERT (in particular [the pre-trained models released by Google](https://github.com/google-research/bert#pre-trained-models)) in a PyTorch save file by using the [`convert_tf_checkpoint_to_pytorch.py`](./pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py ) script.
+You can convert any TensorFlow checkpoint for BERT (in particular [the pre-trained models released by Google](https://github.com/google-research/bert#pre-trained-models)) in a PyTorch save file by using the [`convert_tf_checkpoint_to_pytorch.py`](./pytorch_transformers/convert_tf_checkpoint_to_pytorch.py ) script.
 
 This CLI takes as input a TensorFlow checkpoint (three files starting with `bert_model.ckpt`) and the associated configuration file (`bert_config.json`), and creates a PyTorch model for this configuration, loads the weights from the TensorFlow checkpoint in the PyTorch model and saves the resulting model in a standard PyTorch save file that can be imported using `torch.load()` (see examples in [`run_bert_extract_features.py`](./examples/run_bert_extract_features.py), [`run_bert_classifier.py`](./examples/run_bert_classifier.py) and [`run_bert_squad.py`](./examples/run_bert_squad.py)).
 
@@ -1689,7 +1689,7 @@ Here is an example of the conversion process for a pre-trained `BERT-Base Uncase
 ```shell
 export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12
 
-pytorch_pretrained_bert bert \
+pytorch_transformers bert \
   $BERT_BASE_DIR/bert_model.ckpt \
   $BERT_BASE_DIR/bert_config.json \
   $BERT_BASE_DIR/pytorch_model.bin
@@ -1704,7 +1704,7 @@ Here is an example of the conversion process for a pre-trained OpenAI GPT model,
 ```shell
 export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights
 
-pytorch_pretrained_bert gpt \
+pytorch_transformers gpt \
   $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \
   $PYTORCH_DUMP_OUTPUT \
   [OPENAI_GPT_CONFIG]
@@ -1717,7 +1717,7 @@ Here is an example of the conversion process for a pre-trained Transformer-XL mo
 ```shell
 export TRANSFO_XL_CHECKPOINT_FOLDER_PATH=/path/to/transfo/xl/checkpoint
 
-pytorch_pretrained_bert transfo_xl \
+pytorch_transformers transfo_xl \
   $TRANSFO_XL_CHECKPOINT_FOLDER_PATH \
   $PYTORCH_DUMP_OUTPUT \
   [TRANSFO_XL_CONFIG]
@@ -1730,7 +1730,7 @@ Here is an example of the conversion process for a pre-trained OpenAI's GPT-2 mo
 ```shell
 export GPT2_DIR=/path/to/gpt2/checkpoint
 
-pytorch_pretrained_bert gpt2 \
+pytorch_transformers gpt2 \
   $GPT2_DIR/model.ckpt \
   $PYTORCH_DUMP_OUTPUT \
   [GPT2_CONFIG]
@@ -1744,7 +1744,7 @@ Here is an example of the conversion process for a pre-trained XLNet model, fine
 export TRANSFO_XL_CHECKPOINT_PATH=/path/to/xlnet/checkpoint
 export TRANSFO_XL_CONFIG_PATH=/path/to/xlnet/config
 
-pytorch_pretrained_bert xlnet \
+pytorch_transformers xlnet \
   $TRANSFO_XL_CHECKPOINT_PATH \
   $TRANSFO_XL_CONFIG_PATH \
   $PYTORCH_DUMP_OUTPUT \
diff --git a/docker/Dockerfile b/docker/Dockerfile
index e47eb548f9..1a6c6f06f9 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -2,6 +2,6 @@ FROM pytorch/pytorch:latest
 
 RUN git clone https://github.com/NVIDIA/apex.git && cd apex && python setup.py install --cuda_ext --cpp_ext
 
-RUN pip install pytorch-pretrained-bert
+RUN pip install pytorch_transformers
 
 WORKDIR /workspace
\ No newline at end of file
diff --git a/examples/bertology.py b/examples/bertology.py
index 6f7f7c9592..096b1b44fc 100644
--- a/examples/bertology.py
+++ b/examples/bertology.py
@@ -12,7 +12,7 @@ from torch.utils.data import DataLoader, SequentialSampler, TensorDataset, Subse
 from torch.utils.data.distributed import DistributedSampler
 from torch.nn import CrossEntropyLoss, MSELoss
 
-from pytorch_pretrained_bert import BertForSequenceClassification, BertTokenizer
+from pytorch_transformers import BertForSequenceClassification, BertTokenizer
 
 from utils_glue import processors, output_modes, convert_examples_to_features, compute_metrics
 
diff --git a/examples/generation_xlnet.py b/examples/generation_xlnet.py
index e54f6a365f..fe3610cfd1 100644
--- a/examples/generation_xlnet.py
+++ b/examples/generation_xlnet.py
@@ -1,6 +1,6 @@
 import torch
 from torch.nn import functional as F
-from pytorch_pretrained_bert import XLNetModel, XLNetLMHeadModel, XLNetTokenizer
+from pytorch_transformers import XLNetModel, XLNetLMHeadModel, XLNetTokenizer
 
 import logging
 logging.basicConfig(level=logging.INFO)
diff --git a/examples/lm_finetuning/finetune_on_pregenerated.py b/examples/lm_finetuning/finetune_on_pregenerated.py
index 8eda2aa5c5..505cd466f6 100644
--- a/examples/lm_finetuning/finetune_on_pregenerated.py
+++ b/examples/lm_finetuning/finetune_on_pregenerated.py
@@ -13,10 +13,10 @@ from torch.utils.data import DataLoader, Dataset, RandomSampler
 from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm
 
-from pytorch_pretrained_bert import WEIGHTS_NAME, CONFIG_NAME
-from pytorch_pretrained_bert.modeling_bert import BertForPreTraining
-from pytorch_pretrained_bert.tokenization_bert import BertTokenizer
-from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
+from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
+from pytorch_transformers.modeling_bert import BertForPreTraining
+from pytorch_transformers.tokenization_bert import BertTokenizer
+from pytorch_transformers.optimization import BertAdam, WarmupLinearSchedule
 
 InputFeatures = namedtuple("InputFeatures", "input_ids input_mask segment_ids lm_label_ids is_next")
 
diff --git a/examples/lm_finetuning/pregenerate_training_data.py b/examples/lm_finetuning/pregenerate_training_data.py
index c2211c88e6..b79257fd4b 100644
--- a/examples/lm_finetuning/pregenerate_training_data.py
+++ b/examples/lm_finetuning/pregenerate_training_data.py
@@ -5,7 +5,7 @@ from tempfile import TemporaryDirectory
 import shelve
 
 from random import random, randrange, randint, shuffle, choice
-from pytorch_pretrained_bert.tokenization_bert import BertTokenizer
+from pytorch_transformers.tokenization_bert import BertTokenizer
 import numpy as np
 import json
 import collections
diff --git a/examples/lm_finetuning/simple_lm_finetuning.py b/examples/lm_finetuning/simple_lm_finetuning.py
index bcfd138442..3008787cd1 100644
--- a/examples/lm_finetuning/simple_lm_finetuning.py
+++ b/examples/lm_finetuning/simple_lm_finetuning.py
@@ -29,10 +29,10 @@ from torch.utils.data import DataLoader, Dataset, RandomSampler
 from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm, trange
 
-from pytorch_pretrained_bert import WEIGHTS_NAME, CONFIG_NAME
-from pytorch_pretrained_bert.modeling_bert import BertForPreTraining
-from pytorch_pretrained_bert.tokenization_bert import BertTokenizer
-from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
+from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
+from pytorch_transformers.modeling_bert import BertForPreTraining
+from pytorch_transformers.tokenization_bert import BertTokenizer
+from pytorch_transformers.optimization import BertAdam, WarmupLinearSchedule
 
 logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                     datefmt='%m/%d/%Y %H:%M:%S',
diff --git a/examples/run_bert_classifier.py b/examples/run_bert_classifier.py
index 233a7ee5d1..506aecc5b1 100644
--- a/examples/run_bert_classifier.py
+++ b/examples/run_bert_classifier.py
@@ -34,10 +34,10 @@ from torch.nn import CrossEntropyLoss, MSELoss
 
 from tensorboardX import SummaryWriter
 
-from pytorch_pretrained_bert import WEIGHTS_NAME, CONFIG_NAME
-from pytorch_pretrained_bert.modeling_bert import BertForSequenceClassification
-from pytorch_pretrained_bert.tokenization_bert import BertTokenizer
-from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
+from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
+from pytorch_transformers.modeling_bert import BertForSequenceClassification
+from pytorch_transformers.tokenization_bert import BertTokenizer
+from pytorch_transformers.optimization import BertAdam, WarmupLinearSchedule
 
 from utils_glue import processors, output_modes, convert_examples_to_features, compute_metrics
 
diff --git a/examples/run_bert_extract_features.py b/examples/run_bert_extract_features.py
index 2a550c431a..cc7dedd6af 100644
--- a/examples/run_bert_extract_features.py
+++ b/examples/run_bert_extract_features.py
@@ -28,8 +28,8 @@ import torch
 from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
 from torch.utils.data.distributed import DistributedSampler
 
-from pytorch_pretrained_bert.tokenization_bert import BertTokenizer
-from pytorch_pretrained_bert.modeling_bert import BertModel
+from pytorch_transformers.tokenization_bert import BertTokenizer
+from pytorch_transformers.modeling_bert import BertModel
 
 logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s', 
                     datefmt = '%m/%d/%Y %H:%M:%S',
diff --git a/examples/run_bert_squad.py b/examples/run_bert_squad.py
index f8eee9c8eb..c3fdb06316 100644
--- a/examples/run_bert_squad.py
+++ b/examples/run_bert_squad.py
@@ -33,10 +33,10 @@ from tqdm import tqdm, trange
 
 from tensorboardX import SummaryWriter
 
-from pytorch_pretrained_bert import WEIGHTS_NAME, CONFIG_NAME
-from pytorch_pretrained_bert.modeling_bert import BertForQuestionAnswering
-from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
-from pytorch_pretrained_bert.tokenization_bert import BertTokenizer
+from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
+from pytorch_transformers.modeling_bert import BertForQuestionAnswering
+from pytorch_transformers.optimization import BertAdam, WarmupLinearSchedule
+from pytorch_transformers.tokenization_bert import BertTokenizer
 
 from utils_squad import read_squad_examples, convert_examples_to_features, RawResult, write_predictions
 
diff --git a/examples/run_bert_swag.py b/examples/run_bert_swag.py
index 3e45225891..00cd3a7840 100644
--- a/examples/run_bert_swag.py
+++ b/examples/run_bert_swag.py
@@ -32,10 +32,10 @@ from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
 from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm, trange
 
-from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME
-from pytorch_pretrained_bert.modeling_bert import BertForMultipleChoice, BertConfig
-from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
-from pytorch_pretrained_bert.tokenization_bert import BertTokenizer
+from pytorch_transformers.file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME
+from pytorch_transformers.modeling_bert import BertForMultipleChoice, BertConfig
+from pytorch_transformers.optimization import BertAdam, WarmupLinearSchedule
+from pytorch_transformers.tokenization_bert import BertTokenizer
 
 logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                     datefmt = '%m/%d/%Y %H:%M:%S',
diff --git a/examples/run_gpt2.py b/examples/run_gpt2.py
index 8f8208bbcd..a759e449f9 100644
--- a/examples/run_gpt2.py
+++ b/examples/run_gpt2.py
@@ -8,7 +8,7 @@ import torch
 import torch.nn.functional as F
 import numpy as np
 
-from pytorch_pretrained_bert import GPT2LMHeadModel, GPT2Tokenizer
+from pytorch_transformers import GPT2LMHeadModel, GPT2Tokenizer
 
 logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                     datefmt = '%m/%d/%Y %H:%M:%S',
diff --git a/examples/run_openai_gpt.py b/examples/run_openai_gpt.py
index ac5c474491..02b86b3a22 100644
--- a/examples/run_openai_gpt.py
+++ b/examples/run_openai_gpt.py
@@ -39,7 +39,7 @@ import torch
 from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                               TensorDataset)
 
-from pytorch_pretrained_bert import (OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer,
+from pytorch_transformers import (OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer,
                                      OpenAIAdam, cached_path, WEIGHTS_NAME, CONFIG_NAME)
 
 ROCSTORIES_URL = "https://s3.amazonaws.com/datasets.huggingface.co/ROCStories.tar.gz"
diff --git a/examples/run_transfo_xl.py b/examples/run_transfo_xl.py
index 0ea7b32053..fda0d8dc28 100644
--- a/examples/run_transfo_xl.py
+++ b/examples/run_transfo_xl.py
@@ -28,7 +28,7 @@ import math
 
 import torch
 
-from pytorch_pretrained_bert import TransfoXLLMHeadModel, TransfoXLCorpus, TransfoXLTokenizer
+from pytorch_transformers import TransfoXLLMHeadModel, TransfoXLCorpus, TransfoXLTokenizer
 
 logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                     datefmt = '%m/%d/%Y %H:%M:%S',
diff --git a/examples/run_xlnet_classifier.py b/examples/run_xlnet_classifier.py
index e30cad773b..7cf8a8d877 100644
--- a/examples/run_xlnet_classifier.py
+++ b/examples/run_xlnet_classifier.py
@@ -34,10 +34,10 @@ from torch.nn import CrossEntropyLoss, MSELoss
 
 from tensorboardX import SummaryWriter
 
-from pytorch_pretrained_bert import WEIGHTS_NAME, CONFIG_NAME
-from pytorch_pretrained_bert.modeling_xlnet import XLNetForSequenceClassification
-from pytorch_pretrained_bert.tokenization_xlnet import XLNetTokenizer
-from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
+from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
+from pytorch_transformers.modeling_xlnet import XLNetForSequenceClassification
+from pytorch_transformers.tokenization_xlnet import XLNetTokenizer
+from pytorch_transformers.optimization import BertAdam, WarmupLinearSchedule
 
 from utils_glue import processors, output_modes, convert_examples_to_features, compute_metrics
 
diff --git a/examples/run_xlnet_squad.py b/examples/run_xlnet_squad.py
index c299358b79..393fa98abd 100644
--- a/examples/run_xlnet_squad.py
+++ b/examples/run_xlnet_squad.py
@@ -33,10 +33,10 @@ from tqdm import tqdm, trange
 
 from tensorboardX import SummaryWriter
 
-from pytorch_pretrained_bert import WEIGHTS_NAME, CONFIG_NAME
-from pytorch_pretrained_bert.modeling_xlnet import BertForQuestionAnswering
-from pytorch_pretrained_bert.tokenization_xlnet import XLNetTokenizer
-from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
+from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
+from pytorch_transformers.modeling_xlnet import BertForQuestionAnswering
+from pytorch_transformers.tokenization_xlnet import XLNetTokenizer
+from pytorch_transformers.optimization import BertAdam, WarmupLinearSchedule
 
 from utils_squad import read_squad_examples, convert_examples_to_features, RawResult, write_predictions
 
diff --git a/examples/tests/examples_tests.py b/examples/tests/examples_tests.py
new file mode 100644
index 0000000000..120df35f82
--- /dev/null
+++ b/examples/tests/examples_tests.py
@@ -0,0 +1,50 @@
+# coding=utf-8
+# Copyright 2018 HuggingFace Inc..
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import unittest
+import json
+import random
+import shutil
+import pytest
+
+import torch
+
+from pytorch_transformers import PretrainedConfig, PreTrainedModel
+from pytorch_transformers.modeling_bert import BertModel, BertConfig, PRETRAINED_MODEL_ARCHIVE_MAP, PRETRAINED_CONFIG_ARCHIVE_MAP
+
+
+class ModelUtilsTest(unittest.TestCase):
+    def test_model_from_pretrained(self):
+        for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+            config = BertConfig.from_pretrained(model_name)
+            self.assertIsNotNone(config)
+            self.assertIsInstance(config, PretrainedConfig)
+
+            model = BertModel.from_pretrained(model_name)
+            self.assertIsNotNone(model)
+            self.assertIsInstance(model, PreTrainedModel)
+
+            config = BertConfig.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
+            model = BertModel.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
+            self.assertEqual(model.config.output_attentions, True)
+            self.assertEqual(model.config.output_hidden_states, True)
+            self.assertEqual(model.config, config)
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/examples/utils_squad.py b/examples/utils_squad.py
index 0dfecd202c..c858776183 100644
--- a/examples/utils_squad.py
+++ b/examples/utils_squad.py
@@ -24,7 +24,7 @@ import math
 import collections
 from io import open
 
-from pytorch_pretrained_bert.tokenization_bert import BasicTokenizer, whitespace_tokenize
+from pytorch_transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize
 
 logger = logging.getLogger(__name__)
 
diff --git a/hubconfs/bert_hubconf.py b/hubconfs/bert_hubconf.py
index 94c7a18a30..0ee0df6697 100644
--- a/hubconfs/bert_hubconf.py
+++ b/hubconfs/bert_hubconf.py
@@ -1,5 +1,5 @@
-from pytorch_pretrained_bert.tokenization_bert import BertTokenizer
-from pytorch_pretrained_bert.modeling_bert import (
+from pytorch_transformers.tokenization_bert import BertTokenizer
+from pytorch_transformers.modeling_bert import (
         BertModel,
         BertForNextSentencePrediction,
         BertForMaskedLM,
@@ -86,7 +86,7 @@ def bertTokenizer(*args, **kwargs):
     Example:
         >>> import torch
         >>> sentence = 'Hello, World!'
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
         >>> toks = tokenizer.tokenize(sentence)
         ['Hello', '##,', 'World', '##!']
         >>> ids = tokenizer.convert_tokens_to_ids(toks)
@@ -106,7 +106,7 @@ def bertModel(*args, **kwargs):
     Example:
         # Load the tokenizer
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
         #  Prepare tokenized input
         >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
         >>> tokenized_text = tokenizer.tokenize(text)
@@ -115,7 +115,7 @@ def bertModel(*args, **kwargs):
         >>> tokens_tensor = torch.tensor([indexed_tokens])
         >>> segments_tensors = torch.tensor([segments_ids])
         # Load bertModel
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertModel', 'bert-base-cased')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'bertModel', 'bert-base-cased')
         >>> model.eval()
         # Predict hidden states features for each layer
         >>> with torch.no_grad():
@@ -135,7 +135,7 @@ def bertForNextSentencePrediction(*args, **kwargs):
     Example:
         # Load the tokenizer
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
         #  Prepare tokenized input
         >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
         >>> tokenized_text = tokenizer.tokenize(text)
@@ -144,7 +144,7 @@ def bertForNextSentencePrediction(*args, **kwargs):
         >>> tokens_tensor = torch.tensor([indexed_tokens])
         >>> segments_tensors = torch.tensor([segments_ids])
         # Load bertForNextSentencePrediction
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForNextSentencePrediction', 'bert-base-cased')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'bertForNextSentencePrediction', 'bert-base-cased')
         >>> model.eval()
         # Predict the next sentence classification logits
         >>> with torch.no_grad():
@@ -165,7 +165,7 @@ def bertForPreTraining(*args, **kwargs):
     Example:
         # Load the tokenizer
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
         #  Prepare tokenized input
         >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
         >>> tokenized_text = tokenizer.tokenize(text)
@@ -173,7 +173,7 @@ def bertForPreTraining(*args, **kwargs):
         >>> tokens_tensor = torch.tensor([indexed_tokens])
         >>> segments_tensors = torch.tensor([segments_ids])
         # Load bertForPreTraining
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForPreTraining', 'bert-base-cased')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'bertForPreTraining', 'bert-base-cased')
         >>> masked_lm_logits_scores, seq_relationship_logits = model(tokens_tensor, segments_tensors)
     """
     model = BertForPreTraining.from_pretrained(*args, **kwargs)
@@ -189,7 +189,7 @@ def bertForMaskedLM(*args, **kwargs):
     Example:
         # Load the tokenizer
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
         #  Prepare tokenized input
         >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
         >>> tokenized_text = tokenizer.tokenize(text)
@@ -200,7 +200,7 @@ def bertForMaskedLM(*args, **kwargs):
         >>> tokens_tensor = torch.tensor([indexed_tokens])
         >>> segments_tensors = torch.tensor([segments_ids])
         # Load bertForMaskedLM
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForMaskedLM', 'bert-base-cased')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'bertForMaskedLM', 'bert-base-cased')
         >>> model.eval()
         # Predict all tokens
         >>> with torch.no_grad():
@@ -231,7 +231,7 @@ def bertForSequenceClassification(*args, **kwargs):
     Example:
         # Load the tokenizer
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
         #  Prepare tokenized input
         >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
         >>> tokenized_text = tokenizer.tokenize(text)
@@ -240,7 +240,7 @@ def bertForSequenceClassification(*args, **kwargs):
         >>> tokens_tensor = torch.tensor([indexed_tokens])
         >>> segments_tensors = torch.tensor([segments_ids])
         # Load bertForSequenceClassification
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForSequenceClassification', 'bert-base-cased', num_labels=2)
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'bertForSequenceClassification', 'bert-base-cased', num_labels=2)
         >>> model.eval()
         # Predict the sequence classification logits
         >>> with torch.no_grad():
@@ -266,7 +266,7 @@ def bertForMultipleChoice(*args, **kwargs):
     Example:
         # Load the tokenizer
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
         #  Prepare tokenized input
         >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
         >>> tokenized_text = tokenizer.tokenize(text)
@@ -275,7 +275,7 @@ def bertForMultipleChoice(*args, **kwargs):
         >>> tokens_tensor = torch.tensor([indexed_tokens, indexed_tokens]).unsqueeze(0)
         >>> segments_tensors = torch.tensor([segments_ids, segments_ids]).unsqueeze(0)
         # Load bertForMultipleChoice
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForMultipleChoice', 'bert-base-cased', num_choices=2)
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'bertForMultipleChoice', 'bert-base-cased', num_choices=2)
         >>> model.eval()
         # Predict the multiple choice logits
         >>> with torch.no_grad():
@@ -299,7 +299,7 @@ def bertForQuestionAnswering(*args, **kwargs):
     Example:
         # Load the tokenizer
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
         #  Prepare tokenized input
         >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
         >>> tokenized_text = tokenizer.tokenize(text)
@@ -308,7 +308,7 @@ def bertForQuestionAnswering(*args, **kwargs):
         >>> tokens_tensor = torch.tensor([indexed_tokens])
         >>> segments_tensors = torch.tensor([segments_ids])
         # Load bertForQuestionAnswering
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForQuestionAnswering', 'bert-base-cased')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'bertForQuestionAnswering', 'bert-base-cased')
         >>> model.eval()
         # Predict the start and end positions logits
         >>> with torch.no_grad():
@@ -338,7 +338,7 @@ def bertForTokenClassification(*args, **kwargs):
     Example:
         # Load the tokenizer
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
         #  Prepare tokenized input
         >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
         >>> tokenized_text = tokenizer.tokenize(text)
@@ -347,7 +347,7 @@ def bertForTokenClassification(*args, **kwargs):
         >>> tokens_tensor = torch.tensor([indexed_tokens])
         >>> segments_tensors = torch.tensor([segments_ids])
         # Load bertForTokenClassification
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForTokenClassification', 'bert-base-cased', num_labels=2)
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'bertForTokenClassification', 'bert-base-cased', num_labels=2)
         >>> model.eval()
         # Predict the token classification logits
         >>> with torch.no_grad():
diff --git a/hubconfs/gpt2_hubconf.py b/hubconfs/gpt2_hubconf.py
index 3ac8bc72ab..dbaa2cd612 100644
--- a/hubconfs/gpt2_hubconf.py
+++ b/hubconfs/gpt2_hubconf.py
@@ -1,5 +1,5 @@
-from pytorch_pretrained_bert.tokenization_gpt2 import GPT2Tokenizer
-from pytorch_pretrained_bert.modeling_gpt2 import (
+from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer
+from pytorch_transformers.modeling_gpt2 import (
     GPT2Model,
     GPT2LMHeadModel,
     GPT2DoubleHeadsModel
@@ -53,7 +53,7 @@ def gpt2Tokenizer(*args, **kwargs):
 
     Example:
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2Tokenizer', 'gpt2')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')
 
         >>> text = "Who was Jim Henson ?"
         >>> indexed_tokens = tokenizer.encode(tokenized_text)
@@ -72,7 +72,7 @@ def gpt2Model(*args, **kwargs):
     Example:
         # Load the tokenizer
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2Tokenizer', 'gpt2')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')
 
         #  Prepare tokenized input
         >>> text_1 = "Who was Jim Henson ?"
@@ -83,7 +83,7 @@ def gpt2Model(*args, **kwargs):
         >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])
 
         # Load gpt2Model
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2Model', 'gpt2')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Model', 'gpt2')
         >>> model.eval()
 
         # Predict hidden states features for each layer
@@ -105,7 +105,7 @@ def gpt2LMHeadModel(*args, **kwargs):
     Example:
         # Load the tokenizer
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2Tokenizer', 'gpt2')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')
 
         #  Prepare tokenized input
         >>> text_1 = "Who was Jim Henson ?"
@@ -116,7 +116,7 @@ def gpt2LMHeadModel(*args, **kwargs):
         >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])
 
         # Load gpt2LMHeadModel
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2LMHeadModel', 'gpt2')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'gpt2LMHeadModel', 'gpt2')
         >>> model.eval()
 
         # Predict hidden states features for each layer
@@ -144,7 +144,7 @@ def gpt2DoubleHeadsModel(*args, **kwargs):
     Example:
         # Load the tokenizer
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2Tokenizer', 'gpt2')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')
 
         #  Prepare tokenized input
         >>> text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
@@ -157,7 +157,7 @@ def gpt2DoubleHeadsModel(*args, **kwargs):
         >>> mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
 
         # Load gpt2DoubleHeadsModel
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2DoubleHeadsModel', 'gpt2')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'gpt2DoubleHeadsModel', 'gpt2')
         >>> model.eval()
 
         # Predict hidden states features for each layer
diff --git a/hubconfs/gpt_hubconf.py b/hubconfs/gpt_hubconf.py
index f3d03888ae..1683c881fa 100644
--- a/hubconfs/gpt_hubconf.py
+++ b/hubconfs/gpt_hubconf.py
@@ -1,5 +1,5 @@
-from pytorch_pretrained_bert.tokenization_openai import OpenAIGPTTokenizer
-from pytorch_pretrained_bert.modeling_openai import (
+from pytorch_transformers.tokenization_openai import OpenAIGPTTokenizer
+from pytorch_transformers.modeling_openai import (
 	OpenAIGPTModel,
 	OpenAIGPTLMHeadModel,
 	OpenAIGPTDoubleHeadsModel
@@ -77,7 +77,7 @@ def openAIGPTTokenizer(*args, **kwargs):
 
     Example:
 		>>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTTokenizer', 'openai-gpt')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt')
 		
 		>>> text = "Who was Jim Henson ? Jim Henson was a puppeteer"
         >>> tokenized_text = tokenizer.tokenize(text)
@@ -98,7 +98,7 @@ def openAIGPTModel(*args, **kwargs):
     Example:
         # Load the tokenizer
 		>>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTTokenizer', 'openai-gpt')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt')
 
         #  Prepare tokenized input
         >>> text = "Who was Jim Henson ? Jim Henson was a puppeteer"
@@ -107,7 +107,7 @@ def openAIGPTModel(*args, **kwargs):
         >>> tokens_tensor = torch.tensor([indexed_tokens])
 
         # Load openAIGPTModel
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTModel', 'openai-gpt')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTModel', 'openai-gpt')
         >>> model.eval()
 
         # Predict hidden states features for each layer
@@ -127,7 +127,7 @@ def openAIGPTLMHeadModel(*args, **kwargs):
 	Example:
         # Load the tokenizer
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTTokenizer', 'openai-gpt')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt')
 
         #  Prepare tokenized input
         >>> text = "Who was Jim Henson ? Jim Henson was a puppeteer"
@@ -136,7 +136,7 @@ def openAIGPTLMHeadModel(*args, **kwargs):
         >>> tokens_tensor = torch.tensor([indexed_tokens])
 
         # Load openAIGPTLMHeadModel
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTLMHeadModel', 'openai-gpt')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTLMHeadModel', 'openai-gpt')
         >>> model.eval()
 
         # Predict hidden states features for each layer
@@ -162,7 +162,7 @@ def openAIGPTDoubleHeadsModel(*args, **kwargs):
 	Example:
         # Load the tokenizer
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTTokenizer', 'openai-gpt')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt')
 
         #  Prepare tokenized input
         >>> text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
@@ -175,7 +175,7 @@ def openAIGPTDoubleHeadsModel(*args, **kwargs):
         >>> mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
 
         # Load openAIGPTDoubleHeadsModel
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTDoubleHeadsModel', 'openai-gpt')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTDoubleHeadsModel', 'openai-gpt')
         >>> model.eval()
 
         # Predict hidden states features for each layer
diff --git a/hubconfs/transformer_xl_hubconf.py b/hubconfs/transformer_xl_hubconf.py
index d5c697547e..d89db894ad 100644
--- a/hubconfs/transformer_xl_hubconf.py
+++ b/hubconfs/transformer_xl_hubconf.py
@@ -1,5 +1,5 @@
-from pytorch_pretrained_bert.tokenization_transfo_xl import TransfoXLTokenizer
-from pytorch_pretrained_bert.modeling_transfo_xl import (
+from pytorch_transformers.tokenization_transfo_xl import TransfoXLTokenizer
+from pytorch_transformers.modeling_transfo_xl import (
     TransfoXLModel,
     TransfoXLLMHeadModel
 )
@@ -46,7 +46,7 @@ def transformerXLTokenizer(*args, **kwargs):
 
     Example:
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'transformerXLTokenizer', 'transfo-xl-wt103')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLTokenizer', 'transfo-xl-wt103')
         
         >>> text = "Who was Jim Henson ?"
         >>> tokenized_text = tokenizer.tokenize(tokenized_text)
@@ -64,7 +64,7 @@ def transformerXLModel(*args, **kwargs):
     Example:
         # Load the tokenizer
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'transformerXLTokenizer', 'transfo-xl-wt103')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLTokenizer', 'transfo-xl-wt103')
 
         #  Prepare tokenized input
         >>> text_1 = "Who was Jim Henson ?"
@@ -77,7 +77,7 @@ def transformerXLModel(*args, **kwargs):
         >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])
 
         # Load transformerXLModel
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'transformerXLModel', 'transfo-xl-wt103')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLModel', 'transfo-xl-wt103')
         >>> model.eval()
 
         # Predict hidden states features for each layer
@@ -99,7 +99,7 @@ def transformerXLLMHeadModel(*args, **kwargs):
     Example:
         # Load the tokenizer
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'transformerXLTokenizer', 'transfo-xl-wt103')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLTokenizer', 'transfo-xl-wt103')
 
         #  Prepare tokenized input
         >>> text_1 = "Who was Jim Henson ?"
@@ -112,7 +112,7 @@ def transformerXLLMHeadModel(*args, **kwargs):
         >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])
 
         # Load transformerXLLMHeadModel
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'transformerXLLMHeadModel', 'transfo-xl-wt103')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLLMHeadModel', 'transfo-xl-wt103')
         >>> model.eval()
 
         # Predict hidden states features for each layer
diff --git a/hubconfs/xlm_hubconf.py b/hubconfs/xlm_hubconf.py
index 154f875bfb..4f6fd93c24 100644
--- a/hubconfs/xlm_hubconf.py
+++ b/hubconfs/xlm_hubconf.py
@@ -1,5 +1,5 @@
-from pytorch_pretrained_bert.tokenization_xlm import XLMTokenizer
-from pytorch_pretrained_bert.modeling_xlm import (
+from pytorch_transformers.tokenization_xlm import XLMTokenizer
+from pytorch_transformers.modeling_xlm import (
     XLMConfig,
     XLMModel,
     XLMWithLMHeadModel,
@@ -18,7 +18,7 @@ xlm_start_docstring = """
     Example:
         # Load the tokenizer
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlmTokenizer', 'xlm-mlm-en-2048')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlmTokenizer', 'xlm-mlm-en-2048')
 
         #  Prepare tokenized input
         >>> text_1 = "Who was Jim Henson ?"
@@ -77,7 +77,7 @@ def xlmTokenizer(*args, **kwargs):
 
     Example:
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlmTokenizer', 'xlm-mlm-en-2048')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlmTokenizer', 'xlm-mlm-en-2048')
 
         >>> text = "Who was Jim Henson ?"
         >>> indexed_tokens = tokenizer.encode(tokenized_text)
@@ -91,7 +91,7 @@ def xlmTokenizer(*args, **kwargs):
 def xlmModel(*args, **kwargs):
     """
         # Load xlmModel
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlmModel', 'xlm-mlm-en-2048')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'xlmModel', 'xlm-mlm-en-2048')
         >>> model.eval()
 
         # Predict hidden states features for each layer
@@ -116,7 +116,7 @@ def xlmLMHeadModel(*args, **kwargs):
         >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])
 
         # Load xlnetLMHeadModel
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetLMHeadModel', 'xlm-mlm-en-2048')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetLMHeadModel', 'xlm-mlm-en-2048')
         >>> model.eval()
 
         # Predict hidden states features for each layer
@@ -143,7 +143,7 @@ def xlmLMHeadModel(*args, **kwargs):
 #     Example:
 #         # Load the tokenizer
 #         >>> import torch
-#         >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetTokenizer', 'xlm-mlm-en-2048')
+#         >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlm-mlm-en-2048')
 
 #         #  Prepare tokenized input
 #         >>> text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
@@ -156,7 +156,7 @@ def xlmLMHeadModel(*args, **kwargs):
 #         >>> mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
 
 #         # Load xlnetForSequenceClassification
-#         >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetForSequenceClassification', 'xlm-mlm-en-2048')
+#         >>> model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetForSequenceClassification', 'xlm-mlm-en-2048')
 #         >>> model.eval()
 
 #         # Predict sequence classes logits
diff --git a/hubconfs/xlnet_hubconf.1.py b/hubconfs/xlnet_hubconf.1.py
index d3766d04e0..4c5105a241 100644
--- a/hubconfs/xlnet_hubconf.1.py
+++ b/hubconfs/xlnet_hubconf.1.py
@@ -1,5 +1,5 @@
-from pytorch_pretrained_bert.tokenization_xlnet import XLNetTokenizer
-from pytorch_pretrained_bert.modeling_xlnet import (
+from pytorch_transformers.tokenization_xlnet import XLNetTokenizer
+from pytorch_transformers.modeling_xlnet import (
     XLNetConfig,
     XLNetModel,
     XLNetLMHeadModel,
@@ -54,7 +54,7 @@ def xlnetTokenizer(*args, **kwargs):
 
     Example:
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetTokenizer', 'xlnet-large-cased')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
 
         >>> text = "Who was Jim Henson ?"
         >>> indexed_tokens = tokenizer.encode(tokenized_text)
@@ -73,7 +73,7 @@ def xlnetModel(*args, **kwargs):
     Example:
         # Load the tokenizer
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetTokenizer', 'xlnet-large-cased')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
 
         #  Prepare tokenized input
         >>> text_1 = "Who was Jim Henson ?"
@@ -84,7 +84,7 @@ def xlnetModel(*args, **kwargs):
         >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])
 
         # Load xlnetModel
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetModel', 'xlnet-large-cased')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetModel', 'xlnet-large-cased')
         >>> model.eval()
 
         # Predict hidden states features for each layer
@@ -107,7 +107,7 @@ def xlnetLMHeadModel(*args, **kwargs):
     Example:
         # Load the tokenizer
         >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetTokenizer', 'xlnet-large-cased')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
 
         #  Prepare tokenized input
         >>> text_1 = "Who was Jim Henson ?"
@@ -118,7 +118,7 @@ def xlnetLMHeadModel(*args, **kwargs):
         >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])
 
         # Load xlnetLMHeadModel
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetLMHeadModel', 'xlnet-large-cased')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetLMHeadModel', 'xlnet-large-cased')
         >>> model.eval()
 
         # Predict hidden states features for each layer
@@ -145,7 +145,7 @@ def xlnetLMHeadModel(*args, **kwargs):
 #     Example:
 #         # Load the tokenizer
 #         >>> import torch
-#         >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetTokenizer', 'xlnet-large-cased')
+#         >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
 
 #         #  Prepare tokenized input
 #         >>> text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
@@ -158,7 +158,7 @@ def xlnetLMHeadModel(*args, **kwargs):
 #         >>> mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
 
 #         # Load xlnetForSequenceClassification
-#         >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetForSequenceClassification', 'xlnet-large-cased')
+#         >>> model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetForSequenceClassification', 'xlnet-large-cased')
 #         >>> model.eval()
 
 #         # Predict sequence classes logits
diff --git a/notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb b/notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb
index ea7271df96..809f6ea6e0 100644
--- a/notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb
+++ b/notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb
@@ -78,7 +78,7 @@
     "import importlib.util\n",
     "import sys\n",
     "import tensorflow as tf\n",
-    "import pytorch_pretrained_bert as ppb\n",
+    "import pytorch_transformers as ppb\n",
     "\n",
     "def del_all_flags(FLAGS):\n",
     "    flags_dict = FLAGS._flags()    \n",
@@ -3997,9 +3997,9 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "11/16/2018 11:03:05 - INFO - pytorch_pretrained_bert.modeling_bert -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at /Users/thomaswolf/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba\n",
-      "11/16/2018 11:03:05 - INFO - pytorch_pretrained_bert.modeling_bert -   extracting archive file /Users/thomaswolf/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmpaqgsm566\n",
-      "11/16/2018 11:03:08 - INFO - pytorch_pretrained_bert.modeling_bert -   Model config {\n",
+      "11/16/2018 11:03:05 - INFO - pytorch_transformers.modeling_bert -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at /Users/thomaswolf/.pytorch_transformers/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba\n",
+      "11/16/2018 11:03:05 - INFO - pytorch_transformers.modeling_bert -   extracting archive file /Users/thomaswolf/.pytorch_transformers/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmpaqgsm566\n",
+      "11/16/2018 11:03:08 - INFO - pytorch_transformers.modeling_bert -   Model config {\n",
       "  \"attention_probs_dropout_prob\": 0.1,\n",
       "  \"hidden_act\": \"gelu\",\n",
       "  \"hidden_dropout_prob\": 0.1,\n",
diff --git a/notebooks/Comparing-TF-and-PT-models.ipynb b/notebooks/Comparing-TF-and-PT-models.ipynb
index 3e438e2f55..b7382e4652 100644
--- a/notebooks/Comparing-TF-and-PT-models.ipynb
+++ b/notebooks/Comparing-TF-and-PT-models.ipynb
@@ -342,7 +342,7 @@
    "outputs": [],
    "source": [
     "import extract_features\n",
-    "import pytorch_pretrained_bert as ppb\n",
+    "import pytorch_transformers as ppb\n",
     "from extract_features import *"
    ]
   },
@@ -375,8 +375,8 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "11/15/2018 16:21:18 - INFO - pytorch_pretrained_bert.modeling_bert -   loading archive file ../../google_models/uncased_L-12_H-768_A-12/\n",
-      "11/15/2018 16:21:18 - INFO - pytorch_pretrained_bert.modeling_bert -   Model config {\n",
+      "11/15/2018 16:21:18 - INFO - pytorch_transformers.modeling_bert -   loading archive file ../../google_models/uncased_L-12_H-768_A-12/\n",
+      "11/15/2018 16:21:18 - INFO - pytorch_transformers.modeling_bert -   Model config {\n",
       "  \"attention_probs_dropout_prob\": 0.1,\n",
       "  \"hidden_act\": \"gelu\",\n",
       "  \"hidden_dropout_prob\": 0.1,\n",
diff --git a/pytorch_pretrained_bert/__init__.py b/pytorch_transformers/__init__.py
similarity index 98%
rename from pytorch_pretrained_bert/__init__.py
rename to pytorch_transformers/__init__.py
index 23346967ba..cbd007f872 100644
--- a/pytorch_pretrained_bert/__init__.py
+++ b/pytorch_transformers/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.6.2"
+__version__ = "0.7.0"
 from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer
 from .tokenization_openai import OpenAIGPTTokenizer
 from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus)
diff --git a/pytorch_pretrained_bert/__main__.py b/pytorch_transformers/__main__.py
similarity index 72%
rename from pytorch_pretrained_bert/__main__.py
rename to pytorch_transformers/__main__.py
index bb9534a830..95504c1493 100644
--- a/pytorch_pretrained_bert/__main__.py
+++ b/pytorch_transformers/__main__.py
@@ -4,24 +4,24 @@ def main():
     if (len(sys.argv) < 4 or len(sys.argv) > 6) or sys.argv[1] not in ["bert", "gpt", "transfo_xl", "gpt2", "xlnet"]:
         print(
         "Should be used as one of: \n"
-        ">> `pytorch_pretrained_bert bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`, \n"
-        ">> `pytorch_pretrained_bert gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`, \n"
-        ">> `pytorch_pretrained_bert transfo_xl TF_CHECKPOINT_OR_DATASET PYTORCH_DUMP_OUTPUT [TF_CONFIG]` or \n"
-        ">> `pytorch_pretrained_bert gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [GPT2_CONFIG]` or \n"
-        ">> `pytorch_pretrained_bert xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME]`")
+        ">> `pytorch_transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`, \n"
+        ">> `pytorch_transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`, \n"
+        ">> `pytorch_transformers transfo_xl TF_CHECKPOINT_OR_DATASET PYTORCH_DUMP_OUTPUT [TF_CONFIG]` or \n"
+        ">> `pytorch_transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [GPT2_CONFIG]` or \n"
+        ">> `pytorch_transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME]`")
     else:
         if sys.argv[1] == "bert":
             try:
                 from .convert_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch
             except ImportError:
-                print("pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, "
+                print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
                     "In that case, it requires TensorFlow to be installed. Please see "
                     "https://www.tensorflow.org/install/ for installation instructions.")
                 raise
 
             if len(sys.argv) != 5:
                 # pylint: disable=line-too-long
-                print("Should be used as `pytorch_pretrained_bert bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`")
+                print("Should be used as `pytorch_transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`")
             else:
                 PYTORCH_DUMP_OUTPUT = sys.argv.pop()
                 TF_CONFIG = sys.argv.pop()
@@ -31,7 +31,7 @@ def main():
             from .convert_openai_checkpoint_to_pytorch import convert_openai_checkpoint_to_pytorch
             if len(sys.argv) < 4 or len(sys.argv) > 5:
                 # pylint: disable=line-too-long
-                print("Should be used as `pytorch_pretrained_bert gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`")
+                print("Should be used as `pytorch_transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`")
             else:
                 OPENAI_GPT_CHECKPOINT_FOLDER_PATH = sys.argv[2]
                 PYTORCH_DUMP_OUTPUT = sys.argv[3]
@@ -46,13 +46,13 @@ def main():
             try:
                 from .convert_transfo_xl_checkpoint_to_pytorch import convert_transfo_xl_checkpoint_to_pytorch
             except ImportError:
-                print("pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, "
+                print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
                     "In that case, it requires TensorFlow to be installed. Please see "
                     "https://www.tensorflow.org/install/ for installation instructions.")
                 raise
             if len(sys.argv) < 4 or len(sys.argv) > 5:
                 # pylint: disable=line-too-long
-                print("Should be used as `pytorch_pretrained_bert transfo_xl TF_CHECKPOINT/TF_DATASET_FILE PYTORCH_DUMP_OUTPUT [TF_CONFIG]`")
+                print("Should be used as `pytorch_transformers transfo_xl TF_CHECKPOINT/TF_DATASET_FILE PYTORCH_DUMP_OUTPUT [TF_CONFIG]`")
             else:
                 if 'ckpt' in sys.argv[2].lower():
                     TF_CHECKPOINT = sys.argv[2]
@@ -70,14 +70,14 @@ def main():
             try:
                 from .convert_gpt2_checkpoint_to_pytorch import convert_gpt2_checkpoint_to_pytorch
             except ImportError:
-                print("pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, "
+                print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
                     "In that case, it requires TensorFlow to be installed. Please see "
                     "https://www.tensorflow.org/install/ for installation instructions.")
                 raise
 
             if len(sys.argv) < 4 or len(sys.argv) > 5:
                 # pylint: disable=line-too-long
-                print("Should be used as `pytorch_pretrained_bert gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [TF_CONFIG]`")
+                print("Should be used as `pytorch_transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [TF_CONFIG]`")
             else:
                 TF_CHECKPOINT = sys.argv[2]
                 PYTORCH_DUMP_OUTPUT = sys.argv[3]
@@ -90,14 +90,14 @@ def main():
             try:
                 from .convert_xlnet_checkpoint_to_pytorch import convert_xlnet_checkpoint_to_pytorch
             except ImportError:
-                print("pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, "
+                print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
                     "In that case, it requires TensorFlow to be installed. Please see "
                     "https://www.tensorflow.org/install/ for installation instructions.")
                 raise
 
             if len(sys.argv) < 5 or len(sys.argv) > 6:
                 # pylint: disable=line-too-long
-                print("Should be used as `pytorch_pretrained_bert xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME]`")
+                print("Should be used as `pytorch_transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME]`")
             else:
                 TF_CHECKPOINT = sys.argv[2]
                 TF_CONFIG = sys.argv[3]
diff --git a/pytorch_pretrained_bert/convert_gpt2_checkpoint_to_pytorch.py b/pytorch_transformers/convert_gpt2_checkpoint_to_pytorch.py
similarity index 97%
rename from pytorch_pretrained_bert/convert_gpt2_checkpoint_to_pytorch.py
rename to pytorch_transformers/convert_gpt2_checkpoint_to_pytorch.py
index 51d52a6694..86c8264cb5 100755
--- a/pytorch_pretrained_bert/convert_gpt2_checkpoint_to_pytorch.py
+++ b/pytorch_transformers/convert_gpt2_checkpoint_to_pytorch.py
@@ -21,7 +21,7 @@ from io import open
 
 import torch
 
-from pytorch_pretrained_bert.modeling_gpt2 import (CONFIG_NAME, WEIGHTS_NAME,
+from pytorch_transformers.modeling_gpt2 import (CONFIG_NAME, WEIGHTS_NAME,
                                                      GPT2Config,
                                                      GPT2Model,
                                                      load_tf_weights_in_gpt2)
diff --git a/pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py b/pytorch_transformers/convert_openai_checkpoint_to_pytorch.py
similarity index 97%
rename from pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py
rename to pytorch_transformers/convert_openai_checkpoint_to_pytorch.py
index 566008aaa0..68e9dea624 100755
--- a/pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py
+++ b/pytorch_transformers/convert_openai_checkpoint_to_pytorch.py
@@ -21,7 +21,7 @@ from io import open
 
 import torch
 
-from pytorch_pretrained_bert.modeling_openai import (CONFIG_NAME, WEIGHTS_NAME,
+from pytorch_transformers.modeling_openai import (CONFIG_NAME, WEIGHTS_NAME,
                                                      OpenAIGPTConfig,
                                                      OpenAIGPTModel,
                                                      load_tf_weights_in_openai_gpt)
diff --git a/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py b/pytorch_transformers/convert_tf_checkpoint_to_pytorch.py
similarity index 95%
rename from pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py
rename to pytorch_transformers/convert_tf_checkpoint_to_pytorch.py
index 42f7380969..7530d7e12d 100755
--- a/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py
+++ b/pytorch_transformers/convert_tf_checkpoint_to_pytorch.py
@@ -25,7 +25,7 @@ import tensorflow as tf
 import torch
 import numpy as np
 
-from pytorch_pretrained_bert.modeling_bert import BertConfig, BertForPreTraining, load_tf_weights_in_bert
+from pytorch_transformers.modeling_bert import BertConfig, BertForPreTraining, load_tf_weights_in_bert
 
 def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):
     # Initialise PyTorch model
diff --git a/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py b/pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py
similarity index 96%
rename from pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py
rename to pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py
index 8d6b9651c7..2d666a1f03 100755
--- a/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py
+++ b/pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py
@@ -23,13 +23,13 @@ from io import open
 
 import torch
 
-import pytorch_pretrained_bert.tokenization_transfo_xl as data_utils
-from pytorch_pretrained_bert.modeling_transfo_xl import (CONFIG_NAME,
+import pytorch_transformers.tokenization_transfo_xl as data_utils
+from pytorch_transformers.modeling_transfo_xl import (CONFIG_NAME,
                                                          WEIGHTS_NAME,
                                                          TransfoXLConfig,
                                                          TransfoXLLMHeadModel,
                                                          load_tf_weights_in_transfo_xl)
-from pytorch_pretrained_bert.tokenization_transfo_xl import (CORPUS_NAME,
+from pytorch_transformers.tokenization_transfo_xl import (CORPUS_NAME,
                                                              VOCAB_NAME)
 
 if sys.version_info[0] == 2:
diff --git a/pytorch_pretrained_bert/convert_xlm_checkpoint_to_pytorch.py b/pytorch_transformers/convert_xlm_checkpoint_to_pytorch.py
similarity index 93%
rename from pytorch_pretrained_bert/convert_xlm_checkpoint_to_pytorch.py
rename to pytorch_transformers/convert_xlm_checkpoint_to_pytorch.py
index 44a40174b4..0cbe962cea 100755
--- a/pytorch_pretrained_bert/convert_xlm_checkpoint_to_pytorch.py
+++ b/pytorch_transformers/convert_xlm_checkpoint_to_pytorch.py
@@ -23,8 +23,8 @@ from io import open
 import torch
 import numpy
 
-from pytorch_pretrained_bert.modeling_xlm import (CONFIG_NAME, WEIGHTS_NAME, XLMConfig, XLMModel)
-from pytorch_pretrained_bert.tokenization_xlm import MERGES_NAME, VOCAB_NAME
+from pytorch_transformers.modeling_xlm import (CONFIG_NAME, WEIGHTS_NAME, XLMConfig, XLMModel)
+from pytorch_transformers.tokenization_xlm import MERGES_NAME, VOCAB_NAME
 
 
 def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_path):
diff --git a/pytorch_pretrained_bert/convert_xlnet_checkpoint_to_pytorch.py b/pytorch_transformers/convert_xlnet_checkpoint_to_pytorch.py
similarity index 98%
rename from pytorch_pretrained_bert/convert_xlnet_checkpoint_to_pytorch.py
rename to pytorch_transformers/convert_xlnet_checkpoint_to_pytorch.py
index ce4fcc7810..f41db87124 100755
--- a/pytorch_pretrained_bert/convert_xlnet_checkpoint_to_pytorch.py
+++ b/pytorch_transformers/convert_xlnet_checkpoint_to_pytorch.py
@@ -22,7 +22,7 @@ import os
 import argparse
 import torch
 
-from pytorch_pretrained_bert.modeling_xlnet import (CONFIG_NAME, WEIGHTS_NAME,
+from pytorch_transformers.modeling_xlnet import (CONFIG_NAME, WEIGHTS_NAME,
                                                     XLNetConfig,
                                                     XLNetLMHeadModel, XLNetForQuestionAnswering,
                                                     XLNetForSequenceClassification,
diff --git a/pytorch_pretrained_bert/file_utils.py b/pytorch_transformers/file_utils.py
similarity index 99%
rename from pytorch_pretrained_bert/file_utils.py
rename to pytorch_transformers/file_utils.py
index 994f47d57c..1397bd416b 100644
--- a/pytorch_pretrained_bert/file_utils.py
+++ b/pytorch_transformers/file_utils.py
@@ -29,7 +29,7 @@ except ImportError:
     torch_cache_home = os.path.expanduser(
         os.getenv('TORCH_HOME', os.path.join(
             os.getenv('XDG_CACHE_HOME', '~/.cache'), 'torch')))
-default_cache_path = os.path.join(torch_cache_home, 'pytorch_pretrained_bert')
+default_cache_path = os.path.join(torch_cache_home, 'pytorch_transformers')
 
 try:
     from urllib.parse import urlparse
diff --git a/pytorch_pretrained_bert/model_utils.py b/pytorch_transformers/model_utils.py
similarity index 100%
rename from pytorch_pretrained_bert/model_utils.py
rename to pytorch_transformers/model_utils.py
diff --git a/pytorch_pretrained_bert/modeling_bert.py b/pytorch_transformers/modeling_bert.py
similarity index 100%
rename from pytorch_pretrained_bert/modeling_bert.py
rename to pytorch_transformers/modeling_bert.py
diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py
similarity index 100%
rename from pytorch_pretrained_bert/modeling_gpt2.py
rename to pytorch_transformers/modeling_gpt2.py
diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_transformers/modeling_openai.py
similarity index 100%
rename from pytorch_pretrained_bert/modeling_openai.py
rename to pytorch_transformers/modeling_openai.py
diff --git a/pytorch_pretrained_bert/modeling_transfo_xl.py b/pytorch_transformers/modeling_transfo_xl.py
similarity index 100%
rename from pytorch_pretrained_bert/modeling_transfo_xl.py
rename to pytorch_transformers/modeling_transfo_xl.py
diff --git a/pytorch_pretrained_bert/modeling_transfo_xl_utilities.py b/pytorch_transformers/modeling_transfo_xl_utilities.py
similarity index 100%
rename from pytorch_pretrained_bert/modeling_transfo_xl_utilities.py
rename to pytorch_transformers/modeling_transfo_xl_utilities.py
diff --git a/pytorch_pretrained_bert/modeling_xlm.py b/pytorch_transformers/modeling_xlm.py
similarity index 99%
rename from pytorch_pretrained_bert/modeling_xlm.py
rename to pytorch_transformers/modeling_xlm.py
index 9d1775161d..6decba3cce 100644
--- a/pytorch_pretrained_bert/modeling_xlm.py
+++ b/pytorch_transformers/modeling_xlm.py
@@ -204,7 +204,7 @@ def gelu(x):
     GELU activation
     https://arxiv.org/abs/1606.08415
     https://github.com/huggingface/pytorch-openai-transformer-lm/blob/master/model_pytorch.py#L14
-    https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/modeling.py
+    https://github.com/huggingface/pytorch-transformers/blob/master/modeling.py
     """
     # return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
     return 0.5 * x * (1.0 + torch.erf(x / math.sqrt(2.0)))
diff --git a/pytorch_pretrained_bert/modeling_xlnet.py b/pytorch_transformers/modeling_xlnet.py
similarity index 100%
rename from pytorch_pretrained_bert/modeling_xlnet.py
rename to pytorch_transformers/modeling_xlnet.py
diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_transformers/optimization.py
similarity index 100%
rename from pytorch_pretrained_bert/optimization.py
rename to pytorch_transformers/optimization.py
diff --git a/pytorch_pretrained_bert/optimization_openai.py b/pytorch_transformers/optimization_openai.py
similarity index 100%
rename from pytorch_pretrained_bert/optimization_openai.py
rename to pytorch_transformers/optimization_openai.py
diff --git a/pytorch_pretrained_bert/tests/__init__.py b/pytorch_transformers/tests/__init__.py
similarity index 100%
rename from pytorch_pretrained_bert/tests/__init__.py
rename to pytorch_transformers/tests/__init__.py
diff --git a/pytorch_pretrained_bert/tests/conftest.py b/pytorch_transformers/tests/conftest.py
similarity index 100%
rename from pytorch_pretrained_bert/tests/conftest.py
rename to pytorch_transformers/tests/conftest.py
diff --git a/pytorch_pretrained_bert/tests/fixtures/input.txt b/pytorch_transformers/tests/fixtures/input.txt
similarity index 100%
rename from pytorch_pretrained_bert/tests/fixtures/input.txt
rename to pytorch_transformers/tests/fixtures/input.txt
diff --git a/pytorch_pretrained_bert/tests/fixtures/sample_text.txt b/pytorch_transformers/tests/fixtures/sample_text.txt
similarity index 100%
rename from pytorch_pretrained_bert/tests/fixtures/sample_text.txt
rename to pytorch_transformers/tests/fixtures/sample_text.txt
diff --git a/pytorch_pretrained_bert/tests/fixtures/test_sentencepiece.model b/pytorch_transformers/tests/fixtures/test_sentencepiece.model
similarity index 100%
rename from pytorch_pretrained_bert/tests/fixtures/test_sentencepiece.model
rename to pytorch_transformers/tests/fixtures/test_sentencepiece.model
diff --git a/pytorch_pretrained_bert/tests/model_tests_commons.py b/pytorch_transformers/tests/model_tests_commons.py
similarity index 99%
rename from pytorch_pretrained_bert/tests/model_tests_commons.py
rename to pytorch_transformers/tests/model_tests_commons.py
index e7c97a0787..b831f85552 100644
--- a/pytorch_pretrained_bert/tests/model_tests_commons.py
+++ b/pytorch_transformers/tests/model_tests_commons.py
@@ -412,7 +412,7 @@ class GPTModelTester(object):
             [[], []])
 
     def create_and_check_model_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        cache_dir = "/tmp/pytorch_transformers_test/"
         for model_name in list(self.base_model_class.PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
             model = self.base_model_class.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
diff --git a/pytorch_pretrained_bert/tests/model_utils_test.py b/pytorch_transformers/tests/model_utils_test.py
similarity index 89%
rename from pytorch_pretrained_bert/tests/model_utils_test.py
rename to pytorch_transformers/tests/model_utils_test.py
index 59f076fa00..120df35f82 100644
--- a/pytorch_pretrained_bert/tests/model_utils_test.py
+++ b/pytorch_transformers/tests/model_utils_test.py
@@ -25,8 +25,8 @@ import pytest
 
 import torch
 
-from pytorch_pretrained_bert import PretrainedConfig, PreTrainedModel
-from pytorch_pretrained_bert.modeling_bert import BertModel, BertConfig, PRETRAINED_MODEL_ARCHIVE_MAP, PRETRAINED_CONFIG_ARCHIVE_MAP
+from pytorch_transformers import PretrainedConfig, PreTrainedModel
+from pytorch_transformers.modeling_bert import BertModel, BertConfig, PRETRAINED_MODEL_ARCHIVE_MAP, PRETRAINED_CONFIG_ARCHIVE_MAP
 
 
 class ModelUtilsTest(unittest.TestCase):
diff --git a/pytorch_pretrained_bert/tests/modeling_bert_test.py b/pytorch_transformers/tests/modeling_bert_test.py
similarity index 98%
rename from pytorch_pretrained_bert/tests/modeling_bert_test.py
rename to pytorch_transformers/tests/modeling_bert_test.py
index 7a9d49fde7..b140f5e647 100644
--- a/pytorch_pretrained_bert/tests/modeling_bert_test.py
+++ b/pytorch_transformers/tests/modeling_bert_test.py
@@ -20,11 +20,11 @@ import unittest
 import shutil
 import pytest
 
-from pytorch_pretrained_bert import (BertConfig, BertModel, BertForMaskedLM,
+from pytorch_transformers import (BertConfig, BertModel, BertForMaskedLM,
                                      BertForNextSentencePrediction, BertForPreTraining,
                                      BertForQuestionAnswering, BertForSequenceClassification,
                                      BertForTokenClassification, BertForMultipleChoice)
-from pytorch_pretrained_bert.modeling_bert import PRETRAINED_MODEL_ARCHIVE_MAP
+from pytorch_transformers.modeling_bert import PRETRAINED_MODEL_ARCHIVE_MAP
 
 from .model_tests_commons import (create_and_check_commons, ConfigTester, ids_tensor)
 
@@ -266,7 +266,7 @@ class BertModelTest(unittest.TestCase):
 
     @pytest.mark.slow
     def test_model_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        cache_dir = "/tmp/pytorch_transformers_test/"
         for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
             model = BertModel.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
diff --git a/pytorch_pretrained_bert/tests/modeling_gpt2_test.py b/pytorch_transformers/tests/modeling_gpt2_test.py
similarity index 96%
rename from pytorch_pretrained_bert/tests/modeling_gpt2_test.py
rename to pytorch_transformers/tests/modeling_gpt2_test.py
index 122cdf3c7b..4ace52571a 100644
--- a/pytorch_pretrained_bert/tests/modeling_gpt2_test.py
+++ b/pytorch_transformers/tests/modeling_gpt2_test.py
@@ -25,7 +25,7 @@ import pytest
 
 import torch
 
-from pytorch_pretrained_bert import (GPT2Config, GPT2Model,
+from pytorch_transformers import (GPT2Config, GPT2Model,
                                      GPT2LMHeadModel, GPT2DoubleHeadsModel)
 
 from .model_tests_commons import (create_and_check_commons, ConfigTester, GPTModelTester)
diff --git a/pytorch_pretrained_bert/tests/modeling_openai_test.py b/pytorch_transformers/tests/modeling_openai_test.py
similarity index 96%
rename from pytorch_pretrained_bert/tests/modeling_openai_test.py
rename to pytorch_transformers/tests/modeling_openai_test.py
index 627bc564de..fe81157023 100644
--- a/pytorch_pretrained_bert/tests/modeling_openai_test.py
+++ b/pytorch_transformers/tests/modeling_openai_test.py
@@ -21,7 +21,7 @@ import pytest
 
 import torch
 
-from pytorch_pretrained_bert import (OpenAIGPTConfig, OpenAIGPTModel,
+from pytorch_transformers import (OpenAIGPTConfig, OpenAIGPTModel,
                                      OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel)
 
 from .model_tests_commons import (create_and_check_commons, ConfigTester, GPTModelTester)
diff --git a/pytorch_pretrained_bert/tests/modeling_transfo_xl_test.py b/pytorch_transformers/tests/modeling_transfo_xl_test.py
similarity index 97%
rename from pytorch_pretrained_bert/tests/modeling_transfo_xl_test.py
rename to pytorch_transformers/tests/modeling_transfo_xl_test.py
index caeb25b412..d15a19eb64 100644
--- a/pytorch_pretrained_bert/tests/modeling_transfo_xl_test.py
+++ b/pytorch_transformers/tests/modeling_transfo_xl_test.py
@@ -25,8 +25,8 @@ import pytest
 
 import torch
 
-from pytorch_pretrained_bert import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel)
-from pytorch_pretrained_bert.modeling_transfo_xl import PRETRAINED_MODEL_ARCHIVE_MAP
+from pytorch_transformers import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel)
+from pytorch_transformers.modeling_transfo_xl import PRETRAINED_MODEL_ARCHIVE_MAP
 
 from .model_tests_commons import ConfigTester, create_and_check_commons, ids_tensor
 
@@ -184,7 +184,7 @@ class TransfoXLModelTest(unittest.TestCase):
 
     @pytest.mark.slow
     def test_model_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        cache_dir = "/tmp/pytorch_transformers_test/"
         for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
             model = TransfoXLModel.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
diff --git a/pytorch_pretrained_bert/tests/modeling_xlm_test.py b/pytorch_transformers/tests/modeling_xlm_test.py
similarity index 97%
rename from pytorch_pretrained_bert/tests/modeling_xlm_test.py
rename to pytorch_transformers/tests/modeling_xlm_test.py
index 3e442a09fb..8a8905cc31 100644
--- a/pytorch_pretrained_bert/tests/modeling_xlm_test.py
+++ b/pytorch_transformers/tests/modeling_xlm_test.py
@@ -20,8 +20,8 @@ import unittest
 import shutil
 import pytest
 
-from pytorch_pretrained_bert import (XLMConfig, XLMModel, XLMWithLMHeadModel, XLMForQuestionAnswering, XLMForSequenceClassification)
-from pytorch_pretrained_bert.modeling_xlm import PRETRAINED_MODEL_ARCHIVE_MAP
+from pytorch_transformers import (XLMConfig, XLMModel, XLMWithLMHeadModel, XLMForQuestionAnswering, XLMForSequenceClassification)
+from pytorch_transformers.modeling_xlm import PRETRAINED_MODEL_ARCHIVE_MAP
 
 from .model_tests_commons import (create_and_check_commons, ConfigTester, ids_tensor)
 
@@ -250,7 +250,7 @@ class XLMModelTest(unittest.TestCase):
 
     @pytest.mark.slow
     def test_model_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        cache_dir = "/tmp/pytorch_transformers_test/"
         for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
             model = XLMModel.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
diff --git a/pytorch_pretrained_bert/tests/modeling_xlnet_test.py b/pytorch_transformers/tests/modeling_xlnet_test.py
similarity index 97%
rename from pytorch_pretrained_bert/tests/modeling_xlnet_test.py
rename to pytorch_transformers/tests/modeling_xlnet_test.py
index 58617cf7b9..b9d55a26c7 100644
--- a/pytorch_pretrained_bert/tests/modeling_xlnet_test.py
+++ b/pytorch_transformers/tests/modeling_xlnet_test.py
@@ -25,8 +25,8 @@ import pytest
 
 import torch
 
-from pytorch_pretrained_bert import (XLNetConfig, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering)
-from pytorch_pretrained_bert.modeling_xlnet import PRETRAINED_MODEL_ARCHIVE_MAP
+from pytorch_transformers import (XLNetConfig, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering)
+from pytorch_transformers.modeling_xlnet import PRETRAINED_MODEL_ARCHIVE_MAP
 
 from .model_tests_commons import ConfigTester, create_and_check_commons, ids_tensor
 
@@ -278,7 +278,7 @@ class XLNetModelTest(unittest.TestCase):
 
     @pytest.mark.slow
     def test_model_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        cache_dir = "/tmp/pytorch_transformers_test/"
         for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
             model = XLNetModel.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
diff --git a/pytorch_pretrained_bert/tests/optimization_test.py b/pytorch_transformers/tests/optimization_test.py
similarity index 94%
rename from pytorch_pretrained_bert/tests/optimization_test.py
rename to pytorch_transformers/tests/optimization_test.py
index c6924bd4bc..dfbbd44b6e 100644
--- a/pytorch_pretrained_bert/tests/optimization_test.py
+++ b/pytorch_transformers/tests/optimization_test.py
@@ -20,9 +20,9 @@ import unittest
 
 import torch
 
-from pytorch_pretrained_bert import BertAdam
-from pytorch_pretrained_bert import OpenAIAdam
-from pytorch_pretrained_bert.optimization import ConstantLR, WarmupLinearSchedule, WarmupConstantSchedule, \
+from pytorch_transformers import BertAdam
+from pytorch_transformers import OpenAIAdam
+from pytorch_transformers.optimization import ConstantLR, WarmupLinearSchedule, WarmupConstantSchedule, \
     WarmupCosineWithWarmupRestartsSchedule, WarmupCosineWithHardRestartsSchedule, WarmupCosineSchedule
 import numpy as np
 
diff --git a/pytorch_pretrained_bert/tests/tokenization_bert_test.py b/pytorch_transformers/tests/tokenization_bert_test.py
similarity index 97%
rename from pytorch_pretrained_bert/tests/tokenization_bert_test.py
rename to pytorch_transformers/tests/tokenization_bert_test.py
index 3d0b4323b2..59a87a4cb9 100644
--- a/pytorch_pretrained_bert/tests/tokenization_bert_test.py
+++ b/pytorch_transformers/tests/tokenization_bert_test.py
@@ -20,7 +20,7 @@ from io import open
 import shutil
 import pytest
 
-from pytorch_pretrained_bert.tokenization_bert import (BasicTokenizer,
+from pytorch_transformers.tokenization_bert import (BasicTokenizer,
                                                   BertTokenizer,
                                                   WordpieceTokenizer,
                                                   _is_control, _is_punctuation,
@@ -51,7 +51,7 @@ class TokenizationTest(unittest.TestCase):
 
     @pytest.mark.slow
     def test_tokenizer_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        cache_dir = "/tmp/pytorch_transformers_test/"
         for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:
             tokenizer = BertTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
diff --git a/pytorch_pretrained_bert/tests/tokenization_gpt2_test.py b/pytorch_transformers/tests/tokenization_gpt2_test.py
similarity index 94%
rename from pytorch_pretrained_bert/tests/tokenization_gpt2_test.py
rename to pytorch_transformers/tests/tokenization_gpt2_test.py
index 70f69a1f23..c6d926bdd4 100644
--- a/pytorch_pretrained_bert/tests/tokenization_gpt2_test.py
+++ b/pytorch_transformers/tests/tokenization_gpt2_test.py
@@ -20,7 +20,7 @@ import json
 import shutil
 import pytest
 
-from pytorch_pretrained_bert.tokenization_gpt2 import GPT2Tokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP
+from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP
 
 from .tokenization_tests_commons import create_and_check_tokenizer_commons
 
@@ -58,7 +58,7 @@ class GPT2TokenizationTest(unittest.TestCase):
 
     # @pytest.mark.slow
     def test_tokenizer_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        cache_dir = "/tmp/pytorch_transformers_test/"
         for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:
             tokenizer = GPT2Tokenizer.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
diff --git a/pytorch_pretrained_bert/tests/tokenization_openai_test.py b/pytorch_transformers/tests/tokenization_openai_test.py
similarity index 94%
rename from pytorch_pretrained_bert/tests/tokenization_openai_test.py
rename to pytorch_transformers/tests/tokenization_openai_test.py
index 6ae72858a7..38315f927b 100644
--- a/pytorch_pretrained_bert/tests/tokenization_openai_test.py
+++ b/pytorch_transformers/tests/tokenization_openai_test.py
@@ -20,7 +20,7 @@ import json
 import shutil
 import pytest
 
-from pytorch_pretrained_bert.tokenization_openai import OpenAIGPTTokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP
+from pytorch_transformers.tokenization_openai import OpenAIGPTTokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP
 
 from.tokenization_tests_commons import create_and_check_tokenizer_commons
 
@@ -60,7 +60,7 @@ class OpenAIGPTTokenizationTest(unittest.TestCase):
 
     @pytest.mark.slow
     def test_tokenizer_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        cache_dir = "/tmp/pytorch_transformers_test/"
         for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:
             tokenizer = OpenAIGPTTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
diff --git a/pytorch_pretrained_bert/tests/tokenization_tests_commons.py b/pytorch_transformers/tests/tokenization_tests_commons.py
similarity index 100%
rename from pytorch_pretrained_bert/tests/tokenization_tests_commons.py
rename to pytorch_transformers/tests/tokenization_tests_commons.py
diff --git a/pytorch_pretrained_bert/tests/tokenization_transfo_xl_test.py b/pytorch_transformers/tests/tokenization_transfo_xl_test.py
similarity index 93%
rename from pytorch_pretrained_bert/tests/tokenization_transfo_xl_test.py
rename to pytorch_transformers/tests/tokenization_transfo_xl_test.py
index a5ff30ab6e..f744e319c8 100644
--- a/pytorch_pretrained_bert/tests/tokenization_transfo_xl_test.py
+++ b/pytorch_transformers/tests/tokenization_transfo_xl_test.py
@@ -20,7 +20,7 @@ from io import open
 import shutil
 import pytest
 
-from pytorch_pretrained_bert.tokenization_transfo_xl import TransfoXLTokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP
+from pytorch_transformers.tokenization_transfo_xl import TransfoXLTokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP
 
 from.tokenization_tests_commons import create_and_check_tokenizer_commons
 
@@ -61,7 +61,7 @@ class TransfoXLTokenizationTest(unittest.TestCase):
 
     @pytest.mark.slow
     def test_tokenizer_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        cache_dir = "/tmp/pytorch_transformers_test/"
         for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:
             tokenizer = TransfoXLTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
diff --git a/pytorch_pretrained_bert/tests/tokenization_xlm_test.py b/pytorch_transformers/tests/tokenization_xlm_test.py
similarity index 94%
rename from pytorch_pretrained_bert/tests/tokenization_xlm_test.py
rename to pytorch_transformers/tests/tokenization_xlm_test.py
index 3b2db8ea1f..9cc18f3d60 100644
--- a/pytorch_pretrained_bert/tests/tokenization_xlm_test.py
+++ b/pytorch_transformers/tests/tokenization_xlm_test.py
@@ -20,7 +20,7 @@ import json
 import shutil
 import pytest
 
-from pytorch_pretrained_bert.tokenization_xlm import XLMTokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP
+from pytorch_transformers.tokenization_xlm import XLMTokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP
 
 from.tokenization_tests_commons import create_and_check_tokenizer_commons
 
@@ -59,7 +59,7 @@ class XLMTokenizationTest(unittest.TestCase):
 
     @pytest.mark.slow
     def test_tokenizer_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        cache_dir = "/tmp/pytorch_transformers_test/"
         for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:
             tokenizer = XLMTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
diff --git a/pytorch_pretrained_bert/tests/tokenization_xlnet_test.py b/pytorch_transformers/tests/tokenization_xlnet_test.py
similarity index 97%
rename from pytorch_pretrained_bert/tests/tokenization_xlnet_test.py
rename to pytorch_transformers/tests/tokenization_xlnet_test.py
index 9b6dd5a6c4..4dd76e114b 100644
--- a/pytorch_pretrained_bert/tests/tokenization_xlnet_test.py
+++ b/pytorch_transformers/tests/tokenization_xlnet_test.py
@@ -19,7 +19,7 @@ import unittest
 import shutil
 import pytest
 
-from pytorch_pretrained_bert.tokenization_xlnet import (XLNetTokenizer,
+from pytorch_transformers.tokenization_xlnet import (XLNetTokenizer,
                                                         PRETRAINED_VOCAB_ARCHIVE_MAP,
                                                         SPIECE_UNDERLINE)
 
@@ -62,7 +62,7 @@ class XLNetTokenizationTest(unittest.TestCase):
 
     @pytest.mark.slow
     def test_tokenizer_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
+        cache_dir = "/tmp/pytorch_transformers_test/"
         for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:
             tokenizer = XLNetTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
diff --git a/pytorch_pretrained_bert/tokenization_bert.py b/pytorch_transformers/tokenization_bert.py
similarity index 100%
rename from pytorch_pretrained_bert/tokenization_bert.py
rename to pytorch_transformers/tokenization_bert.py
diff --git a/pytorch_pretrained_bert/tokenization_gpt2.py b/pytorch_transformers/tokenization_gpt2.py
similarity index 100%
rename from pytorch_pretrained_bert/tokenization_gpt2.py
rename to pytorch_transformers/tokenization_gpt2.py
diff --git a/pytorch_pretrained_bert/tokenization_openai.py b/pytorch_transformers/tokenization_openai.py
similarity index 100%
rename from pytorch_pretrained_bert/tokenization_openai.py
rename to pytorch_transformers/tokenization_openai.py
diff --git a/pytorch_pretrained_bert/tokenization_transfo_xl.py b/pytorch_transformers/tokenization_transfo_xl.py
similarity index 100%
rename from pytorch_pretrained_bert/tokenization_transfo_xl.py
rename to pytorch_transformers/tokenization_transfo_xl.py
diff --git a/pytorch_pretrained_bert/tokenization_xlm.py b/pytorch_transformers/tokenization_xlm.py
similarity index 100%
rename from pytorch_pretrained_bert/tokenization_xlm.py
rename to pytorch_transformers/tokenization_xlm.py
diff --git a/pytorch_pretrained_bert/tokenization_xlnet.py b/pytorch_transformers/tokenization_xlnet.py
similarity index 100%
rename from pytorch_pretrained_bert/tokenization_xlnet.py
rename to pytorch_transformers/tokenization_xlnet.py
diff --git a/setup.py b/setup.py
index 28e85a0068..09b8c01ad5 100644
--- a/setup.py
+++ b/setup.py
@@ -37,16 +37,16 @@ from io import open
 from setuptools import find_packages, setup
 
 setup(
-    name="pytorch_pretrained_bert",
-    version="0.6.2",
-    author="Thomas Wolf, Victor Sanh, Tim Rault, Google AI Language Team Authors, Open AI team Authors",
+    name="pytorch_transformers",
+    version="0.7.0",
+    author="Thomas Wolf, Lysandre Debut, Victor Sanh, Tim Rault, Google AI Language Team Authors, Open AI team Authors",
     author_email="thomas@huggingface.co",
     description="PyTorch version of Google AI BERT model with script to load Google pre-trained models",
     long_description=open("README.md", "r", encoding='utf-8').read(),
     long_description_content_type="text/markdown",
     keywords='BERT NLP deep learning google',
     license='Apache',
-    url="https://github.com/huggingface/pytorch-pretrained-BERT",
+    url="https://github.com/huggingface/pytorch-transformers",
     packages=find_packages(exclude=["*.tests", "*.tests.*",
                                     "tests.*", "tests"]),
     install_requires=['torch>=0.4.1',
@@ -58,7 +58,7 @@ setup(
                       'sentencepiece'],
     entry_points={
       'console_scripts': [
-        "pytorch_pretrained_bert=pytorch_pretrained_bert.__main__:main",
+        "pytorch_transformers=pytorch_transformers.__main__:main",
       ]
     },
     # python_requires='>=3.5.0',

From 0231ba291e585cca2d01b723202d40ebdd5541b7 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 5 Jul 2019 11:59:04 +0200
Subject: [PATCH 02/17] circle-ci

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index b1e80edc89..77a7a9a88a 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # PyTorch Pretrained BERT: The Big & Extending Repository of pretrained Transformers
 
-[![CircleCI](https://circleci.com/gh/huggingface/pytorch-transformers.svg?style=svg)](https://circleci.com/gh/huggingface/pytorch-transformers)
+[![CircleCI](https://circleci.com/gh/huggingface/pytorch-pretrained-bert.svg?style=svg)](https://circleci.com/gh/huggingface/pytorch-pretrained-bert)
 
 This repository contains op-for-op PyTorch reimplementations, pre-trained models and fine-tuning examples for:
 

From eb91f6437e539f2434b43d6107b8a021453c3f0d Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 5 Jul 2019 12:30:15 +0200
Subject: [PATCH 03/17] update readme and setup

---
 README.md | 130 ++++++++++++++++++++++++++++++++++++------------------
 setup.py  |   4 +-
 2 files changed, 89 insertions(+), 45 deletions(-)

diff --git a/README.md b/README.md
index 77a7a9a88a..ef0d3ba3bc 100644
--- a/README.md
+++ b/README.md
@@ -2,34 +2,38 @@
 
 [![CircleCI](https://circleci.com/gh/huggingface/pytorch-pretrained-bert.svg?style=svg)](https://circleci.com/gh/huggingface/pytorch-pretrained-bert)
 
-This repository contains op-for-op PyTorch reimplementations, pre-trained models and fine-tuning examples for:
+This repository contains op-for-op PyTorch implementations, pre-trained models and fine-tuning examples for:
 
 - [Google's BERT model](https://github.com/google-research/bert),
 - [OpenAI's GPT model](https://github.com/openai/finetune-transformer-lm),
-- [Google/CMU's Transformer-XL model](https://github.com/kimiyoung/transformer-xl), and
 - [OpenAI's GPT-2 model](https://blog.openai.com/better-language-models/).
+- [Google/CMU's Transformer-XL model](https://github.com/kimiyoung/transformer-xl), and
+- [Google/CMU's XLNet model](https://github.com/zihangdai/xlnet/).
+- [Facebook's XLM model](https://github.com/facebookresearch/XLM/).
 
 These implementations have been tested on several datasets (see the examples) and should match the performances of the associated TensorFlow implementations (e.g. ~91 F1 on SQuAD for BERT, ~88 F1 on RocStories for OpenAI GPT and ~18.3 perplexity on WikiText 103 for the Transformer-XL). You can find more details in the [Examples](#examples) section below.
 
 Here are some information on these models:
 
-**BERT** was released together with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
-This PyTorch implementation of BERT is provided with [Google's pre-trained models](https://github.com/google-research/bert), examples, notebooks and a command-line interface to load any pre-trained TensorFlow checkpoint for BERT is also provided.
+**BERT** was released together with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova. This PyTorch implementation of BERT is provided with [Google's pre-trained models](https://github.com/google-research/bert), examples, notebooks and a command-line interface to load any pre-trained TensorFlow checkpoint for BERT is also provided.
 
-**OpenAI GPT** was released together with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
-This PyTorch implementation of OpenAI GPT is an adaptation of the [PyTorch implementation by HuggingFace](https://github.com/huggingface/pytorch-openai-transformer-lm) and is provided with [OpenAI's pre-trained model](https://github.com/openai/finetune-transformer-lm) and a command-line interface that was used to convert the pre-trained NumPy checkpoint in PyTorch.
+**OpenAI GPT** was released together with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever. This PyTorch implementation of OpenAI GPT is an adaptation of the [PyTorch implementation by HuggingFace](https://github.com/huggingface/pytorch-openai-transformer-lm) and is provided with [OpenAI's pre-trained model](https://github.com/openai/finetune-transformer-lm) and a command-line interface that was used to convert the pre-trained NumPy checkpoint in PyTorch.
 
-**Google/CMU's Transformer-XL** was released together with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](http://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
-This PyTorch implementation of Transformer-XL is an adaptation of the original [PyTorch implementation](https://github.com/kimiyoung/transformer-xl) which has been slightly modified to match the performances of the TensorFlow implementation and allow to re-use the pretrained weights. A command-line interface is provided to convert TensorFlow checkpoints in PyTorch models.
+**OpenAI GPT-2** was released together with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**. This PyTorch implementation of OpenAI GPT-2 is an adaptation of the [OpenAI's implementation](https://github.com/openai/gpt-2) and is provided with [OpenAI's pre-trained model](https://github.com/openai/gpt-2) and a command-line interface that was used to convert the TensorFlow checkpoint in PyTorch.
 
-**OpenAI GPT-2** was released together with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
-This PyTorch implementation of OpenAI GPT-2 is an adaptation of the [OpenAI's implementation](https://github.com/openai/gpt-2) and is provided with [OpenAI's pre-trained model](https://github.com/openai/gpt-2) and a command-line interface that was used to convert the TensorFlow checkpoint in PyTorch.
+**Google/CMU's Transformer-XL** was released together with the paper [​XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
+This PyTorch implementation of XLNet is an adaptation of the original [PyTorch implementation](https://github.com/kimiyoung/transformer-xl) which has been slightly modified to match the performances of the TensorFlow implementation and allow to re-use the pretrained weights. A command-line interface is provided to convert TensorFlow checkpoints in PyTorch models.
 
+**Google/CMU's XLNet** was released together with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](http://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
+This PyTorch implementation of XLNet is provided with [Google/CMU's pre-trained models](https://github.com/zihangdai/xlnet) and examples. A command-line interface is provided to convert TensorFlow checkpoints in PyTorch models.
+
+**Facebook's XLM** was released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
+This PyTorch implementation of XLM is an adaptation of the original [PyTorch implementation](https://github.com/facebookresearch/XLM). A command-line interface is provided to convert original PyTorch checkpoints in PyTorch models according to the present repository.
 
 ## Content
 
 | Section | Description |
-|-|-|
+| - | - |
 | [Installation](#installation) | How to install the package |
 | [Overview](#overview) | Overview of the package |
 | [Usage](#usage) | Quickstart examples |
@@ -46,11 +50,13 @@ This repo was tested on Python 2.7 and 3.5+ (examples are tested only on python
 ### With pip
 
 PyTorch pretrained bert can be installed by pip as follows:
+
 ```bash
 pip install pytorch-transformers
 ```
 
 If you want to reproduce the original tokenization process of the `OpenAI GPT` paper, you will need to install `ftfy` (limit to version 4.4.3 if you are using Python 2) and `SpaCy` :
+
 ```bash
 pip install spacy ftfy==4.4.3
 python -m spacy download en
@@ -61,11 +67,13 @@ If you don't install `ftfy` and `SpaCy`, the `OpenAI GPT` tokenizer will default
 ### From source
 
 Clone the repository and run:
+
 ```bash
 pip install [--editable] .
 ```
 
 Here also, if you want to reproduce the original tokenization process of the `OpenAI GPT` model, you will need to install `ftfy` (limit to version 4.4.3 if you are using Python 2) and `SpaCy` :
+
 ```bash
 pip install spacy ftfy==4.4.3
 python -m spacy download en
@@ -76,6 +84,7 @@ Again, if you don't install `ftfy` and `SpaCy`, the `OpenAI GPT` tokenizer will
 A series of tests is included in the [tests folder](https://github.com/huggingface/pytorch-transformers/tree/master/tests) and can be run using `pytest` (install pytest if needed: `pip install pytest`).
 
 You can run the tests with the command:
+
 ```bash
 python -m pytest -sv tests/
 ```
@@ -500,7 +509,6 @@ with torch.no_grad():
     lm_logits, multiple_choice_logits, past = model(tokens_tensor, mc_token_ids)
 ```
 
-
 ## Doc
 
 Here is a detailed documentation of the classes in the package and how to use them:
@@ -559,12 +567,12 @@ where
 - `state_dict`: an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models
 - `*inputs`, `**kwargs`: additional input for the specific Bert class (ex: num_labels for BertForSequenceClassification)
 
-
 `Uncased` means that the text has been lowercased before WordPiece tokenization, e.g., `John Smith` becomes `john smith`. The Uncased model also strips out any accent markers. `Cased` means that the true case and accent markers are preserved. Typically, the Uncased model is better unless you know that case information is important for your task (e.g., Named Entity Recognition or Part-of-Speech tagging). For information about the Multilingual and Chinese model, see the [Multilingual README](https://github.com/google-research/bert/blob/master/multilingual.md) or the original TensorFlow repository.
 
 **When using an `uncased model`, make sure to pass `--do_lower_case` to the example training scripts (or pass `do_lower_case=True` to FullTokenizer if you're using your own script and loading the tokenizer your-self.).**
 
 Examples:
+
 ```python
 # BERT
 tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True, do_basic_tokenize=True)
@@ -720,6 +728,7 @@ The inputs and output are **identical to the TensorFlow model inputs and outputs
 
 We detail them here. This model takes as *inputs*:
 [`modeling.py`](./pytorch_transformers/modeling.py)
+
 - `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts [`run_bert_extract_features.py`](./examples/run_bert_extract_features.py), [`run_bert_classifier.py`](./examples/run_bert_classifier.py) and [`run_bert_squad.py`](./examples/run_bert_squad.py)), and
 - `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to a `sentence B` token (see BERT paper for more details).
 - `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [0, 1]. It's a mask to be used if some input sequence lengths are smaller than the max input sequence length of the current batch. It's the mask that we typically use for attention when a batch has varying length sentences.
@@ -759,7 +768,6 @@ An example on how to use this class is given in the [`run_bert_extract_features.
 
 An example on how to use this class is given in the [`run_lm_finetuning.py`](./examples/run_lm_finetuning.py) script which can be used to fine-tune the BERT language model on your specific different text corpus. This should improve model performance, if the language style is different from the original BERT training corpus (Wiki + BookCorpus).
 
-
 #### 3. `BertForMaskedLM`
 
 `BertForMaskedLM` includes the `BertModel` Transformer followed by the (possibly) pre-trained  masked language modeling head.
@@ -853,6 +861,7 @@ The inputs and output are **identical to the TensorFlow model inputs and outputs
 
 We detail them here. This model takes as *inputs*:
 [`modeling_openai.py`](./pytorch_transformers/modeling_openai.py)
+
 - `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length] were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, total_tokens_embeddings[
 - `position_ids`: an optional torch.LongTensor with the same shape as input_ids
     with the position indices (selected in the range [0, config.n_positions - 1[.
@@ -862,6 +871,7 @@ We detail them here. This model takes as *inputs*:
 - `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. It's a mask to be used to nullify some heads of the transformer. 0.0 => head is fully masked, 1.0 => head is not masked.
 
 This model *outputs*:
+
 - `hidden_states`: a list of all the encoded-hidden-states in the model (length of the list: number of layers + 1 for the output of the embeddings) as torch.FloatTensor of size [batch_size, sequence_length, hidden_size] (or more generally [d_1, ..., d_n, hidden_size] were d_1 ... d_n are the dimension of input_ids)
 
 #### 10. `OpenAIGPTLMHeadModel`
@@ -869,9 +879,11 @@ This model *outputs*:
 `OpenAIGPTLMHeadModel` includes the `OpenAIGPTModel` Transformer followed by a language modeling head with weights tied to the input embeddings (no additional parameters).
 
 *Inputs* are the same as the inputs of the [`OpenAIGPTModel`](#-9.-`OpenAIGPTModel`) class plus optional labels:
+
 - `lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss is only computed for the labels set in [0, ..., vocab_size].
 
 *Outputs*:
+
 - if `lm_labels` is not `None`:
   Outputs the language modeling loss.
 - else:
@@ -880,15 +892,18 @@ This model *outputs*:
 #### 11. `OpenAIGPTDoubleHeadsModel`
 
 `OpenAIGPTDoubleHeadsModel` includes the `OpenAIGPTModel` Transformer followed by two heads:
+
 - a language modeling head with weights tied to the input embeddings (no additional parameters) and:
 - a multiple choice classifier (linear layer that take as input a hidden state in a sequence to compute a score, see details in paper).
 
 *Inputs* are the same as the inputs of the [`OpenAIGPTModel`](#-9.-`OpenAIGPTModel`) class plus a classification mask and two optional labels:
+
 - `multiple_choice_token_ids`: a torch.LongTensor of shape [batch_size, num_choices] with the index of the token whose hidden state should be used as input for the multiple choice classifier (usually the [CLS] token for each choice).
 - `lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss is only computed for the labels set in [0, ..., vocab_size].
 - `multiple_choice_labels`: optional multiple choice labels: torch.LongTensor of shape [batch_size] with indices selected in [0, ..., num_choices].
 
 *Outputs*:
+
 - if `lm_labels` and `multiple_choice_labels` are not `None`:
   Outputs a tuple of losses with the language modeling loss and the multiple choice loss.
 - else Outputs a tuple with:
@@ -906,14 +921,17 @@ Transformer XL use a relative positioning with sinusiodal patterns and adaptive
 
 This model takes as *inputs*:
 [`modeling_transfo_xl.py`](./pytorch_transformers/modeling_transfo_xl.py)
+
 - `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] with the token indices selected in the range [0, self.config.n_token[
 - `mems`: an optional memory of hidden states from previous forward passes as a list (num layers) of hidden states at the entry of each layer. Each hidden states has shape [self.config.mem_len, bsz, self.config.d_model]. Note that the first two dimensions are transposed in `mems` with regards to `input_ids`.
 
 This model *outputs* a tuple of (last_hidden_state, new_mems)
+
 - `last_hidden_state`: the encoded-hidden-states at the top of the model as a torch.FloatTensor of size [batch_size, sequence_length, self.config.d_model]
 - `new_mems`: list (num layers) of updated mem states at the entry of each layer each mem state is a torch.FloatTensor of size [self.config.mem_len, batch_size, self.config.d_model]. Note that the first two dimensions are transposed in `mems` with regards to `input_ids`.
 
-##### Extracting a list of the hidden states at each layer of the Transformer-XL from `last_hidden_state` and `new_mems`:
+##### Extracting a list of the hidden states at each layer of the Transformer-XL from `last_hidden_state` and `new_mems`
+
 The `new_mems` contain all the hidden states PLUS the output of the embeddings (`new_mems[0]`). `new_mems[-1]` is the output of the hidden state of the layer below the last layer and `last_hidden_state` is the output of the last layer (i.E. the input of the softmax when we have a language modeling head on top).
 
 There are two differences between the shapes of `new_mems` and `last_hidden_state`: `new_mems` have transposed first dimensions and are longer (of size `self.config.mem_len`). Here is how to extract the full list of hidden states from the model output:
@@ -930,11 +948,13 @@ all_hidden_states = lower_hidden_states + [hidden_states]
 `TransfoXLLMHeadModel` includes the `TransfoXLModel` Transformer followed by an (adaptive) softmax head with weights tied to the input embeddings.
 
 *Inputs* are the same as the inputs of the [`TransfoXLModel`](#-12.-`TransfoXLModel`) class plus optional labels:
+
 - `labels`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the labels token indices selected in the range [0, self.config.n_token[
 
 *Outputs* a tuple of (last_hidden_state, new_mems)
+
 - `softmax_output`: output of the (adaptive) softmax:
-  - if labels is None: log probabilities of tokens, shape [batch_size, sequence_length, n_tokens] 
+  - if labels is None: log probabilities of tokens, shape [batch_size, sequence_length, n_tokens]
   - else: Negative log likelihood of labels tokens with shape [batch_size, sequence_length]
 - `new_mems`: list (num layers) of updated mem states at the entry of each layer each mem state is a torch.FloatTensor of size [self.config.mem_len, batch_size, self.config.d_model]. Note that the first two dimensions are transposed in `mems` with regards to `input_ids`.
 
@@ -953,6 +973,7 @@ The inputs and output are **identical to the TensorFlow model inputs and outputs
 
 We detail them here. This model takes as *inputs*:
 [`modeling_gpt2.py`](./pytorch_transformers/modeling_gpt2.py)
+
 - `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length] were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, vocab_size[
 - `position_ids`: an optional torch.LongTensor with the same shape as input_ids
     with the position indices (selected in the range [0, config.n_positions - 1[.
@@ -963,6 +984,7 @@ We detail them here. This model takes as *inputs*:
 - `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1. It's a mask to be used to nullify some heads of the transformer. 0.0 => head is fully masked, 1.0 => head is not masked.
 
 This model *outputs*:
+
 - `hidden_states`: a list of all the encoded-hidden-states in the model (length of the list: number of layers + 1 for the output of the embeddings) as torch.FloatTensor of size [batch_size, sequence_length, hidden_size] (or more generally [d_1, ..., d_n, hidden_size] were d_1 ... d_n are the dimension of input_ids)
 - `presents`: a list of pre-computed hidden-states (key and values in each attention blocks) as a torch.FloatTensors. They can be reused to speed up sequential decoding (see the `run_gpt2.py` example).
 
@@ -971,9 +993,11 @@ This model *outputs*:
 `GPT2LMHeadModel` includes the `GPT2Model` Transformer followed by a language modeling head with weights tied to the input embeddings (no additional parameters).
 
 *Inputs* are the same as the inputs of the [`GPT2Model`](#-14.-`GPT2Model`) class plus optional labels:
+
 - `lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss is only computed for the labels set in [0, ..., vocab_size].
 
 *Outputs*:
+
 - if `lm_labels` is not `None`:
   Outputs the language modeling loss.
 - else: a tuple of
@@ -983,15 +1007,18 @@ This model *outputs*:
 #### 16. `GPT2DoubleHeadsModel`
 
 `GPT2DoubleHeadsModel` includes the `GPT2Model` Transformer followed by two heads:
+
 - a language modeling head with weights tied to the input embeddings (no additional parameters) and:
 - a multiple choice classifier (linear layer that take as input a hidden state in a sequence to compute a score, see details in paper).
 
 *Inputs* are the same as the inputs of the [`GPT2Model`](#-14.-`GPT2Model`) class plus a classification mask and two optional labels:
+
 - `multiple_choice_token_ids`: a torch.LongTensor of shape [batch_size, num_choices] with the index of the token whose hidden state should be used as input for the multiple choice classifier (usually the [CLS] token for each choice).
 - `lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss is only computed for the labels set in [0, ..., vocab_size].
 - `multiple_choice_labels`: optional multiple choice labels: torch.LongTensor of shape [batch_size] with indices selected in [0, ..., num_choices].
 
 *Outputs*:
+
 - if `lm_labels` and `multiple_choice_labels` are not `None`:
   Outputs a tuple of losses with the language modeling loss and the multiple choice loss.
 - else Outputs a tuple with:
@@ -1108,30 +1135,32 @@ The differences with `BertAdam` is that `OpenAIAdam` compensate for bias as in t
 `OpenAIAdam` accepts the same arguments as `BertAdam`.
 
 #### Learning Rate Schedules
+
 The `.optimization` module also provides additional schedules in the form of schedule objects that inherit from `_LRSchedule`.
 All `_LRSchedule` subclasses accept `warmup` and `t_total` arguments at construction.
-When an `_LRSchedule` object is passed into `BertAdam` or `OpenAIAdam`, 
-the `warmup` and `t_total` arguments on the optimizer are ignored and the ones in the `_LRSchedule` object are used. 
+When an `_LRSchedule` object is passed into `BertAdam` or `OpenAIAdam`,
+the `warmup` and `t_total` arguments on the optimizer are ignored and the ones in the `_LRSchedule` object are used.
 An overview of the implemented schedules:
+
 - `ConstantLR`: always returns learning rate 1.
 - `WarmupConstantSchedule`: Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
     Keeps learning rate equal to 1. after warmup.
-    ![](docs/imgs/warmup_constant_schedule.png)
+    ![warmup constant schedule](docs/imgs/warmup_constant_schedule.png)
 - `WarmupLinearSchedule`: Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
     Linearly decreases learning rate from 1. to 0. over remaining `1 - warmup` steps.
-    ![](docs/imgs/warmup_linear_schedule.png)
--  `WarmupCosineSchedule`: Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
+    ![warmup linear schedule](docs/imgs/warmup_linear_schedule.png)
+- `WarmupCosineSchedule`: Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
     Decreases learning rate from 1. to 0. over remaining `1 - warmup` steps following a cosine curve.
     If `cycles` (default=0.5) is different from default, learning rate follows cosine function after warmup.
-    ![](docs/imgs/warmup_cosine_schedule.png)
+    ![warmup cosine schedule](docs/imgs/warmup_cosine_schedule.png)
 - `WarmupCosineWithHardRestartsSchedule`: Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
     If `cycles` (default=1.) is different from default, learning rate follows `cycles` times a cosine decaying learning rate (with hard restarts).
-    ![](docs/imgs/warmup_cosine_hard_restarts_schedule.png)
+    ![warmup cosine hard restarts schedule](docs/imgs/warmup_cosine_hard_restarts_schedule.png)
 - `WarmupCosineWithWarmupRestartsSchedule`: All training progress is divided in `cycles` (default=1.) parts of equal length.
     Every part follows a schedule with the first `warmup` fraction of the training steps linearly increasing from 0. to 1.,
     followed by a learning rate decreasing from 1. to 0. following a cosine curve.
     Note that the total number of all warmup steps over all cycles together is equal to `warmup` * `cycles`
-    ![](docs/imgs/warmup_cosine_warm_restarts_schedule.png)
+    ![warmup cosine warm restarts schedule](docs/imgs/warmup_cosine_warm_restarts_schedule.png)
 
 ## Examples
 
@@ -1158,9 +1187,11 @@ Here is how to use these techniques in our scripts:
 To use 16-bits training and distributed training, you need to install NVIDIA's apex extension [as detailed here](https://github.com/nvidia/apex). You will find more information regarding the internals of `apex` and how to use `apex` in [the doc and the associated repository](https://github.com/nvidia/apex). The results of the tests performed on pytorch-BERT by the NVIDIA team (and my trials at reproducing them) can be consulted in [the relevant PR of the present repository](https://github.com/huggingface/pytorch-transformers/pull/116).
 
 Note: To use *Distributed Training*, you will need to run one training script on each of your machines. This can be done for example by running the following command on each server (see [the above mentioned blog post]((https://medium.com/huggingface/training-larger-batches-practical-tips-on-1-gpu-multi-gpu-distributed-setups-ec88c3e51255)) for more details):
+
 ```bash
 python -m torch.distributed.launch --nproc_per_node=4 --nnodes=2 --node_rank=$THIS_MACHINE_INDEX --master_addr="192.168.1.1" --master_port=1234 run_bert_classifier.py (--arg1 --arg2 --arg3 and all other arguments of the run_classifier script)
 ```
+
 Where `$THIS_MACHINE_INDEX` is an sequential index assigned to each of your machine (0, 1, 2...) and the machine with rank 0 has an IP address `192.168.1.1` and an open port `1234`.
 
 ### Fine-tuning with BERT: running the examples
@@ -1174,7 +1205,7 @@ We showcase several fine-tuning examples based on (and extended from) [the origi
 
 #### GLUE results on dev set
 
-We get the following results on the dev set of GLUE benchmark with an uncased BERT base 
+We get the following results on the dev set of GLUE benchmark with an uncased BERT base
 model. All experiments were run on a P100 GPU with a batch size of 32.
 
 | Task | Metric | Result |
@@ -1253,6 +1284,7 @@ Our test ran on a few seeds with [the original implementation hyper-parameters](
 **Fast run with apex and 16 bit precision: fine-tuning on MRPC in 27 seconds!**
 First install apex as indicated [here](https://github.com/NVIDIA/apex).
 Then run
+
 ```shell
 export GLUE_DIR=/path/to/glue
 
@@ -1279,6 +1311,7 @@ python -m torch.distributed.launch --nproc_per_node 8 run_bert_classifier.py   -
 ```
 
 Training with these hyper-parameters gave us the following results:
+
 ```bash
   acc = 0.8823529411764706
   acc_and_f1 = 0.901702786377709
@@ -1310,16 +1343,15 @@ python -m torch.distributed.launch --nproc_per_node 8 run_bert_classifier.py   -
 
 This is the example of the `bert-large-uncased-whole-word-masking-finetuned-mnli` model
 
-
 #### SQuAD
 
 This example code fine-tunes BERT on the SQuAD dataset. It runs in 24 min (with BERT-base) or 68 min (with BERT-large) on a single tesla V100 16GB.
 
 The data for SQuAD can be downloaded with the following links and should be saved in a `$SQUAD_DIR` directory.
 
-*   [train-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json)
-*   [dev-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json)
-*   [evaluate-v1.1.py](https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py)
+- [train-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json)
+- [dev-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json)
+- [evaluate-v1.1.py](https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py)
 
 ```shell
 export SQUAD_DIR=/path/to/SQUAD
@@ -1340,12 +1372,13 @@ python run_bert_squad.py \
 ```
 
 Training with the previous hyper-parameters gave us the following results:
+
 ```bash
 python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json /tmp/debug_squad/predictions.json
 {"f1": 88.52381567990474, "exact_match": 81.22043519394512}
 ```
 
-**distributed training**
+##### distributed training
 
 Here is an example using distributed training on 8 V100 GPUs and Bert Whole Word Masking uncased model to reach a F1 > 93 on SQuAD:
 
@@ -1368,6 +1401,7 @@ python -m torch.distributed.launch --nproc_per_node=8 \
 ```
 
 Training with these hyper-parameters gave us the following results:
+
 ```bash
 python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json ../models/wwm_uncased_finetuned_squad/predictions.json
 {"exact_match": 86.91579943235573, "f1": 93.1532499015869}
@@ -1382,6 +1416,7 @@ python -m torch.distributed.launch --nproc_per_node=8  run_bert_squad.py  --bert
 ```
 
 Training with these hyper-parameters gave us the following results:
+
 ```bash
 python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json ../models/wwm_uncased_finetuned_squad/predictions.json
 {"exact_match": 84.18164616840113, "f1": 91.58645594850135}
@@ -1409,7 +1444,8 @@ python run_bert_swag.py \
 ```
 
 Training with the previous hyper-parameters on a single GPU gave us the following results:
-```
+
+```bash
 eval_accuracy = 0.8062081375587323
 eval_loss = 0.5966546792367169
 global_step = 13788
@@ -1422,7 +1458,6 @@ The data should be a text file in the same format as [sample_text.txt](./samples
 You can download an [exemplary training corpus](https://ext-bert-sample.obs.eu-de.otc.t-systems.com/small_wiki_sentence_corpus.txt) generated from wikipedia articles and splitted into ~500k sentences with spaCy.
 Training one epoch on this corpus takes about 1:20h on 4 x NVIDIA Tesla P100 with `train_batch_size=200` and `max_seq_length=128`:
 
-
 Thank to the work of @Rocketknight1 and @tholor there are now **several scripts** that can be used to fine-tune BERT using the pretraining objective (combination of masked-language modeling and next sentence prediction loss). These scripts are detailed in the [`README`](./examples/lm_finetuning/README.md) of the [`examples/lm_finetuning/`](./examples/lm_finetuning/) folder.
 
 ### OpenAI GPT, Transformer-XL and GPT-2: running the examples
@@ -1471,11 +1506,13 @@ This command runs in about 1 min on a V100 and gives an evaluation perplexity of
 This example code is identical to the original unconditional and conditional generation codes.
 
 Conditional generation:
+
 ```shell
 python run_gpt2.py
 ```
 
 Unconditional generation:
+
 ```shell
 python run_gpt2.py --unconditional
 ```
@@ -1487,15 +1524,19 @@ The same option as in the original scripts are provided, please refere to the co
 The options we list above allow to fine-tune BERT-large rather easily on GPU(s) instead of the TPU used by the original implementation.
 
 For example, fine-tuning BERT-large on SQuAD can be done on a server with 4 k-80 (these are pretty old now) in 18 hours. Our results are similar to the TensorFlow implementation results (actually slightly higher):
+
 ```bash
 {"exact_match": 84.56953642384106, "f1": 91.04028647786927}
 ```
+
 To get these results we used a combination of:
+
 - multi-GPU training (automatically activated on a multi-GPU server),
 - 2 steps of gradient accumulation and
 - perform the optimization step on CPU to store Adam's averages in RAM.
 
 Here is the full list of hyper-parameters for this run:
+
 ```bash
 export SQUAD_DIR=/path/to/SQUAD
 
@@ -1518,6 +1559,7 @@ python ./run_bert_squad.py \
 If you have a recent GPU (starting from NVIDIA Volta series), you should try **16-bit fine-tuning** (FP16).
 
 Here is an example of hyper-parameters for a FP16 run we tried:
+
 ```bash
 export SQUAD_DIR=/path/to/SQUAD
 
@@ -1539,6 +1581,7 @@ python ./run_bert_squad.py \
 ```
 
 The results were similar to the above FP32 results (actually slightly higher):
+
 ```bash
 {"exact_match": 84.65468306527909, "f1": 91.238669287002}
 ```
@@ -1565,7 +1608,7 @@ python -m torch.distributed.launch --nproc_per_node=8 \
 
 ## Fine-tuning XLNet
 
-#### STS-B
+### STS-B
 
 This example code fine-tunes XLNet on the STS-B corpus.
 
@@ -1592,7 +1635,8 @@ python run_xlnet_classifier.py \
 
 Our test ran on a few seeds with [the original implementation hyper-parameters](https://github.com/zihangdai/xlnet#1-sts-b-sentence-pair-relevance-regression-with-gpus) gave evaluation results between 84% and 88%.
 
-**Distributed training**
+### Distributed training
+
 Here is an example using distributed training on 8 V100 GPUs to reach XXXX:
 
 ```bash
@@ -1611,6 +1655,7 @@ python -m torch.distributed.launch --nproc_per_node 8 \
 ```
 
 Training with these hyper-parameters gave us the following results:
+
 ```bash
   acc = 0.8823529411764706
   acc_and_f1 = 0.901702786377709
@@ -1646,15 +1691,15 @@ This is the example of the `bert-large-uncased-whole-word-masking-finetuned-mnli
 
 There is a growing field of study concerned with investigating the inner working of large-scale transformers like BERT (that some call "BERTology"). Some good examples of this field are:
 
-- BERT Rediscovers the Classical NLP Pipeline by Ian Tenney, Dipanjan Das, Ellie Pavlick: https://arxiv.org/abs/1905.05950
-- Are Sixteen Heads Really Better than One? by Paul Michel, Omer Levy, Graham Neubig: https://arxiv.org/abs/1905.10650
-- What Does BERT Look At? An Analysis of BERT's Attention by Kevin Clark, Urvashi Khandelwal, Omer Levy, Christopher D. Manning: https://arxiv.org/abs/1906.04341
+- [BERT Rediscovers the Classical NLP Pipeline](https://arxiv.org/abs/1905.05950) by Ian Tenney, Dipanjan Das, Ellie Pavlick
+- [Are Sixteen Heads Really Better than One?](https://arxiv.org/abs/1905.10650) by Paul Michel, Omer Levy, Graham Neubig
+- [What Does BERT Look At? An Analysis of BERT's Attention](https://arxiv.org/abs/1906.04341) by Kevin Clark, Urvashi Khandelwal, Omer Levy, Christopher D. Manning
 
-In order to help this new field develop, we have included a few additional features in the BERT/GPT/GPT-2 models to help people access the inner representations, mainly adapted  from the great work of Paul Michel (https://arxiv.org/abs/1905.10650):
+In order to help this new field develop, we have included a few additional features in the BERT/GPT/GPT-2 models to help people access the inner representations, mainly adapted  from the great work of [Michel et al.](https://arxiv.org/abs/1905.10650):
 
 - accessing all the hidden-states of BERT/GPT/GPT-2,
 - accessing all the attention weights for each head of BERT/GPT/GPT-2,
-- retrieving heads output values and gradients to be able to compute head importance score and prune head as explained in https://arxiv.org/abs/1905.10650.
+- retrieving heads output values and gradients to be able to compute head importance score and prune head as explained in [Michel et al.](https://arxiv.org/abs/1905.10650).
 
 To help you understand and use these features, we have added a specific example script: [`bertology.py`](./examples/bertology.py) while extract information and prune a model pre-trained on MRPC.
 
@@ -1674,7 +1719,7 @@ Please follow the instructions given in the notebooks to run and modify them.
 
 A command-line interface is provided to convert a TensorFlow checkpoint in a PyTorch dump of the `BertForPreTraining` class  (for BERT) or NumPy checkpoint in a PyTorch dump of the `OpenAIGPTModel` class  (for OpenAI GPT).
 
-### BERT
+### BERT CLI
 
 You can convert any TensorFlow checkpoint for BERT (in particular [the pre-trained models released by Google](https://github.com/google-research/bert#pre-trained-models)) in a PyTorch save file by using the [`convert_tf_checkpoint_to_pytorch.py`](./pytorch_transformers/convert_tf_checkpoint_to_pytorch.py ) script.
 
@@ -1697,7 +1742,7 @@ pytorch_transformers bert \
 
 You can download Google's pre-trained models for the conversion [here](https://github.com/google-research/bert#pre-trained-models).
 
-### OpenAI GPT
+### OpenAI GPT CLI
 
 Here is an example of the conversion process for a pre-trained OpenAI GPT model, assuming that your NumPy checkpoint save as the same format than OpenAI pretrained model (see [here](https://github.com/openai/finetune-transformer-lm))
 
@@ -1710,7 +1755,7 @@ pytorch_transformers gpt \
   [OPENAI_GPT_CONFIG]
 ```
 
-### Transformer-XL
+### Transformer-XL CLI
 
 Here is an example of the conversion process for a pre-trained Transformer-XL model (see [here](https://github.com/kimiyoung/transformer-xl/tree/master/tf#obtain-and-evaluate-pretrained-sota-models))
 
@@ -1751,7 +1796,6 @@ pytorch_transformers xlnet \
   STS-B \
 ```
 
-
 ## TPU
 
 TPU support and pretraining scripts
diff --git a/setup.py b/setup.py
index 09b8c01ad5..fcb70fbc0b 100644
--- a/setup.py
+++ b/setup.py
@@ -41,10 +41,10 @@ setup(
     version="0.7.0",
     author="Thomas Wolf, Lysandre Debut, Victor Sanh, Tim Rault, Google AI Language Team Authors, Open AI team Authors",
     author_email="thomas@huggingface.co",
-    description="PyTorch version of Google AI BERT model with script to load Google pre-trained models",
+    description="Repository of pre-trained NLP Transformer models: BERT, GPT & GPT-2, Transformer-XL, XLNet and XLM",
     long_description=open("README.md", "r", encoding='utf-8').read(),
     long_description_content_type="text/markdown",
-    keywords='BERT NLP deep learning google',
+    keywords='NLP deep learning transformer pytorch BERT GPT GPT-2 google openai CMU',
     license='Apache',
     url="https://github.com/huggingface/pytorch-transformers",
     packages=find_packages(exclude=["*.tests", "*.tests.*",

From a4f980547f298f0d3a2cdfe6da919294c967cd06 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 5 Jul 2019 12:31:34 +0200
Subject: [PATCH 04/17] remove circle ci parallelism

---
 .circleci/config.yml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index ac23723f98..996eab6815 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -12,7 +12,6 @@ jobs:
             - run: sudo python -m spacy download en
             - run: python -m pytest -sv ./pytorch_transformers/tests/ --cov
             - run: codecov
-        parallelism: 4
     build_py2:
         working_directory: ~/pytorch-transformers
         docker:
@@ -25,7 +24,6 @@ jobs:
             - run: sudo python -m spacy download en
             - run: python -m pytest -sv ./pytorch_transformers/tests/ --cov
             - run: codecov
-        parallelism: 4
 workflows:
   version: 2
   build_and_test:

From 36bca545ff1c13eb7af710d38af4270ef6a965ed Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 5 Jul 2019 15:02:59 +0200
Subject: [PATCH 05/17] tokenization abstract class - tests for examples

---
 examples/run_squad.py                         | 400 ++++++++++++++++++
 examples/test_examples.py                     |  48 +++
 pytorch_transformers/__init__.py              |   6 +-
 pytorch_transformers/modeling_bert.py         |   2 +-
 pytorch_transformers/modeling_gpt2.py         |   2 +-
 pytorch_transformers/modeling_openai.py       |   2 +-
 pytorch_transformers/modeling_transfo_xl.py   |   2 +-
 .../{model_utils.py => modeling_utils.py}     |   6 -
 pytorch_transformers/modeling_xlm.py          |   2 +-
 pytorch_transformers/modeling_xlnet.py        |   2 +-
 .../tests/model_utils_test.py                 |  50 ---
 .../tests/modeling_bert_test.py               |   2 +-
 .../tests/modeling_gpt2_test.py               |   2 +-
 .../tests/modeling_openai_test.py             |   2 +-
 ...s_commons.py => modeling_tests_commons.py} |   0
 .../tests/modeling_transfo_xl_test.py         |   2 +-
 .../tests/modeling_utils_test.py              |   9 +-
 .../tests/modeling_xlm_test.py                |   2 +-
 .../tests/modeling_xlnet_test.py              |   2 +-
 .../tests/tokenization_bert_test.py           |  10 +-
 .../tests/tokenization_gpt2_test.py           |  11 +-
 .../tests/tokenization_openai_test.py         |  10 +-
 .../tests/tokenization_transfo_xl_test.py     |   9 +-
 .../tests/tokenization_utils_test.py          |  36 ++
 .../tests/tokenization_xlm_test.py            |  12 +-
 .../tests/tokenization_xlnet_test.py          |  12 +-
 pytorch_transformers/tokenization_bert.py     |  66 +--
 pytorch_transformers/tokenization_gpt2.py     | 117 ++---
 pytorch_transformers/tokenization_openai.py   | 110 ++---
 .../tokenization_transfo_xl.py                |  78 ++--
 pytorch_transformers/tokenization_utils.py    | 114 +++++
 pytorch_transformers/tokenization_xlm.py      | 122 ++----
 pytorch_transformers/tokenization_xlnet.py    | 131 ++----
 33 files changed, 815 insertions(+), 566 deletions(-)
 create mode 100644 examples/run_squad.py
 create mode 100644 examples/test_examples.py
 rename pytorch_transformers/{model_utils.py => modeling_utils.py} (98%)
 delete mode 100644 pytorch_transformers/tests/model_utils_test.py
 rename pytorch_transformers/tests/{model_tests_commons.py => modeling_tests_commons.py} (100%)
 rename examples/tests/examples_tests.py => pytorch_transformers/tests/modeling_utils_test.py (92%)
 create mode 100644 pytorch_transformers/tests/tokenization_utils_test.py
 create mode 100644 pytorch_transformers/tokenization_utils.py

diff --git a/examples/run_squad.py b/examples/run_squad.py
new file mode 100644
index 0000000000..d6d7279cb8
--- /dev/null
+++ b/examples/run_squad.py
@@ -0,0 +1,400 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Run BERT on SQuAD."""
+
+from __future__ import absolute_import, division, print_function
+
+import argparse
+import logging
+import os
+import random
+import sys
+from io import open
+
+import numpy as np
+import torch
+from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
+                              TensorDataset)
+from torch.utils.data.distributed import DistributedSampler
+from tqdm import tqdm, trange
+
+from tensorboardX import SummaryWriter
+
+from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
+from pytorch_transformers.modeling_bert import BertForQuestionAnswering
+from pytorch_transformers.optimization import BertAdam, WarmupLinearSchedule
+from pytorch_transformers.tokenization_bert import BertTokenizer
+
+from utils_squad import read_squad_examples, convert_examples_to_features, RawResult, write_predictions
+
+if sys.version_info[0] == 2:
+    import cPickle as pickle
+else:
+    import pickle
+
+logger = logging.getLogger(__name__)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+
+    ## Required parameters
+    parser.add_argument("--bert_model", default=None, type=str, required=True,
+                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
+                        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
+                        "bert-base-multilingual-cased, bert-base-chinese.")
+    parser.add_argument("--output_dir", default=None, type=str, required=True,
+                        help="The output directory where the model checkpoints and predictions will be written.")
+
+    ## Other parameters
+    parser.add_argument("--train_file", default=None, type=str, help="SQuAD json for training. E.g., train-v1.1.json")
+    parser.add_argument("--predict_file", default=None, type=str,
+                        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
+    parser.add_argument("--max_seq_length", default=384, type=int,
+                        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
+                             "longer than this will be truncated, and sequences shorter than this will be padded.")
+    parser.add_argument("--doc_stride", default=128, type=int,
+                        help="When splitting up a long document into chunks, how much stride to take between chunks.")
+    parser.add_argument("--max_query_length", default=64, type=int,
+                        help="The maximum number of tokens for the question. Questions longer than this will "
+                             "be truncated to this length.")
+    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
+    parser.add_argument("--do_predict", action='store_true', help="Whether to run eval on the dev set.")
+    parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.")
+    parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.")
+    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
+    parser.add_argument("--num_train_epochs", default=3.0, type=float,
+                        help="Total number of training epochs to perform.")
+    parser.add_argument("--warmup_proportion", default=0.1, type=float,
+                        help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% "
+                             "of training.")
+    parser.add_argument("--n_best_size", default=20, type=int,
+                        help="The total number of n-best predictions to generate in the nbest_predictions.json "
+                             "output file.")
+    parser.add_argument("--max_answer_length", default=30, type=int,
+                        help="The maximum length of an answer that can be generated. This is needed because the start "
+                             "and end predictions are not conditioned on one another.")
+    parser.add_argument("--verbose_logging", action='store_true',
+                        help="If true, all of the warnings related to data processing will be printed. "
+                             "A number of warnings are expected for a normal SQuAD evaluation.")
+    parser.add_argument("--no_cuda",
+                        action='store_true',
+                        help="Whether not to use CUDA when available")
+    parser.add_argument('--seed',
+                        type=int,
+                        default=42,
+                        help="random seed for initialization")
+    parser.add_argument('--gradient_accumulation_steps',
+                        type=int,
+                        default=1,
+                        help="Number of updates steps to accumulate before performing a backward/update pass.")
+    parser.add_argument("--do_lower_case",
+                        action='store_true',
+                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
+    parser.add_argument("--local_rank",
+                        type=int,
+                        default=-1,
+                        help="local_rank for distributed training on gpus")
+    parser.add_argument('--fp16',
+                        action='store_true',
+                        help="Whether to use 16-bit float precision instead of 32-bit")
+    parser.add_argument('--overwrite_output_dir',
+                        action='store_true',
+                        help="Overwrite the content of the output directory")
+    parser.add_argument('--loss_scale',
+                        type=float, default=0,
+                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
+                             "0 (default value): dynamic loss scaling.\n"
+                             "Positive power of 2: static loss scaling value.\n")
+    parser.add_argument('--version_2_with_negative',
+                        action='store_true',
+                        help='If true, the SQuAD examples contain some that do not have an answer.')
+    parser.add_argument('--null_score_diff_threshold',
+                        type=float, default=0.0,
+                        help="If null_score - best_non_null is greater than the threshold predict null.")
+    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
+    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
+    args = parser.parse_args()
+    print(args)
+
+    if args.server_ip and args.server_port:
+        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
+        import ptvsd
+        print("Waiting for debugger attach")
+        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
+        ptvsd.wait_for_attach()
+
+    if args.local_rank == -1 or args.no_cuda:
+        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+        n_gpu = torch.cuda.device_count()
+    else:
+        torch.cuda.set_device(args.local_rank)
+        device = torch.device("cuda", args.local_rank)
+        n_gpu = 1
+        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
+        torch.distributed.init_process_group(backend='nccl')
+
+    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
+                        datefmt = '%m/%d/%Y %H:%M:%S',
+                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
+
+    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
+        device, n_gpu, bool(args.local_rank != -1), args.fp16))
+
+    if args.gradient_accumulation_steps < 1:
+        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
+                            args.gradient_accumulation_steps))
+
+    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
+
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+    if n_gpu > 0:
+        torch.cuda.manual_seed_all(args.seed)
+
+    if not args.do_train and not args.do_predict:
+        raise ValueError("At least one of `do_train` or `do_predict` must be True.")
+
+    if args.do_train:
+        if not args.train_file:
+            raise ValueError(
+                "If `do_train` is True, then `train_file` must be specified.")
+    if args.do_predict:
+        if not args.predict_file:
+            raise ValueError(
+                "If `do_predict` is True, then `predict_file` must be specified.")
+
+    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
+        raise ValueError("Output directory {} already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
+    if not os.path.exists(args.output_dir):
+        os.makedirs(args.output_dir)
+
+    if args.local_rank not in [-1, 0]:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
+
+    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
+    model = BertForQuestionAnswering.from_pretrained(args.bert_model)
+    if args.local_rank == 0:
+        torch.distributed.barrier()
+
+    if args.fp16:
+        model.half()
+    model.to(device)
+    if args.local_rank != -1:
+        model = torch.nn.parallel.DistributedDataParallel(model,
+                                                          device_ids=[args.local_rank],
+                                                          output_device=args.local_rank,
+                                                          find_unused_parameters=True)
+    elif n_gpu > 1:
+        model = torch.nn.DataParallel(model)
+
+    if args.do_train:
+        if args.local_rank in [-1, 0]:
+            tb_writer = SummaryWriter()
+        # Prepare data loader
+        train_examples = read_squad_examples(
+            input_file=args.train_file, is_training=True, version_2_with_negative=args.version_2_with_negative)
+        cached_train_features_file = args.train_file+'_{0}_{1}_{2}_{3}'.format(
+            list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length))
+        try:
+            with open(cached_train_features_file, "rb") as reader:
+                train_features = pickle.load(reader)
+        except:
+            train_features = convert_examples_to_features(
+                examples=train_examples,
+                tokenizer=tokenizer,
+                max_seq_length=args.max_seq_length,
+                doc_stride=args.doc_stride,
+                max_query_length=args.max_query_length,
+                is_training=True)
+            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
+                logger.info("  Saving train features into cached file %s", cached_train_features_file)
+                with open(cached_train_features_file, "wb") as writer:
+                    pickle.dump(train_features, writer)
+
+        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
+        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
+        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
+        all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
+        all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)
+        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
+                                   all_start_positions, all_end_positions)
+        if args.local_rank == -1:
+            train_sampler = RandomSampler(train_data)
+        else:
+            train_sampler = DistributedSampler(train_data)
+
+        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
+        num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
+        # if args.local_rank != -1:
+        #     num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
+
+        # Prepare optimizer
+        param_optimizer = list(model.named_parameters())
+
+        # hack to remove pooler, which is not used
+        # thus it produce None grad that break apex
+        param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
+
+        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
+        optimizer_grouped_parameters = [
+            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
+            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+            ]
+
+        if args.fp16:
+            try:
+                from apex.optimizers import FP16_Optimizer
+                from apex.optimizers import FusedAdam
+            except ImportError:
+                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
+
+            optimizer = FusedAdam(optimizer_grouped_parameters,
+                                  lr=args.learning_rate,
+                                  bias_correction=False,
+                                  max_grad_norm=1.0)
+            if args.loss_scale == 0:
+                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
+            else:
+                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
+            warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
+                                                 t_total=num_train_optimization_steps)
+        else:
+            optimizer = BertAdam(optimizer_grouped_parameters,
+                                 lr=args.learning_rate,
+                                 warmup=args.warmup_proportion,
+                                 t_total=num_train_optimization_steps)
+
+        global_step = 0
+
+        logger.info("***** Running training *****")
+        logger.info("  Num orig examples = %d", len(train_examples))
+        logger.info("  Num split examples = %d", len(train_features))
+        logger.info("  Batch size = %d", args.train_batch_size)
+        logger.info("  Num steps = %d", num_train_optimization_steps)
+
+        model.train()
+        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
+            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
+                if n_gpu == 1:
+                    batch = tuple(t.to(device) for t in batch) # multi-gpu does scattering it-self
+                input_ids, input_mask, segment_ids, start_positions, end_positions = batch
+                loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions)
+                if n_gpu > 1:
+                    loss = loss.mean() # mean() to average on multi-gpu.
+                if args.gradient_accumulation_steps > 1:
+                    loss = loss / args.gradient_accumulation_steps
+
+                if args.fp16:
+                    optimizer.backward(loss)
+                else:
+                    loss.backward()
+                if (step + 1) % args.gradient_accumulation_steps == 0:
+                    if args.fp16:
+                        # modify learning rate with special warm up BERT uses
+                        # if args.fp16 is False, BertAdam is used and handles this automatically
+                        lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)
+                        for param_group in optimizer.param_groups:
+                            param_group['lr'] = lr_this_step
+                    optimizer.step()
+                    optimizer.zero_grad()
+                    global_step += 1
+                    if args.local_rank in [-1, 0]:
+                        if not args.fp16:
+                            tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)
+                        tb_writer.add_scalar('loss', loss.item(), global_step)
+
+    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
+        # Save a trained model, configuration and tokenizer
+        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
+
+        # If we save using the predefined names, we can load using `from_pretrained`
+        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
+        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
+
+        torch.save(model_to_save.state_dict(), output_model_file)
+        model_to_save.config.to_json_file(output_config_file)
+        tokenizer.save_vocabulary(args.output_dir)
+
+        # Load a trained model and vocabulary that you have fine-tuned
+        model = BertForQuestionAnswering.from_pretrained(args.output_dir)
+        tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
+
+        # Good practice: save your training arguments together with the trained model
+        output_args_file = os.path.join(args.output_dir, 'training_args.bin')
+        torch.save(args, output_args_file)
+    else:
+        model = BertForQuestionAnswering.from_pretrained(args.bert_model)
+
+    model.to(device)
+
+    if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
+        eval_examples = read_squad_examples(
+            input_file=args.predict_file, is_training=False, version_2_with_negative=args.version_2_with_negative)
+        eval_features = convert_examples_to_features(
+            examples=eval_examples,
+            tokenizer=tokenizer,
+            max_seq_length=args.max_seq_length,
+            doc_stride=args.doc_stride,
+            max_query_length=args.max_query_length,
+            is_training=False)
+
+        logger.info("***** Running predictions *****")
+        logger.info("  Num orig examples = %d", len(eval_examples))
+        logger.info("  Num split examples = %d", len(eval_features))
+        logger.info("  Batch size = %d", args.predict_batch_size)
+
+        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
+        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
+        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
+        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
+        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
+        # Run prediction for full data
+        eval_sampler = SequentialSampler(eval_data)
+        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size)
+
+        model.eval()
+        all_results = []
+        logger.info("Start evaluating")
+        for input_ids, input_mask, segment_ids, example_indices in tqdm(eval_dataloader, desc="Evaluating", disable=args.local_rank not in [-1, 0]):
+            if len(all_results) % 1000 == 0:
+                logger.info("Processing example: %d" % (len(all_results)))
+            input_ids = input_ids.to(device)
+            input_mask = input_mask.to(device)
+            segment_ids = segment_ids.to(device)
+            with torch.no_grad():
+                batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask)
+            for i, example_index in enumerate(example_indices):
+                start_logits = batch_start_logits[i].detach().cpu().tolist()
+                end_logits = batch_end_logits[i].detach().cpu().tolist()
+                eval_feature = eval_features[example_index.item()]
+                unique_id = int(eval_feature.unique_id)
+                all_results.append(RawResult(unique_id=unique_id,
+                                             start_logits=start_logits,
+                                             end_logits=end_logits))
+        output_prediction_file = os.path.join(args.output_dir, "predictions.json")
+        output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json")
+        output_null_log_odds_file = os.path.join(args.output_dir, "null_odds.json")
+        write_predictions(eval_examples, eval_features, all_results,
+                          args.n_best_size, args.max_answer_length,
+                          args.do_lower_case, output_prediction_file,
+                          output_nbest_file, output_null_log_odds_file, args.verbose_logging,
+                          args.version_2_with_negative, args.null_score_diff_threshold)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/test_examples.py b/examples/test_examples.py
new file mode 100644
index 0000000000..fada43dae2
--- /dev/null
+++ b/examples/test_examples.py
@@ -0,0 +1,48 @@
+# coding=utf-8
+# Copyright 2018 HuggingFace Inc..
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import sys
+import unittest
+import argparse
+
+try:
+    # python 3.4+ can use builtin unittest.mock instead of mock package
+    from unittest.mock import patch
+except ImportError:
+    from mock import patch
+
+import run_bert_squad as rbs
+
+def get_setup_file():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-f')
+    args = parser.parse_args()
+    return args.f
+
+class ExamplesTests(unittest.TestCase):
+
+    def test_run_squad(self):
+        testargs = ["prog", "-f", "/home/test/setup.py"]
+        with patch.object(sys, 'argv', testargs):
+            setup = get_setup_file()
+            assert setup == "/home/test/setup.py"
+            # rbs.main()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/pytorch_transformers/__init__.py b/pytorch_transformers/__init__.py
index cbd007f872..6dd78dfd02 100644
--- a/pytorch_transformers/__init__.py
+++ b/pytorch_transformers/__init__.py
@@ -5,6 +5,7 @@ from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus)
 from .tokenization_gpt2 import GPT2Tokenizer
 from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE
 from .tokenization_xlm import XLMTokenizer
+from .tokenization_utils import (PreTrainedTokenizer, clean_up_tokenization)
 
 from .modeling_bert import (BertConfig, BertModel, BertForPreTraining,
                        BertForMaskedLM, BertForNextSentencePrediction,
@@ -26,11 +27,10 @@ from .modeling_xlnet import (XLNetConfig,
 from .modeling_xlm import (XLMConfig, XLMModel,
                            XLMWithLMHeadModel, XLMForSequenceClassification,
                            XLMForQuestionAnswering)
+from .modeling_utils import (WEIGHTS_NAME, CONFIG_NAME, TF_WEIGHTS_NAME,
+                          PretrainedConfig, PreTrainedModel, prune_layer, Conv1D)
 
 from .optimization import BertAdam
 from .optimization_openai import OpenAIAdam
 
 from .file_utils import (PYTORCH_PRETRAINED_BERT_CACHE, cached_path)
-
-from .model_utils import (WEIGHTS_NAME, CONFIG_NAME, TF_WEIGHTS_NAME,
-                          PretrainedConfig, PreTrainedModel, prune_layer, Conv1D)
diff --git a/pytorch_transformers/modeling_bert.py b/pytorch_transformers/modeling_bert.py
index d4967b3718..b2a456209d 100644
--- a/pytorch_transformers/modeling_bert.py
+++ b/pytorch_transformers/modeling_bert.py
@@ -29,7 +29,7 @@ from torch import nn
 from torch.nn import CrossEntropyLoss, MSELoss
 
 from .file_utils import cached_path
-from .model_utils import WEIGHTS_NAME, CONFIG_NAME, PretrainedConfig, PreTrainedModel, prune_linear_layer
+from .modeling_utils import WEIGHTS_NAME, CONFIG_NAME, PretrainedConfig, PreTrainedModel, prune_linear_layer
 
 logger = logging.getLogger(__name__)
 
diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py
index c16ad2f763..090763cda1 100644
--- a/pytorch_transformers/modeling_gpt2.py
+++ b/pytorch_transformers/modeling_gpt2.py
@@ -31,7 +31,7 @@ from torch.nn import CrossEntropyLoss
 from torch.nn.parameter import Parameter
 
 from .file_utils import cached_path
-from .model_utils import (Conv1D, CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig,
+from .modeling_utils import (Conv1D, CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig,
                           PreTrainedModel, prune_conv1d_layer, SequenceSummary)
 from .modeling_bert import BertLayerNorm as LayerNorm
 
diff --git a/pytorch_transformers/modeling_openai.py b/pytorch_transformers/modeling_openai.py
index 1a3e7fbbb4..b715b18371 100644
--- a/pytorch_transformers/modeling_openai.py
+++ b/pytorch_transformers/modeling_openai.py
@@ -31,7 +31,7 @@ from torch.nn import CrossEntropyLoss
 from torch.nn.parameter import Parameter
 
 from .file_utils import cached_path
-from .model_utils import (Conv1D, CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig,
+from .modeling_utils import (Conv1D, CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig,
                           PreTrainedModel, prune_conv1d_layer, SequenceSummary)
 from .modeling_bert import BertLayerNorm as LayerNorm
 
diff --git a/pytorch_transformers/modeling_transfo_xl.py b/pytorch_transformers/modeling_transfo_xl.py
index 9a882bce96..465577b002 100644
--- a/pytorch_transformers/modeling_transfo_xl.py
+++ b/pytorch_transformers/modeling_transfo_xl.py
@@ -37,7 +37,7 @@ from torch.nn.parameter import Parameter
 from .modeling_bert import BertLayerNorm as LayerNorm
 from .modeling_transfo_xl_utilities import ProjectedAdaptiveLogSoftmax, sample_logits
 from .file_utils import cached_path
-from .model_utils import CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTrainedModel
+from .modeling_utils import CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTrainedModel
 
 logger = logging.getLogger(__name__)
 
diff --git a/pytorch_transformers/model_utils.py b/pytorch_transformers/modeling_utils.py
similarity index 98%
rename from pytorch_transformers/model_utils.py
rename to pytorch_transformers/modeling_utils.py
index 051fbdefbc..b72707ce08 100644
--- a/pytorch_transformers/model_utils.py
+++ b/pytorch_transformers/modeling_utils.py
@@ -598,9 +598,3 @@ def prune_layer(layer, index, dim=None):
         return prune_conv1d_layer(layer, index, dim=1 if dim is None else dim)
     else:
         raise ValueError("Can't prune layer of class {}".format(layer.__class__))
-
-def clean_up_tokenization(out_string):
-    out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ','
-                    ).replace(" ' ", "'").replace(" n't", "n't").replace(" 'm", "'m").replace(" do not", " don't"
-                    ).replace(" 's", "'s").replace(" 've", "'ve").replace(" 're", "'re")
-    return out_string
diff --git a/pytorch_transformers/modeling_xlm.py b/pytorch_transformers/modeling_xlm.py
index 6decba3cce..14f8848a42 100644
--- a/pytorch_transformers/modeling_xlm.py
+++ b/pytorch_transformers/modeling_xlm.py
@@ -35,7 +35,7 @@ from torch.nn import functional as F
 from torch.nn import CrossEntropyLoss, MSELoss
 
 from .file_utils import cached_path
-from .model_utils import (CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTrainedModel,
+from .modeling_utils import (CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTrainedModel,
                           prune_linear_layer, SequenceSummary, SQuADHead)
 
 logger = logging.getLogger(__name__)
diff --git a/pytorch_transformers/modeling_xlnet.py b/pytorch_transformers/modeling_xlnet.py
index f5841e0601..289dcbd9db 100644
--- a/pytorch_transformers/modeling_xlnet.py
+++ b/pytorch_transformers/modeling_xlnet.py
@@ -32,7 +32,7 @@ from torch.nn import functional as F
 from torch.nn import CrossEntropyLoss, MSELoss
 
 from .file_utils import cached_path
-from .model_utils import (CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTrainedModel,
+from .modeling_utils import (CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTrainedModel,
                           SequenceSummary, PoolerAnswerClass, PoolerEndLogits, PoolerStartLogits)
 
 
diff --git a/pytorch_transformers/tests/model_utils_test.py b/pytorch_transformers/tests/model_utils_test.py
deleted file mode 100644
index 120df35f82..0000000000
--- a/pytorch_transformers/tests/model_utils_test.py
+++ /dev/null
@@ -1,50 +0,0 @@
-# coding=utf-8
-# Copyright 2018 HuggingFace Inc..
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-import unittest
-import json
-import random
-import shutil
-import pytest
-
-import torch
-
-from pytorch_transformers import PretrainedConfig, PreTrainedModel
-from pytorch_transformers.modeling_bert import BertModel, BertConfig, PRETRAINED_MODEL_ARCHIVE_MAP, PRETRAINED_CONFIG_ARCHIVE_MAP
-
-
-class ModelUtilsTest(unittest.TestCase):
-    def test_model_from_pretrained(self):
-        for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            config = BertConfig.from_pretrained(model_name)
-            self.assertIsNotNone(config)
-            self.assertIsInstance(config, PretrainedConfig)
-
-            model = BertModel.from_pretrained(model_name)
-            self.assertIsNotNone(model)
-            self.assertIsInstance(model, PreTrainedModel)
-
-            config = BertConfig.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
-            model = BertModel.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
-            self.assertEqual(model.config.output_attentions, True)
-            self.assertEqual(model.config.output_hidden_states, True)
-            self.assertEqual(model.config, config)
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/pytorch_transformers/tests/modeling_bert_test.py b/pytorch_transformers/tests/modeling_bert_test.py
index b140f5e647..2ba59317be 100644
--- a/pytorch_transformers/tests/modeling_bert_test.py
+++ b/pytorch_transformers/tests/modeling_bert_test.py
@@ -26,7 +26,7 @@ from pytorch_transformers import (BertConfig, BertModel, BertForMaskedLM,
                                      BertForTokenClassification, BertForMultipleChoice)
 from pytorch_transformers.modeling_bert import PRETRAINED_MODEL_ARCHIVE_MAP
 
-from .model_tests_commons import (create_and_check_commons, ConfigTester, ids_tensor)
+from .modeling_tests_commons import (create_and_check_commons, ConfigTester, ids_tensor)
 
 
 class BertModelTest(unittest.TestCase):
diff --git a/pytorch_transformers/tests/modeling_gpt2_test.py b/pytorch_transformers/tests/modeling_gpt2_test.py
index 4ace52571a..7400c9f64d 100644
--- a/pytorch_transformers/tests/modeling_gpt2_test.py
+++ b/pytorch_transformers/tests/modeling_gpt2_test.py
@@ -28,7 +28,7 @@ import torch
 from pytorch_transformers import (GPT2Config, GPT2Model,
                                      GPT2LMHeadModel, GPT2DoubleHeadsModel)
 
-from .model_tests_commons import (create_and_check_commons, ConfigTester, GPTModelTester)
+from .modeling_tests_commons import (create_and_check_commons, ConfigTester, GPTModelTester)
 
 class GPT2ModelTest(unittest.TestCase):
 
diff --git a/pytorch_transformers/tests/modeling_openai_test.py b/pytorch_transformers/tests/modeling_openai_test.py
index fe81157023..27263ecb24 100644
--- a/pytorch_transformers/tests/modeling_openai_test.py
+++ b/pytorch_transformers/tests/modeling_openai_test.py
@@ -24,7 +24,7 @@ import torch
 from pytorch_transformers import (OpenAIGPTConfig, OpenAIGPTModel,
                                      OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel)
 
-from .model_tests_commons import (create_and_check_commons, ConfigTester, GPTModelTester)
+from .modeling_tests_commons import (create_and_check_commons, ConfigTester, GPTModelTester)
 
 class OpenAIModelTest(unittest.TestCase):
 
diff --git a/pytorch_transformers/tests/model_tests_commons.py b/pytorch_transformers/tests/modeling_tests_commons.py
similarity index 100%
rename from pytorch_transformers/tests/model_tests_commons.py
rename to pytorch_transformers/tests/modeling_tests_commons.py
diff --git a/pytorch_transformers/tests/modeling_transfo_xl_test.py b/pytorch_transformers/tests/modeling_transfo_xl_test.py
index d15a19eb64..f2906d879f 100644
--- a/pytorch_transformers/tests/modeling_transfo_xl_test.py
+++ b/pytorch_transformers/tests/modeling_transfo_xl_test.py
@@ -28,7 +28,7 @@ import torch
 from pytorch_transformers import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel)
 from pytorch_transformers.modeling_transfo_xl import PRETRAINED_MODEL_ARCHIVE_MAP
 
-from .model_tests_commons import ConfigTester, create_and_check_commons, ids_tensor
+from .modeling_tests_commons import ConfigTester, create_and_check_commons, ids_tensor
 
 class TransfoXLModelTest(unittest.TestCase):
     class TransfoXLModelTester(object):
diff --git a/examples/tests/examples_tests.py b/pytorch_transformers/tests/modeling_utils_test.py
similarity index 92%
rename from examples/tests/examples_tests.py
rename to pytorch_transformers/tests/modeling_utils_test.py
index 120df35f82..1866d35353 100644
--- a/examples/tests/examples_tests.py
+++ b/pytorch_transformers/tests/modeling_utils_test.py
@@ -16,17 +16,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os
 import unittest
-import json
-import random
-import shutil
-import pytest
-
-import torch
 
 from pytorch_transformers import PretrainedConfig, PreTrainedModel
-from pytorch_transformers.modeling_bert import BertModel, BertConfig, PRETRAINED_MODEL_ARCHIVE_MAP, PRETRAINED_CONFIG_ARCHIVE_MAP
+from pytorch_transformers.modeling_bert import BertModel, BertConfig, PRETRAINED_MODEL_ARCHIVE_MAP
 
 
 class ModelUtilsTest(unittest.TestCase):
diff --git a/pytorch_transformers/tests/modeling_xlm_test.py b/pytorch_transformers/tests/modeling_xlm_test.py
index 8a8905cc31..9c511f21a8 100644
--- a/pytorch_transformers/tests/modeling_xlm_test.py
+++ b/pytorch_transformers/tests/modeling_xlm_test.py
@@ -23,7 +23,7 @@ import pytest
 from pytorch_transformers import (XLMConfig, XLMModel, XLMWithLMHeadModel, XLMForQuestionAnswering, XLMForSequenceClassification)
 from pytorch_transformers.modeling_xlm import PRETRAINED_MODEL_ARCHIVE_MAP
 
-from .model_tests_commons import (create_and_check_commons, ConfigTester, ids_tensor)
+from .modeling_tests_commons import (create_and_check_commons, ConfigTester, ids_tensor)
 
 
 class XLMModelTest(unittest.TestCase):
diff --git a/pytorch_transformers/tests/modeling_xlnet_test.py b/pytorch_transformers/tests/modeling_xlnet_test.py
index b9d55a26c7..b762426d2c 100644
--- a/pytorch_transformers/tests/modeling_xlnet_test.py
+++ b/pytorch_transformers/tests/modeling_xlnet_test.py
@@ -28,7 +28,7 @@ import torch
 from pytorch_transformers import (XLNetConfig, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering)
 from pytorch_transformers.modeling_xlnet import PRETRAINED_MODEL_ARCHIVE_MAP
 
-from .model_tests_commons import ConfigTester, create_and_check_commons, ids_tensor
+from .modeling_tests_commons import ConfigTester, create_and_check_commons, ids_tensor
 
 class XLNetModelTest(unittest.TestCase):
     class XLNetModelTester(object):
diff --git a/pytorch_transformers/tests/tokenization_bert_test.py b/pytorch_transformers/tests/tokenization_bert_test.py
index 59a87a4cb9..37e20cc286 100644
--- a/pytorch_transformers/tests/tokenization_bert_test.py
+++ b/pytorch_transformers/tests/tokenization_bert_test.py
@@ -24,7 +24,7 @@ from pytorch_transformers.tokenization_bert import (BasicTokenizer,
                                                   BertTokenizer,
                                                   WordpieceTokenizer,
                                                   _is_control, _is_punctuation,
-                                                  _is_whitespace, PRETRAINED_VOCAB_ARCHIVE_MAP)
+                                                  _is_whitespace)
 
 from .tokenization_tests_commons import create_and_check_tokenizer_commons
 
@@ -49,14 +49,6 @@ class TokenizationTest(unittest.TestCase):
 
         os.remove(vocab_file)
 
-    @pytest.mark.slow
-    def test_tokenizer_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_transformers_test/"
-        for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:
-            tokenizer = BertTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
-            shutil.rmtree(cache_dir)
-            self.assertIsNotNone(tokenizer)
-
     def test_chinese(self):
         tokenizer = BasicTokenizer()
 
diff --git a/pytorch_transformers/tests/tokenization_gpt2_test.py b/pytorch_transformers/tests/tokenization_gpt2_test.py
index c6d926bdd4..8b06161b53 100644
--- a/pytorch_transformers/tests/tokenization_gpt2_test.py
+++ b/pytorch_transformers/tests/tokenization_gpt2_test.py
@@ -17,10 +17,8 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 import os
 import unittest
 import json
-import shutil
-import pytest
 
-from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP
+from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer
 
 from .tokenization_tests_commons import create_and_check_tokenizer_commons
 
@@ -56,13 +54,6 @@ class GPT2TokenizationTest(unittest.TestCase):
         os.remove(vocab_file)
         os.remove(merges_file)
 
-    # @pytest.mark.slow
-    def test_tokenizer_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_transformers_test/"
-        for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:
-            tokenizer = GPT2Tokenizer.from_pretrained(model_name, cache_dir=cache_dir)
-            shutil.rmtree(cache_dir)
-            self.assertIsNotNone(tokenizer)
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/pytorch_transformers/tests/tokenization_openai_test.py b/pytorch_transformers/tests/tokenization_openai_test.py
index 38315f927b..3f8c49f888 100644
--- a/pytorch_transformers/tests/tokenization_openai_test.py
+++ b/pytorch_transformers/tests/tokenization_openai_test.py
@@ -20,7 +20,7 @@ import json
 import shutil
 import pytest
 
-from pytorch_transformers.tokenization_openai import OpenAIGPTTokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP
+from pytorch_transformers.tokenization_openai import OpenAIGPTTokenizer
 
 from.tokenization_tests_commons import create_and_check_tokenizer_commons
 
@@ -58,14 +58,6 @@ class OpenAIGPTTokenizationTest(unittest.TestCase):
         self.assertListEqual(
             tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
 
-    @pytest.mark.slow
-    def test_tokenizer_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_transformers_test/"
-        for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:
-            tokenizer = OpenAIGPTTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
-            shutil.rmtree(cache_dir)
-            self.assertIsNotNone(tokenizer)
-
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/pytorch_transformers/tests/tokenization_transfo_xl_test.py b/pytorch_transformers/tests/tokenization_transfo_xl_test.py
index f744e319c8..f583e30b56 100644
--- a/pytorch_transformers/tests/tokenization_transfo_xl_test.py
+++ b/pytorch_transformers/tests/tokenization_transfo_xl_test.py
@@ -20,7 +20,7 @@ from io import open
 import shutil
 import pytest
 
-from pytorch_transformers.tokenization_transfo_xl import TransfoXLTokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP
+from pytorch_transformers.tokenization_transfo_xl import TransfoXLTokenizer
 
 from.tokenization_tests_commons import create_and_check_tokenizer_commons
 
@@ -59,13 +59,6 @@ class TransfoXLTokenizationTest(unittest.TestCase):
             tokenizer.tokenize(u" \tHeLLo ! how  \n Are yoU ?  "),
             ["HeLLo", "!", "how", "Are", "yoU", "?"])
 
-    @pytest.mark.slow
-    def test_tokenizer_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_transformers_test/"
-        for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:
-            tokenizer = TransfoXLTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
-            shutil.rmtree(cache_dir)
-            self.assertIsNotNone(tokenizer)
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/pytorch_transformers/tests/tokenization_utils_test.py b/pytorch_transformers/tests/tokenization_utils_test.py
new file mode 100644
index 0000000000..e8856d50c2
--- /dev/null
+++ b/pytorch_transformers/tests/tokenization_utils_test.py
@@ -0,0 +1,36 @@
+# coding=utf-8
+# Copyright 2018 HuggingFace Inc..
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import unittest
+
+from pytorch_transformers import PreTrainedTokenizer
+from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer
+
+class TokenizerUtilsTest(unittest.TestCase):
+    def check_tokenizer_from_pretrained(self, tokenizer_class):
+        s3_models = list(tokenizer_class.max_model_input_sizes.keys())
+        for model_name in s3_models[:1]:
+            tokenizer = tokenizer_class.from_pretrained(model_name)
+            self.assertIsNotNone(tokenizer)
+            self.assertIsInstance(tokenizer, PreTrainedTokenizer)
+
+    def test_pretrained_tokenizers(self):
+        self.check_tokenizer_from_pretrained(GPT2Tokenizer)
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/pytorch_transformers/tests/tokenization_xlm_test.py b/pytorch_transformers/tests/tokenization_xlm_test.py
index 9cc18f3d60..00d273a628 100644
--- a/pytorch_transformers/tests/tokenization_xlm_test.py
+++ b/pytorch_transformers/tests/tokenization_xlm_test.py
@@ -20,9 +20,9 @@ import json
 import shutil
 import pytest
 
-from pytorch_transformers.tokenization_xlm import XLMTokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP
+from pytorch_transformers.tokenization_xlm import XLMTokenizer
 
-from.tokenization_tests_commons import create_and_check_tokenizer_commons
+from .tokenization_tests_commons import create_and_check_tokenizer_commons
 
 class XLMTokenizationTest(unittest.TestCase):
 
@@ -57,14 +57,6 @@ class XLMTokenizationTest(unittest.TestCase):
         self.assertListEqual(
             tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
 
-    @pytest.mark.slow
-    def test_tokenizer_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_transformers_test/"
-        for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:
-            tokenizer = XLMTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
-            shutil.rmtree(cache_dir)
-            self.assertIsNotNone(tokenizer)
-
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/pytorch_transformers/tests/tokenization_xlnet_test.py b/pytorch_transformers/tests/tokenization_xlnet_test.py
index 4dd76e114b..6e81f214b7 100644
--- a/pytorch_transformers/tests/tokenization_xlnet_test.py
+++ b/pytorch_transformers/tests/tokenization_xlnet_test.py
@@ -19,9 +19,7 @@ import unittest
 import shutil
 import pytest
 
-from pytorch_transformers.tokenization_xlnet import (XLNetTokenizer,
-                                                        PRETRAINED_VOCAB_ARCHIVE_MAP,
-                                                        SPIECE_UNDERLINE)
+from pytorch_transformers.tokenization_xlnet import (XLNetTokenizer, SPIECE_UNDERLINE)
 
 from.tokenization_tests_commons import create_and_check_tokenizer_commons
 
@@ -60,14 +58,6 @@ class XLNetTokenizationTest(unittest.TestCase):
                                            SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's',
                                            u'<unk>', u'.'])
 
-    @pytest.mark.slow
-    def test_tokenizer_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_transformers_test/"
-        for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:
-            tokenizer = XLNetTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
-            shutil.rmtree(cache_dir)
-            self.assertIsNotNone(tokenizer)
-
     def test_tokenizer_lower(self):
         tokenizer = XLNetTokenizer(SAMPLE_VOCAB, do_lower_case=True)
         tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.")
diff --git a/pytorch_transformers/tokenization_bert.py b/pytorch_transformers/tokenization_bert.py
index c8db62b9c0..b26e5066e9 100644
--- a/pytorch_transformers/tokenization_bert.py
+++ b/pytorch_transformers/tokenization_bert.py
@@ -23,11 +23,15 @@ import unicodedata
 from io import open
 
 from .file_utils import cached_path
-from .model_utils import clean_up_tokenization
+from .tokenization_utils import PreTrainedTokenizer, clean_up_tokenization
 
 logger = logging.getLogger(__name__)
 
-PRETRAINED_VOCAB_ARCHIVE_MAP = {
+VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    'vocab_file':
+    {
     'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
     'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
     'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt",
@@ -41,8 +45,9 @@ PRETRAINED_VOCAB_ARCHIVE_MAP = {
     'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-vocab.txt",
     'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-vocab.txt",
     'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-vocab.txt",
-}
-PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
+}}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
     'bert-base-uncased': 512,
     'bert-large-uncased': 512,
     'bert-base-cased': 512,
@@ -57,7 +62,6 @@ PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
     'bert-large-cased-whole-word-masking-finetuned-squad': 512,
     'bert-base-cased-finetuned-mrpc': 512,
 }
-VOCAB_NAME = 'vocab.txt'
 
 def load_vocab(vocab_file):
     """Loads a vocabulary file into a dictionary."""
@@ -83,8 +87,11 @@ def whitespace_tokenize(text):
     return tokens
 
 
-class BertTokenizer(object):
+class BertTokenizer(PreTrainedTokenizer):
     """Runs end-to-end tokenization: punctuation splitting + wordpiece"""
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
 
     def __init__(self, vocab_file, do_lower_case=True, max_len=None, do_basic_tokenize=True,
                  never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
@@ -203,7 +210,7 @@ class BertTokenizer(object):
         """Save the tokenizer vocabulary to a directory or file."""
         index = 0
         if os.path.isdir(vocab_path):
-            vocab_file = os.path.join(vocab_path, VOCAB_NAME)
+            vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['vocab_file'])
         with open(vocab_file, "w", encoding="utf-8") as writer:
             for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
                 if index != token_index:
@@ -215,13 +222,10 @@ class BertTokenizer(object):
         return (vocab_file,)
 
     @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
+    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
+        """ Instantiate a BertTokenizer from pre-trained vocabulary files.
         """
-        Instantiate a PreTrainedBertModel from a pre-trained model file.
-        Download and cache the pre-trained model file if needed.
-        """
-        if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
-            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
+        if pretrained_model_name_or_path in PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES:
             if '-cased' in pretrained_model_name_or_path and kwargs.get('do_lower_case', True):
                 logger.warning("The pre-trained model you are loading is a cased model but you have not set "
                                "`do_lower_case` to False. We are setting `do_lower_case=False` for you but "
@@ -232,40 +236,8 @@ class BertTokenizer(object):
                                "`do_lower_case` to False. We are setting `do_lower_case=True` for you "
                                "but you may want to check this behavior.")
                 kwargs['do_lower_case'] = True
-        else:
-            vocab_file = pretrained_model_name_or_path
-        if os.path.isdir(vocab_file):
-            vocab_file = os.path.join(vocab_file, VOCAB_NAME)
-        # redirect to the cache, if necessary
-        try:
-            resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
-        except EnvironmentError:
-            if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
-                logger.error(
-                    "Couldn't reach server at '{}' to download vocabulary.".format(
-                        vocab_file))
-            else:
-                logger.error(
-                    "Model name '{}' was not found in model name list ({}). "
-                    "We assumed '{}' was a path or url but couldn't find any file "
-                    "associated to this path or url.".format(
-                        pretrained_model_name_or_path,
-                        ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
-                        vocab_file))
-            return None
-        if resolved_vocab_file == vocab_file:
-            logger.info("loading vocabulary file {}".format(vocab_file))
-        else:
-            logger.info("loading vocabulary file {} from cache at {}".format(
-                vocab_file, resolved_vocab_file))
-        if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
-            # if we're using a pretrained model, ensure the tokenizer wont index sequences longer
-            # than the number of positional embeddings
-            max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
-            kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
-        # Instantiate tokenizer.
-        tokenizer = cls(resolved_vocab_file, *inputs, **kwargs)
-        return tokenizer
+
+        return super(BertTokenizer, cls)._from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
 
 
 class BasicTokenizer(object):
diff --git a/pytorch_transformers/tokenization_gpt2.py b/pytorch_transformers/tokenization_gpt2.py
index 2947ce66b8..abdfe39c1c 100644
--- a/pytorch_transformers/tokenization_gpt2.py
+++ b/pytorch_transformers/tokenization_gpt2.py
@@ -23,8 +23,6 @@ import os
 import regex as re
 from io import open
 
-from .model_utils import clean_up_tokenization
-
 try:
     from functools import lru_cache
 except ImportError:
@@ -33,24 +31,38 @@ except ImportError:
     def lru_cache():
         return lambda func: func
 
-from .file_utils import cached_path
+from .tokenization_utils import PreTrainedTokenizer, clean_up_tokenization
 
 logger = logging.getLogger(__name__)
 
-PRETRAINED_VOCAB_ARCHIVE_MAP = {
-    'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json",
-    'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-vocab.json",
+VOCAB_FILES_NAMES = {
+    'vocab_file': 'vocab.json',
+    'merges_file': 'merges.txt',
+    'special_tokens_file': 'special_tokens.txt'
 }
-PRETRAINED_MERGES_ARCHIVE_MAP = {
-    'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt",
-    'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-merges.txt",
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    'vocab_file':
+    {
+        'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json",
+        'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-vocab.json",
+    },
+    'merges_file':
+    {
+        'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt",
+        'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-merges.txt",
+    },
+    'special_tokens_file':
+    {
+        'gpt2': None,
+        'gpt2-medium': None,
+    }
 }
-PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
     'gpt2': 1024,
+    'gpt2-medium': 1024,
 }
-VOCAB_NAME = 'vocab.json'
-MERGES_NAME = 'merges.txt'
-SPECIAL_TOKENS_NAME = 'special_tokens.txt'
 
 @lru_cache()
 def bytes_to_unicode():
@@ -87,70 +99,16 @@ def get_pairs(word):
         prev_char = char
     return pairs
 
-class GPT2Tokenizer(object):
+class GPT2Tokenizer(PreTrainedTokenizer):
     """
     GPT-2 BPE tokenizer. Peculiarities:
         - Byte-level BPE
     """
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
-        """
-        Instantiate a GPT2Tokenizer from a pre-trained model file.
-        Download and cache the pre-trained model file if needed.
-        """
-        if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
-            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
-            merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name_or_path]
-            special_tokens_file = None
-        else:
-            vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)
-            merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME)
-            special_tokens_file = os.path.join(pretrained_model_name_or_path, SPECIAL_TOKENS_NAME)
-            if not os.path.exists(special_tokens_file):
-                special_tokens_file = None
-            else:
-                logger.info("loading special tokens file {}".format(special_tokens_file))
-        # redirect to the cache, if necessary
-        try:
-            resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
-            resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir)
-        except EnvironmentError:
-            if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
-                logger.error(
-                    "Couldn't reach server at '{}' to download vocabulary.".format(
-                        vocab_file))
-            else:
-                logger.error(
-                    "Model name '{}' was not found in model name list ({}). "
-                    "We assumed '{}' was a path or url but couldn't find files {} and {} "
-                    "at this path or url.".format(
-                        pretrained_model_name_or_path,
-                        ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
-                        pretrained_model_name_or_path,
-                        vocab_file, merges_file))
-            return None
-        if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file:
-            logger.info("loading vocabulary file {}".format(vocab_file))
-            logger.info("loading merges file {}".format(merges_file))
-        else:
-            logger.info("loading vocabulary file {} from cache at {}".format(
-                vocab_file, resolved_vocab_file))
-            logger.info("loading merges file {} from cache at {}".format(
-                merges_file, resolved_merges_file))
-        if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
-            # if we're using a pretrained model, ensure the tokenizer wont index sequences longer
-            # than the number of positional embeddings
-            max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
-            kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
-        # Instantiate tokenizer.
-        if special_tokens_file and 'special_tokens' not in kwargs:
-            special_tokens = open(special_tokens_file, encoding='utf-8').read().split('\n')[:-1]
-        else:
-            special_tokens = kwargs.pop('special_tokens', [])
-        tokenizer = cls(resolved_vocab_file, resolved_merges_file, special_tokens=special_tokens, *inputs, **kwargs)
-        return tokenizer
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
 
-    def __init__(self, vocab_file, merges_file, errors='replace', special_tokens=None, max_len=None):
+    def __init__(self, vocab_file, merges_file, special_tokens_file=None, special_tokens=None, errors='replace', max_len=None):
         self.max_len = max_len if max_len is not None else int(1e12)
         self.encoder = json.load(open(vocab_file))
         self.decoder = {v:k for k,v in self.encoder.items()}
@@ -165,9 +123,16 @@ class GPT2Tokenizer(object):
         # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
         self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
 
+        all_special_tokens = []
+        if special_tokens_file is not None:
+            special_tokens_to_add = open(special_tokens_file, encoding='utf-8').read().split('\n')[:-1]
+            all_special_tokens.extend(special_tokens_to_add)
+        if special_tokens is not None and special_tokens:
+            all_special_tokens.extend(special_tokens)
+
         self.special_tokens = {}
         self.special_tokens_decoder = {}
-        self.set_special_tokens(special_tokens)
+        self.set_special_tokens(all_special_tokens)
 
     def __len__(self):
         return len(self.encoder) + len(self.special_tokens)
@@ -285,9 +250,9 @@ class GPT2Tokenizer(object):
         if not os.path.isdir(vocab_path):
             logger.error("Vocabulary path ({}) should be a directory".format(vocab_path))
             return
-        vocab_file = os.path.join(vocab_path, VOCAB_NAME)
-        merge_file = os.path.join(vocab_path, MERGES_NAME)
-        special_tokens_file = os.path.join(vocab_path, SPECIAL_TOKENS_NAME)
+        vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['vocab_file'])
+        merge_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['merges_file'])
+        special_tokens_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['special_tokens_file'])
 
         with open(vocab_file, 'w', encoding='utf-8') as f:
             f.write(json.dumps(self.encoder, ensure_ascii=False))
diff --git a/pytorch_transformers/tokenization_openai.py b/pytorch_transformers/tokenization_openai.py
index 7d005a8260..419dfdad92 100644
--- a/pytorch_transformers/tokenization_openai.py
+++ b/pytorch_transformers/tokenization_openai.py
@@ -26,23 +26,35 @@ from io import open
 from tqdm import tqdm
 
 from .file_utils import cached_path
-from .model_utils import clean_up_tokenization
+from .tokenization_utils import PreTrainedTokenizer, clean_up_tokenization
 from .tokenization_bert import BasicTokenizer
 
 logger = logging.getLogger(__name__)
 
-PRETRAINED_VOCAB_ARCHIVE_MAP = {
-    'openai-gpt': "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json",
+VOCAB_FILES_NAMES = {
+    'vocab_file': 'vocab.json',
+    'merges_file': 'merges.txt',
+    'special_tokens_file': 'special_tokens.txt'
 }
-PRETRAINED_MERGES_ARCHIVE_MAP = {
-    'openai-gpt': "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt",
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    'vocab_file':
+    {
+        'openai-gpt': "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json",
+    },
+    'merges_file':
+    {
+        'openai-gpt': "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt",
+    },
+    'special_tokens_file':
+    {
+        'openai-gpt': None,
+    }
 }
-PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
     'openai-gpt': 512,
 }
-VOCAB_NAME = 'vocab.json'
-MERGES_NAME = 'merges.txt'
-SPECIAL_TOKENS_NAME = 'special_tokens.txt'
 
 def get_pairs(word):
     """
@@ -71,7 +83,7 @@ def text_standardize(text):
     text = re.sub(r'[^\S\n]+', ' ', text)
     return text.strip()
 
-class OpenAIGPTTokenizer(object):
+class OpenAIGPTTokenizer(PreTrainedTokenizer):
     """
     BPE tokenizer. Peculiarities:
         - lower case all inputs
@@ -79,65 +91,11 @@ class OpenAIGPTTokenizer(object):
         - argument special_tokens and function set_special_tokens:
             can be used to add additional symbols (ex: "__classify__") to a vocabulary.
     """
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
-        """
-        Instantiate a PreTrainedBertModel from a pre-trained model file.
-        Download and cache the pre-trained model file if needed.
-        """
-        if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
-            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
-            merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name_or_path]
-            special_tokens_file = None
-        else:
-            vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)
-            merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME)
-            special_tokens_file = os.path.join(pretrained_model_name_or_path, SPECIAL_TOKENS_NAME)
-            if not os.path.exists(special_tokens_file):
-                special_tokens_file = None
-            else:
-                logger.info("loading special tokens file {}".format(special_tokens_file))
-        # redirect to the cache, if necessary
-        try:
-            resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
-            resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir)
-        except EnvironmentError:
-            if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
-                logger.error(
-                    "Couldn't reach server at '{}' to download vocabulary.".format(
-                        vocab_file))
-            else:
-                logger.error(
-                    "Model name '{}' was not found in model name list ({}). "
-                    "We assumed '{}' was a path or url but couldn't find files {} and {} "
-                    "at this path or url.".format(
-                        pretrained_model_name_or_path,
-                        ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
-                        pretrained_model_name_or_path,
-                        vocab_file, merges_file))
-            return None
-        if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file:
-            logger.info("loading vocabulary file {}".format(vocab_file))
-            logger.info("loading merges file {}".format(merges_file))
-        else:
-            logger.info("loading vocabulary file {} from cache at {}".format(
-                vocab_file, resolved_vocab_file))
-            logger.info("loading merges file {} from cache at {}".format(
-                merges_file, resolved_merges_file))
-        if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
-            # if we're using a pretrained model, ensure the tokenizer wont index sequences longer
-            # than the number of positional embeddings
-            max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
-            kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
-        # Instantiate tokenizer.
-        if special_tokens_file and 'special_tokens' not in kwargs:
-            special_tokens = open(special_tokens_file, encoding='utf-8').read().split('\n')[:-1]
-        else:
-            special_tokens = kwargs.pop('special_tokens', [])
-        tokenizer = cls(resolved_vocab_file, resolved_merges_file, special_tokens=special_tokens, *inputs, **kwargs)
-        return tokenizer
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
 
-    def __init__(self, vocab_file, merges_file, special_tokens=None, max_len=None):
+    def __init__(self, vocab_file, merges_file, special_tokens_file=None, special_tokens=None, max_len=None):
         try:
             import ftfy
             import spacy
@@ -156,9 +114,17 @@ class OpenAIGPTTokenizer(object):
         merges = [tuple(merge.split()) for merge in merges]
         self.bpe_ranks = dict(zip(merges, range(len(merges))))
         self.cache = {}
+
+        all_special_tokens = []
+        if special_tokens_file is not None:
+            special_tokens_to_add = open(special_tokens_file, encoding='utf-8').read().split('\n')[:-1]
+            all_special_tokens.extend(special_tokens_to_add)
+        if special_tokens is not None and special_tokens:
+            all_special_tokens.extend(special_tokens)
+
         self.special_tokens = {}
         self.special_tokens_decoder = {}
-        self.set_special_tokens(special_tokens)
+        self.set_special_tokens(all_special_tokens)
 
     def __len__(self):
         return len(self.encoder) + len(self.special_tokens)
@@ -286,9 +252,9 @@ class OpenAIGPTTokenizer(object):
         if not os.path.isdir(vocab_path):
             logger.error("Vocabulary path ({}) should be a directory".format(vocab_path))
             return
-        vocab_file = os.path.join(vocab_path, VOCAB_NAME)
-        merge_file = os.path.join(vocab_path, MERGES_NAME)
-        special_tokens_file = os.path.join(vocab_path, SPECIAL_TOKENS_NAME)
+        vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['vocab_file'])
+        merge_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['merges_file'])
+        special_tokens_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['special_tokens_file'])
 
         with open(vocab_file, 'w', encoding='utf-8') as f:
             f.write(json.dumps(self.encoder, ensure_ascii=False))
diff --git a/pytorch_transformers/tokenization_transfo_xl.py b/pytorch_transformers/tokenization_transfo_xl.py
index 7e83680770..a86c8fe460 100644
--- a/pytorch_transformers/tokenization_transfo_xl.py
+++ b/pytorch_transformers/tokenization_transfo_xl.py
@@ -31,7 +31,7 @@ import torch
 import numpy as np
 
 from .file_utils import cached_path
-from .model_utils import clean_up_tokenization
+from .tokenization_utils import PreTrainedTokenizer, clean_up_tokenization
 
 if sys.version_info[0] == 2:
     import cPickle as pickle
@@ -41,66 +41,35 @@ else:
 
 logger = logging.getLogger(__name__)
 
-PRETRAINED_VOCAB_ARCHIVE_MAP = {
-    'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-vocab.bin",
+VOCAB_FILES_NAMES = {'pretrained_vocab_file': 'vocab.bin'}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    'pretrained_vocab_file':
+    {
+        'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-vocab.bin",
+    }
+}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    'transfo-xl-wt103': 512,
 }
-VOCAB_NAME = 'vocab.bin'
 
 PRETRAINED_CORPUS_ARCHIVE_MAP = {
     'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-corpus.bin",
 }
 CORPUS_NAME = 'corpus.bin'
 
-class TransfoXLTokenizer(object):
+class TransfoXLTokenizer(PreTrainedTokenizer):
     """
     Transformer-XL tokenizer adapted from Vocab class in https://github.com/kimiyoung/transformer-xl
     """
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
-        """
-        Instantiate a TransfoXLTokenizer.
-        The TransfoXLTokenizer.
-        """
-        if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
-            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
-        else:
-            if os.path.isdir(pretrained_model_name_or_path):
-                vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)
-            else:
-                vocab_file = pretrained_model_name_or_path
-        # redirect to the cache, if necessary
-        try:
-            resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
-        except EnvironmentError:
-            if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
-                logger.error(
-                    "Couldn't reach server at '{}' to download vocabulary.".format(
-                        vocab_file))
-            else:
-                logger.error(
-                    "Model name '{}' was not found in model name list ({}). "
-                    "We assumed '{}' was a path or url but couldn't find files {} "
-                    "at this path or url.".format(
-                        pretrained_model_name_or_path,
-                        ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
-                        pretrained_model_name_or_path,
-                        vocab_file))
-            return None
-        if resolved_vocab_file == vocab_file:
-            logger.info("loading vocabulary file {}".format(vocab_file))
-        else:
-            logger.info("loading vocabulary file {} from cache at {}".format(
-                vocab_file, resolved_vocab_file))
-
-        # Instantiate tokenizer.
-        tokenizer = cls(*inputs, **kwargs)
-        vocab_dict = torch.load(resolved_vocab_file)
-        for key, value in vocab_dict.items():
-            tokenizer.__dict__[key] = value
-        return tokenizer
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
 
     def __init__(self, special=[], min_freq=0, max_size=None, lower_case=False,
-                 delimiter=None, vocab_file=None, never_split=("<unk>", "<eos>", "<formula>")):
+                 delimiter=None, vocab_file=None, pretrained_vocab_file=None,
+                 never_split=("<unk>", "<eos>", "<formula>")):
         self.counter = Counter()
         self.special = special
         self.min_freq = min_freq
@@ -110,6 +79,13 @@ class TransfoXLTokenizer(object):
         self.vocab_file = vocab_file
         self.never_split = never_split
 
+        if pretrained_vocab_file is not None:
+            # Hack because, honestly this tokenizer was not made to be used
+            # in a library like ours, at all.
+            vocab_dict = torch.load(pretrained_vocab_file)
+            for key, value in vocab_dict.items():
+                self.__dict__[key] = value
+
         if vocab_file is not None:
             self.build_vocab()
 
@@ -157,7 +133,7 @@ class TransfoXLTokenizer(object):
         """Save the tokenizer vocabulary to a directory or file."""
         index = 0
         if os.path.isdir(vocab_path):
-            vocab_file = os.path.join(vocab_path, VOCAB_NAME)
+            vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['pretrained_vocab_file'])
         torch.save(self.__dict__, vocab_file)
         return (vocab_file,)
 
@@ -484,7 +460,7 @@ class TransfoXLCorpus(object):
                 "We assumed '{}' was a path or url but couldn't find files {} "
                 "at this path or url.".format(
                     pretrained_model_name_or_path,
-                    ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
+                    ', '.join(PRETRAINED_CORPUS_ARCHIVE_MAP.keys()),
                     pretrained_model_name_or_path,
                     corpus_file))
             return None
diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py
new file mode 100644
index 0000000000..98a2968539
--- /dev/null
+++ b/pytorch_transformers/tokenization_utils.py
@@ -0,0 +1,114 @@
+# coding=utf-8
+# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for OpenAI GPT."""
+from __future__ import (absolute_import, division, print_function,
+                        unicode_literals)
+
+import sys
+import json
+import logging
+import os
+import regex as re
+from io import open
+
+try:
+    from functools import lru_cache
+except ImportError:
+    # Just a dummy decorator to get the checks to run on python2
+    # because honestly I don't want to support a byte-level unicode BPE tokenizer on python 2 right now.
+    def lru_cache():
+        return lambda func: func
+
+from .file_utils import cached_path
+
+logger = logging.getLogger(__name__)
+
+
+class PreTrainedTokenizer(object):
+    """ An abstract class to handle dowloading and loading pretrained tokenizers.
+    """
+    vocab_files_names = {}
+    pretrained_vocab_files_map = {}
+    max_model_input_sizes = {}
+
+    @classmethod
+    def from_pretrained(cls, *inputs, **kwargs):
+        return cls._from_pretrained(*inputs, **kwargs)
+
+    @classmethod
+    def _from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
+        """
+        Instantiate a PreTrainedTokenizer from pre-trained vocabulary files.
+        Download and cache the vocabulary files if needed.
+        """
+        s3_models = list(cls.max_model_input_sizes.keys())
+        vocab_files = {}
+        if pretrained_model_name_or_path in s3_models:
+            for file_id, map_list in cls.pretrained_vocab_files_map.items():
+                vocab_files[file_id] = map_list[pretrained_model_name_or_path]
+        else:
+            for file_id, file_name in cls.vocab_files_names.items():
+                if os.path.isdir(pretrained_model_name_or_path):
+                    full_file_name = os.path.join(pretrained_model_name_or_path, file_name)
+                else:
+                    full_file_name = pretrained_model_name_or_path
+                if not os.path.exists(full_file_name):
+                    logger.info("Didn't find file {}. We don't load it.".format(full_file_name))
+                    full_file_name = None
+                vocab_files[file_id] = full_file_name
+        # redirect to the cache, if necessary
+        try:
+            resolved_vocab_files = {}
+            for file_id, file_path in vocab_files.items():
+                if file_path is None:
+                    resolved_vocab_files[file_id] = None
+                else:
+                    resolved_vocab_files[file_id] = cached_path(file_path, cache_dir=cache_dir)
+        except EnvironmentError:
+            if pretrained_model_name_or_path in s3_models:
+                logger.error("Couldn't reach server to download vocabulary.")
+            else:
+                logger.error(
+                    "Model name '{}' was not found in model name list ({}). "
+                    "We assumed '{}' was a path or url but couldn't find files {} "
+                    "at this path or url.".format(
+                        pretrained_model_name_or_path, ', '.join(s3_models),
+                        pretrained_model_name_or_path, str(vocab_files.keys())))
+            return None
+
+        for file_id, file_path in vocab_files.items():
+            if file_path == resolved_vocab_files[file_id]:
+                logger.info("loading file {}".format(file_path))
+            else:
+                logger.info("loading file {} from cache at {}".format(
+                    file_path, resolved_vocab_files[file_id]))
+
+        if pretrained_model_name_or_path in cls.max_model_input_sizes:
+            # if we're using a pretrained model, ensure the tokenizer
+            # wont index sequences longer than the number of positional embeddings
+            max_len = cls.max_model_input_sizes[pretrained_model_name_or_path]
+            kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
+
+        # Instantiate tokenizer.
+        tokenizer = cls(*inputs, **resolved_vocab_files, **kwargs)
+
+        return tokenizer
+
+
+def clean_up_tokenization(out_string):
+    out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ','
+                    ).replace(" ' ", "'").replace(" n't", "n't").replace(" 'm", "'m").replace(" do not", " don't"
+                    ).replace(" 's", "'s").replace(" 've", "'ve").replace(" 're", "'re")
+    return out_string
diff --git a/pytorch_transformers/tokenization_xlm.py b/pytorch_transformers/tokenization_xlm.py
index 26c73c56b2..e37f3888a3 100644
--- a/pytorch_transformers/tokenization_xlm.py
+++ b/pytorch_transformers/tokenization_xlm.py
@@ -26,30 +26,42 @@ from io import open
 from tqdm import tqdm
 
 from .file_utils import cached_path
-from .model_utils import clean_up_tokenization
+from .tokenization_utils import PreTrainedTokenizer, clean_up_tokenization
 from .tokenization_bert import BasicTokenizer
 
 logger = logging.getLogger(__name__)
 
-PRETRAINED_VOCAB_ARCHIVE_MAP = {
-    'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-vocab.json",
+VOCAB_FILES_NAMES = {
+    'vocab_file': 'vocab.json',
+    'merges_file': 'merges.txt',
+    'special_tokens_file': 'special_tokens.txt'
 }
-PRETRAINED_MERGES_ARCHIVE_MAP = {
-    'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-merges.txt",
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    'vocab_file':
+    {
+        'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-vocab.json",
+    },
+    'merges_file':
+    {
+        'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-merges.txt",
+    },
+    'special_tokens_file':
+    {
+        'xlm-mlm-en-2048': None,
+    }
 }
-PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
     'xlm-mlm-en-2048': 512,
 }
-VOCAB_NAME = 'vocab.json'
-MERGES_NAME = 'merges.txt'
-SPECIAL_TOKENS_NAME = 'special_tokens.txt'
 
-INDEX= {
-  "bos_index": 0,
-  "eos_index": 1,
-  "pad_index": 2,
-  "unk_index": 3,
-  "mask_index": 5
+INDEX = {
+    "bos_index": 0,
+    "eos_index": 1,
+    "pad_index": 2,
+    "unk_index": 3,
+    "mask_index": 5
 }
 
 def get_pairs(word):
@@ -79,7 +91,7 @@ def text_standardize(text):
     text = re.sub(r'[^\S\n]+', ' ', text)
     return text.strip()
 
-class XLMTokenizer(object):
+class XLMTokenizer(PreTrainedTokenizer):
     """
     BPE tokenizer for XLM, adapted from OpenAI BPE tokenizer. Peculiarities:
         - lower case all inputs
@@ -87,65 +99,11 @@ class XLMTokenizer(object):
         - argument special_tokens and function set_special_tokens:
             can be used to add additional symbols (ex: "__classify__") to a vocabulary.
     """
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
-        """
-        Instantiate a PreTrainedBertModel from a pre-trained model file.
-        Download and cache the pre-trained model file if needed.
-        """
-        if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
-            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
-            merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name_or_path]
-            special_tokens_file = None
-        else:
-            vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)
-            merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME)
-            special_tokens_file = os.path.join(pretrained_model_name_or_path, SPECIAL_TOKENS_NAME)
-            if not os.path.exists(special_tokens_file):
-                special_tokens_file = None
-            else:
-                logger.info("loading special tokens file {}".format(special_tokens_file))
-        # redirect to the cache, if necessary
-        try:
-            resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
-            resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir)
-        except EnvironmentError:
-            if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
-                logger.error(
-                    "Couldn't reach server at '{}' to download vocabulary.".format(
-                        vocab_file))
-            else:
-                logger.error(
-                    "Model name '{}' was not found in model name list ({}). "
-                    "We assumed '{}' was a path or url but couldn't find files {} and {} "
-                    "at this path or url.".format(
-                        pretrained_model_name_or_path,
-                        ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
-                        pretrained_model_name_or_path,
-                        vocab_file, merges_file))
-            return None
-        if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file:
-            logger.info("loading vocabulary file {}".format(vocab_file))
-            logger.info("loading merges file {}".format(merges_file))
-        else:
-            logger.info("loading vocabulary file {} from cache at {}".format(
-                vocab_file, resolved_vocab_file))
-            logger.info("loading merges file {} from cache at {}".format(
-                merges_file, resolved_merges_file))
-        if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
-            # if we're using a pretrained model, ensure the tokenizer wont index sequences longer
-            # than the number of positional embeddings
-            max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
-            kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
-        # Instantiate tokenizer.
-        if special_tokens_file and 'special_tokens' not in kwargs:
-            special_tokens = open(special_tokens_file, encoding='utf-8').read().split('\n')[:-1]
-        else:
-            special_tokens = kwargs.pop('special_tokens', [])
-        tokenizer = cls(resolved_vocab_file, resolved_merges_file, special_tokens=special_tokens, *inputs, **kwargs)
-        return tokenizer
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
 
-    def __init__(self, vocab_file, merges_file, special_tokens=None, max_len=None):
+    def __init__(self, vocab_file, merges_file, special_tokens_file=None, special_tokens=None, max_len=None):
         try:
             import ftfy
             import spacy
@@ -164,9 +122,17 @@ class XLMTokenizer(object):
         merges = [tuple(merge.split()[:2]) for merge in merges]
         self.bpe_ranks = dict(zip(merges, range(len(merges))))
         self.cache = {}
+
+        all_special_tokens = []
+        if special_tokens_file is not None:
+            special_tokens_to_add = open(special_tokens_file, encoding='utf-8').read().split('\n')[:-1]
+            all_special_tokens.extend(special_tokens_to_add)
+        if special_tokens is not None and special_tokens:
+            all_special_tokens.extend(special_tokens)
+
         self.special_tokens = {}
         self.special_tokens_decoder = {}
-        self.set_special_tokens(special_tokens)
+        self.set_special_tokens(all_special_tokens)
 
     def __len__(self):
         return len(self.encoder) + len(self.special_tokens)
@@ -294,9 +260,9 @@ class XLMTokenizer(object):
         if not os.path.isdir(vocab_path):
             logger.error("Vocabulary path ({}) should be a directory".format(vocab_path))
             return
-        vocab_file = os.path.join(vocab_path, VOCAB_NAME)
-        merge_file = os.path.join(vocab_path, MERGES_NAME)
-        special_tokens_file = os.path.join(vocab_path, SPECIAL_TOKENS_NAME)
+        vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['vocab_file'])
+        merge_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['merges_file'])
+        special_tokens_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['special_tokens_file'])
 
         with open(vocab_file, 'w', encoding='utf-8') as f:
             f.write(json.dumps(self.encoder, ensure_ascii=False))
diff --git a/pytorch_transformers/tokenization_xlnet.py b/pytorch_transformers/tokenization_xlnet.py
index 76b9a9f870..a30e6db8da 100644
--- a/pytorch_transformers/tokenization_xlnet.py
+++ b/pytorch_transformers/tokenization_xlnet.py
@@ -27,15 +27,24 @@ import unicodedata
 import six
 
 from .file_utils import cached_path
-from .model_utils import clean_up_tokenization
+from .tokenization_utils import PreTrainedTokenizer, clean_up_tokenization
 
 logger = logging.getLogger(__name__)
 
-PRETRAINED_VOCAB_ARCHIVE_MAP = {
+VOCAB_FILES_NAMES = {'vocab_file': 'spiece.model'}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    'vocab_file':
+    {
     'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-spiece.model",
+    }
 }
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    'xlnet-large-cased': 512,
+}
+
 VOCAB_NAME = 'spiece.model'
-SPECIAL_TOKENS_NAME = 'special_tokens.txt'
 
 SPIECE_UNDERLINE = u'▁'
 
@@ -46,7 +55,7 @@ SEG_ID_CLS = 2
 SEG_ID_SEP = 3
 SEG_ID_PAD = 4
 
-class XLNetTokenizer(object):
+class XLNetTokenizer(PreTrainedTokenizer):
     """
         SentencePiece based tokenizer. Peculiarities:
             - requires SentencePiece: https://github.com/google/sentencepiece
@@ -63,64 +72,11 @@ class XLNetTokenizer(object):
         "<eod>"  : 7,
         "<eop>"  : 8,
     }
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
-        """
-        Instantiate a PreTrainedBertModel from a pre-trained model file.
-        Download and cache the pre-trained model file if needed.
-        """
-        if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
-            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
-            special_tokens_file = None
-            if '-cased' in pretrained_model_name_or_path and kwargs.get('do_lower_case', True):
-                logger.warning("The pre-trained model you are loading is a cased model but you have not set "
-                               "`do_lower_case` to False. We are setting `do_lower_case=False` for you but "
-                               "you may want to check this behavior.")
-                kwargs['do_lower_case'] = False
-            elif '-cased' not in pretrained_model_name_or_path and not kwargs.get('do_lower_case', True):
-                logger.warning("The pre-trained model you are loading is an uncased model but you have set "
-                               "`do_lower_case` to False. We are setting `do_lower_case=True` for you "
-                               "but you may want to check this behavior.")
-                kwargs['do_lower_case'] = True
-        else:
-            vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)
-            special_tokens_file = os.path.join(pretrained_model_name_or_path, SPECIAL_TOKENS_NAME)
-            if not os.path.exists(special_tokens_file):
-                special_tokens_file = None
-            else:
-                logger.info("loading special tokens file {}".format(special_tokens_file))
-        # redirect to the cache, if necessary
-        try:
-            resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
-        except EnvironmentError:
-            if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
-                logger.error(
-                    "Couldn't reach server at '{}' to download vocabulary.".format(
-                        vocab_file))
-            else:
-                logger.error(
-                    "Model name '{}' was not found in model name list ({}). "
-                    "We assumed '{}' was a path or url but couldn't find files {}"
-                    "at this path or url.".format(
-                        pretrained_model_name_or_path,
-                        ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
-                        pretrained_model_name_or_path,
-                        vocab_file))
-            return None
-        if resolved_vocab_file == vocab_file:
-            logger.info("loading vocabulary file {}".format(vocab_file))
-        else:
-            logger.info("loading vocabulary file {} from cache at {}".format(
-                vocab_file, resolved_vocab_file))
-        # Instantiate tokenizer.
-        if special_tokens_file and 'special_tokens' not in kwargs:
-            special_tokens = open(special_tokens_file, encoding='utf-8').read().split('\n')[:-1]
-        else:
-            special_tokens = kwargs.pop('special_tokens', [])
-        tokenizer = cls(resolved_vocab_file, special_tokens=special_tokens, *inputs, **kwargs)
-        return tokenizer
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
 
-    def __init__(self, vocab_file, special_tokens=None, max_len=None,
+    def __init__(self, vocab_file, max_len=None,
                  do_lower_case=False, remove_space=True, keep_accents=False):
         try:
             import sentencepiece as spm
@@ -136,9 +92,6 @@ class XLNetTokenizer(object):
 
         self.sp_model = spm.SentencePieceProcessor()
         self.sp_model.Load(vocab_file)
-        self.special_tokens = {}
-        self.special_tokens_decoder = {}
-        self.set_special_tokens(special_tokens)
 
     @property
     def UNK_TOKEN(self):
@@ -181,7 +134,7 @@ class XLNetTokenizer(object):
         return self.special_symbols["<mask>"]
 
     def __len__(self):
-        return len(self.encoder) + len(self.special_tokens)
+        return len(self.sp_model)
 
     def __getstate__(self):
         state = self.__dict__.copy()
@@ -198,19 +151,6 @@ class XLNetTokenizer(object):
         self.sp_model = spm.SentencePieceProcessor()
         self.sp_model.Load(self.vocab_file)
 
-    def set_special_tokens(self, special_tokens):
-        """ Add a list of additional tokens to the encoder.
-            The additional tokens are indexed starting from the last index of the
-            current vocabulary in the order of the `special_tokens` list.
-        """
-        if not special_tokens:
-            self.special_tokens = {}
-            self.special_tokens_decoder = {}
-            return
-        self.special_tokens = dict((tok, len(self.sp_model) + i) for i, tok in enumerate(special_tokens))
-        self.special_tokens_decoder = {v:k for k, v in self.special_tokens.items()}
-        logger.info("Special tokens: %s", str(self.special_tokens))
-
     def preprocess_text(self, inputs):
         if self.remove_space:
             outputs = ' '.join(inputs.strip().split())
@@ -272,15 +212,9 @@ class XLNetTokenizer(object):
         """ Converts a sequence of tokens into ids using the vocab. """
         ids = []
         if isinstance(tokens, str) or (sys.version_info[0] == 2 and isinstance(tokens, unicode)):
-            if tokens in self.special_tokens:
-                return self.special_tokens[tokens]
-            else:
-                return self.sp_model.PieceToId(tokens)
+            return self.sp_model.PieceToId(tokens)
         for token in tokens:
-            if token in self.special_tokens:
-                ids.append(self.special_tokens[token])
-            else:
-                ids.append(self.sp_model.PieceToId(token))
+            ids.append(self.sp_model.PieceToId(token))
         if len(ids) > self.max_len:
             logger.warning(
                 "Token indices sequence length is longer than the specified maximum "
@@ -289,15 +223,11 @@ class XLNetTokenizer(object):
             )
         return ids
 
-    def convert_ids_to_tokens(self, ids, return_unicode=True, skip_special_tokens=False):
+    def convert_ids_to_tokens(self, ids, return_unicode=True):
         """Converts a sequence of ids in tokens."""
         tokens = []
         for i in ids:
-            if i in self.special_tokens_decoder:
-                if not skip_special_tokens:
-                    tokens.append(self.special_tokens_decoder[i])
-            else:
-                tokens.append(self.sp_model.IdToPiece(i))
+            tokens.append(self.sp_model.IdToPiece(i))
 
         if six.PY2 and return_unicode:
             ret_pieces = []
@@ -311,9 +241,9 @@ class XLNetTokenizer(object):
     def encode(self, text, sample=False):
         return self.convert_tokens_to_ids(self.tokenize(text, sample=sample))
 
-    def decode(self, ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
+    def decode(self, ids, clean_up_tokenization_spaces=True):
         """Converts a sequence of ids in a string."""
-        tokens = self.convert_ids_to_tokens(ids, skip_special_tokens=skip_special_tokens)
+        tokens = self.convert_ids_to_tokens(ids)
         out_string = ''.join(tokens)
         if clean_up_tokenization_spaces:
             out_string = out_string.strip().replace('<unk>', '')
@@ -328,18 +258,7 @@ class XLNetTokenizer(object):
             logger.error("Vocabulary path ({}) should be a directory".format(vocab_path))
             return
         out_vocab_file = os.path.join(vocab_path, VOCAB_NAME)
-        special_tokens_file = os.path.join(vocab_path, SPECIAL_TOKENS_NAME)
 
         copyfile(self.vocab_file, out_vocab_file)
 
-        index = len(self.sp_model)
-        with open(special_tokens_file, 'w', encoding='utf-8') as writer:
-            for token, token_index in sorted(self.special_tokens.items(), key=lambda kv: kv[1]):
-                if index != token_index:
-                    logger.warning("Saving special tokens vocabulary to {}: BPE indices are not consecutive."
-                                   " Please check that the tokenizer is not corrupted!".format(special_tokens_file))
-                    index = token_index
-                writer.write(token + u'\n')
-                index += 1
-
-        return out_vocab_file, special_tokens_file
+        return (out_vocab_file,)

From 6dacc79d395bd41e0ef76c2a043c2ef90cc79925 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 5 Jul 2019 15:11:59 +0200
Subject: [PATCH 06/17] fix python2 tests

---
 pytorch_transformers/tests/tokenization_tests_commons.py | 6 ++----
 pytorch_transformers/tokenization_utils.py               | 6 +++++-
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/pytorch_transformers/tests/tokenization_tests_commons.py b/pytorch_transformers/tests/tokenization_tests_commons.py
index e8f7ee7a25..876f7747be 100644
--- a/pytorch_transformers/tests/tokenization_tests_commons.py
+++ b/pytorch_transformers/tests/tokenization_tests_commons.py
@@ -12,9 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
+from __future__ import absolute_import, division, print_function, unicode_literals
 
 import os
 import sys
@@ -47,7 +45,7 @@ def create_and_check_save_and_load_tokenizer(tester, tokenizer_class, *inputs, *
 def create_and_check_pickle_tokenizer(tester, tokenizer_class, *inputs, **kwargs):
     tokenizer = tokenizer_class(*inputs, **kwargs)
 
-    text = "Munich and Berlin are nice cities"
+    text = u"Munich and Berlin are nice cities"
     filename = u"/tmp/tokenizer.bin"
 
     subwords = tokenizer.tokenize(text)
diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py
index 98a2968539..c6f08c41ae 100644
--- a/pytorch_transformers/tokenization_utils.py
+++ b/pytorch_transformers/tokenization_utils.py
@@ -101,8 +101,12 @@ class PreTrainedTokenizer(object):
             max_len = cls.max_model_input_sizes[pretrained_model_name_or_path]
             kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
 
+        # Merge resolved_vocab_files arguments in kwargs.
+        for args_name, file_path in resolved_vocab_files.items():
+            kwargs[args_name] = file_path
+
         # Instantiate tokenizer.
-        tokenizer = cls(*inputs, **resolved_vocab_files, **kwargs)
+        tokenizer = cls(*inputs, **kwargs)
 
         return tokenizer
 

From 162ba383b05e502b9fc5df4d4abb5951c020d3bc Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 5 Jul 2019 15:57:14 +0200
Subject: [PATCH 07/17] fix model loading

---
 examples/run_bert_classifier.py               |  3 ++-
 pytorch_transformers/modeling_utils.py        | 22 ++++++++++++++++++-
 .../tests/modeling_utils_test.py              |  7 ++++--
 3 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/examples/run_bert_classifier.py b/examples/run_bert_classifier.py
index 506aecc5b1..6f3d26cee1 100644
--- a/examples/run_bert_classifier.py
+++ b/examples/run_bert_classifier.py
@@ -308,7 +308,8 @@ def main():
                 input_ids, input_mask, segment_ids, label_ids = batch
 
                 # define a new function to compute loss values for both output_modes
-                logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask)
+                ouputs = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids)
+                loss = 
 
                 if output_mode == "classification":
                     loss_fct = CrossEntropyLoss()
diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py
index b72707ce08..96558704ea 100644
--- a/pytorch_transformers/modeling_utils.py
+++ b/pytorch_transformers/modeling_utils.py
@@ -193,7 +193,8 @@ class PreTrainedModel(nn.Module):
         """
         state_dict = kwargs.pop('state_dict', None)
         cache_dir = kwargs.pop('cache_dir', None)
-        from_tf = kwargs.pop('from_tf', None)
+        from_tf = kwargs.pop('from_tf', False)
+        output_loading_info = kwargs.pop('output_loading_info', False)
 
         # Load config
         config = cls.config_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
@@ -239,6 +240,21 @@ class PreTrainedModel(nn.Module):
             # Directly load from a TensorFlow checkpoint
             return cls.load_tf_weights(model, config, resolved_archive_file[:-6])  # Remove the '.index'
 
+        # Convert old format to new format if needed from a PyTorch state_dict
+        old_keys = []
+        new_keys = []
+        for key in state_dict.keys():
+            new_key = None
+            if 'gamma' in key:
+                new_key = key.replace('gamma', 'weight')
+            if 'beta' in key:
+                new_key = key.replace('beta', 'bias')
+            if new_key:
+                old_keys.append(key)
+                new_keys.append(new_key)
+        for old_key, new_key in zip(old_keys, new_keys):
+            state_dict[new_key] = state_dict.pop(old_key)
+
         # Load from a PyTorch state_dict
         missing_keys = []
         unexpected_keys = []
@@ -279,6 +295,10 @@ class PreTrainedModel(nn.Module):
         if hasattr(model, 'tie_weights'):
             model.tie_weights()  # make sure word embedding weights are still tied
 
+        if output_loading_info:
+            loading_info = {"missing_keys": missing_keys, "unexpected_keys": unexpected_keys, "error_msgs": error_msgs}
+            return model, loading_info
+
         return model
 
 
diff --git a/pytorch_transformers/tests/modeling_utils_test.py b/pytorch_transformers/tests/modeling_utils_test.py
index 1866d35353..5e3b8e676a 100644
--- a/pytorch_transformers/tests/modeling_utils_test.py
+++ b/pytorch_transformers/tests/modeling_utils_test.py
@@ -17,21 +17,24 @@ from __future__ import division
 from __future__ import print_function
 
 import unittest
+import logging
 
 from pytorch_transformers import PretrainedConfig, PreTrainedModel
 from pytorch_transformers.modeling_bert import BertModel, BertConfig, PRETRAINED_MODEL_ARCHIVE_MAP
 
-
 class ModelUtilsTest(unittest.TestCase):
     def test_model_from_pretrained(self):
+        logging.basicConfig(level=logging.INFO)
         for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
             config = BertConfig.from_pretrained(model_name)
             self.assertIsNotNone(config)
             self.assertIsInstance(config, PretrainedConfig)
 
-            model = BertModel.from_pretrained(model_name)
+            model, loading_info = BertModel.from_pretrained(model_name, output_loading_info=True)
             self.assertIsNotNone(model)
             self.assertIsInstance(model, PreTrainedModel)
+            for value in loading_info.values():
+                self.assertEqual(len(value), 0)
 
             config = BertConfig.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
             model = BertModel.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)

From 1113f97f33ca939457370c767cad345a1a949fda Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 5 Jul 2019 16:31:13 +0200
Subject: [PATCH 08/17] clean up glue example

---
 examples/run_bert_classifier.py            |  20 +-
 examples/run_glue.py                       | 401 +++++++++++++++++++++
 examples/utils_glue.py                     |   1 +
 pytorch_transformers/tokenization_utils.py |  18 +
 4 files changed, 423 insertions(+), 17 deletions(-)
 create mode 100644 examples/run_glue.py

diff --git a/examples/run_bert_classifier.py b/examples/run_bert_classifier.py
index 6f3d26cee1..27b8e6165d 100644
--- a/examples/run_bert_classifier.py
+++ b/examples/run_bert_classifier.py
@@ -309,14 +309,7 @@ def main():
 
                 # define a new function to compute loss values for both output_modes
                 ouputs = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids)
-                loss = 
-
-                if output_mode == "classification":
-                    loss_fct = CrossEntropyLoss()
-                    loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
-                elif output_mode == "regression":
-                    loss_fct = MSELoss()
-                    loss = loss_fct(logits.view(-1), label_ids.view(-1))
+                loss = ouputs[0]
 
                 if n_gpu > 1:
                     loss = loss.mean() # mean() to average on multi-gpu.
@@ -423,15 +416,8 @@ def main():
             label_ids = label_ids.to(device)
 
             with torch.no_grad():
-                logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask)
-
-            # create eval loss and other metric required by the task
-            if output_mode == "classification":
-                loss_fct = CrossEntropyLoss()
-                tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
-            elif output_mode == "regression":
-                loss_fct = MSELoss()
-                tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1))
+                outputs = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids)
+                tmp_eval_loss, logits = outputs[:2]
 
             eval_loss += tmp_eval_loss.mean().item()
             nb_eval_steps += 1
diff --git a/examples/run_glue.py b/examples/run_glue.py
new file mode 100644
index 0000000000..da1e8d8123
--- /dev/null
+++ b/examples/run_glue.py
@@ -0,0 +1,401 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""BERT finetuning runner."""
+
+from __future__ import absolute_import, division, print_function
+
+import argparse
+import logging
+import os
+import sys
+import random
+from tqdm import tqdm, trange
+
+import numpy as np
+
+import torch
+from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
+                              TensorDataset)
+from torch.utils.data.distributed import DistributedSampler
+from torch.nn import CrossEntropyLoss, MSELoss
+
+from tensorboardX import SummaryWriter
+
+from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
+from pytorch_transformers.modeling_bert import BertForSequenceClassification
+from pytorch_transformers.tokenization_bert import BertTokenizer
+from pytorch_transformers.optimization import BertAdam, WarmupLinearSchedule
+
+from utils_glue import processors, output_modes, convert_examples_to_features, compute_metrics
+
+
+logger = logging.getLogger(__name__)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+
+    ## Required parameters
+    parser.add_argument("--data_dir", default=None, type=str, required=True,
+                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
+    parser.add_argument("--bert_model", default=None, type=str, required=True,
+                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
+                        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
+                        "bert-base-multilingual-cased, bert-base-chinese.")
+    parser.add_argument("--task_name", default=None, type=str, required=True,
+                        help="The name of the task to train.")
+    parser.add_argument("--output_dir", default=None, type=str, required=True,
+                        help="The output directory where the model predictions and checkpoints will be written.")
+
+    ## Other parameters
+    parser.add_argument("--cache_dir", default="", type=str,
+                        help="Where do you want to store the pre-trained models downloaded from s3")
+    parser.add_argument("--max_seq_length", default=128, type=int,
+                        help="The maximum total input sequence length after WordPiece tokenization. \n"
+                             "Sequences longer than this will be truncated, and sequences shorter \n"
+                             "than this will be padded.")
+    parser.add_argument("--do_train", action='store_true',
+                        help="Whether to run training.")
+    parser.add_argument("--do_eval", action='store_true',
+                        help="Whether to run eval on the dev set.")
+    parser.add_argument("--do_lower_case", action='store_true',
+                        help="Set this flag if you are using an uncased model.")
+    parser.add_argument("--train_batch_size", default=32, type=int,
+                        help="Total batch size for training.")
+    parser.add_argument("--eval_batch_size", default=8, type=int,
+                        help="Total batch size for eval.")
+    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
+                        help="Number of updates steps to accumulate before performing a backward/update pass.")
+    parser.add_argument("--learning_rate", default=5e-5, type=float,
+                        help="The initial learning rate for Adam.")
+    parser.add_argument("--num_train_epochs", default=3.0, type=float,
+                        help="Total number of training epochs to perform.")
+    parser.add_argument("--warmup_proportion", default=0.1, type=float,
+                        help="Proportion of training to perform linear learning rate warmup for. "
+                             "E.g., 0.1 = 10%% of training.")
+    parser.add_argument("--no_cuda", action='store_true',
+                        help="Avoid using CUDA when available")
+    parser.add_argument('--overwrite_output_dir', action='store_true',
+                        help="Overwrite the content of the output directory")
+    parser.add_argument('--seed', type=int, default=42,
+                        help="random seed for initialization")
+
+    parser.add_argument('--fp16', action='store_true',
+                        help="Whether to use 16-bit float precision instead of 32-bit")
+    parser.add_argument('--loss_scale', type=float, default=0,
+                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
+                             "0 (default value): dynamic loss scaling.\n"
+                             "Positive power of 2: static loss scaling value.\n")
+
+    parser.add_argument("--local_rank", type=int, default=-1,
+                        help="local_rank for distributed training on gpus")
+
+    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
+    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
+    args = parser.parse_args()
+
+    # Setup distant debugging if needed
+    if args.server_ip and args.server_port:
+        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
+        import ptvsd
+        print("Waiting for debugger attach")
+        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
+        ptvsd.wait_for_attach()
+
+    # Setup CUDA, GPU & distributed training
+    if args.local_rank == -1 or args.no_cuda:
+        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+        n_gpu = torch.cuda.device_count()
+    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
+        torch.cuda.set_device(args.local_rank)
+        device = torch.device("cuda", args.local_rank)
+        torch.distributed.init_process_group(backend='nccl')
+        n_gpu = 1
+    args.device = device
+
+    # Setup logging
+    logging.basicConfig(level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
+    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
+                device, n_gpu, bool(args.local_rank != -1), args.fp16))
+
+    # Setup seeds
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+    if n_gpu > 0:
+        torch.cuda.manual_seed_all(args.seed)
+
+    # Safety checks and create output directory
+    if not args.do_train and not args.do_eval:
+        raise ValueError("At least one of `do_train` or `do_eval` must be True.")
+    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
+        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
+    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
+        os.makedirs(args.output_dir)
+
+    # Prepare GLUE task
+    task_name = args.task_name.lower()
+    if task_name not in processors:
+        raise ValueError("Task not found: %s" % (task_name))
+    processor = processors[task_name]()
+    output_mode = output_modes[task_name]
+    label_list = processor.get_labels()
+    num_labels = len(label_list)
+
+    # Load pretrained model and tokenizer
+    if args.local_rank not in [-1, 0]:
+        # Make sure only the first process in distributed training will download model & vocab
+        torch.distributed.barrier()
+
+    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
+    model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels)
+
+    if args.local_rank == 0:
+        torch.distributed.barrier()
+
+    # Distributed, parrallel and fp16 model
+    if args.fp16:
+        model.half()
+    model.to(device)
+    if args.local_rank != -1:
+        model = torch.nn.parallel.DistributedDataParallel(model,
+                                                          device_ids=[args.local_rank],
+                                                          output_device=args.local_rank,
+                                                          find_unused_parameters=True)
+    elif n_gpu > 1:
+        model = torch.nn.DataParallel(model)
+
+    global_step = 0
+    tr_loss = 0
+    if args.do_train:
+        if args.local_rank in [-1, 0]:
+            tb_writer = SummaryWriter()
+
+        # Load and cache data
+        train_examples = processor.get_train_examples(args.data_dir)
+        cached_train_features_file = os.path.join(args.data_dir, 'train_{0}_{1}_{2}'.format(
+            list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(task_name)))
+        if os.path.exists(cached_train_features_file):
+            train_features = torch.load(cached_train_features_file)
+        else:
+            train_features = convert_examples_to_features(
+                train_examples, label_list, args.max_seq_length, tokenizer, output_mode)
+            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
+                logger.info("  Saving train features into cached file %s", cached_train_features_file)
+                torch.save(train_features, cached_train_features_file)
+
+        # Convert in tensors and build dataloader
+        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
+        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
+        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
+        if output_mode == "classification":
+            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
+        elif output_mode == "regression":
+            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)
+
+        args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
+
+        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
+        train_sampler = RandomSampler(train_data) if args.local_rank == -1 else DistributedSampler(train_data)
+        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
+
+        num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
+
+        # Prepare optimizer
+        param_optimizer = list(model.named_parameters())
+        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
+        optimizer_grouped_parameters = [
+            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
+            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+            ]
+        if args.fp16:
+            try:
+                from apex.optimizers import FP16_Optimizer, FusedAdam
+            except ImportError:
+                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
+            optimizer = FusedAdam(optimizer_grouped_parameters,
+                                  lr=args.learning_rate,
+                                  bias_correction=False,
+                                  max_grad_norm=1.0)
+            if args.loss_scale == 0:
+                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
+            else:
+                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
+            warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
+                                                 t_total=num_train_optimization_steps)
+
+        else:
+            optimizer = BertAdam(optimizer_grouped_parameters,
+                                 lr=args.learning_rate,
+                                 warmup=args.warmup_proportion,
+                                 t_total=num_train_optimization_steps)
+
+        # Train!
+        logger.info("***** Running training *****")
+        logger.info("  Num examples = %d", len(train_examples))
+        logger.info("  Batch size = %d", args.train_batch_size)
+        logger.info("  Num steps = %d", num_train_optimization_steps)
+        model.train()
+        for _ in trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]):
+            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
+                batch = tuple(t.to(device) for t in batch)
+                input_ids, input_mask, segment_ids, label_ids = batch
+
+                ouputs = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids)
+                loss = ouputs[0]
+
+                if n_gpu > 1:
+                    loss = loss.mean() # mean() to average on multi-gpu parallel training
+                if args.gradient_accumulation_steps > 1:
+                    loss = loss / args.gradient_accumulation_steps
+
+                if args.fp16:
+                    optimizer.backward(loss)
+                else:
+                    loss.backward()
+
+                tr_loss += loss.item()
+                if (step + 1) % args.gradient_accumulation_steps == 0:
+                    if args.fp16:
+                        # modify learning rate with special warm up BERT uses
+                        # if args.fp16 is False, BertAdam is used that handles this automatically
+                        lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)
+                        for param_group in optimizer.param_groups:
+                            param_group['lr'] = lr_this_step
+                    optimizer.step()
+                    optimizer.zero_grad()
+                    global_step += 1
+                    if args.local_rank in [-1, 0]:
+                        if not args.fp16:
+                            tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)
+                        tb_writer.add_scalar('loss', loss.item(), global_step)
+
+    ### Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
+    ### Example:
+    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
+        # Save a trained model, configuration and tokenizer
+        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
+
+        # If we save using the predefined names, we can load using `from_pretrained`
+        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
+        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
+
+        torch.save(model_to_save.state_dict(), output_model_file)
+        model_to_save.config.to_json_file(output_config_file)
+        tokenizer.save_vocabulary(args.output_dir)
+
+        # Load a trained model and vocabulary that you have fine-tuned
+        model = BertForSequenceClassification.from_pretrained(args.output_dir)
+        tokenizer = BertTokenizer.from_pretrained(args.output_dir)
+
+        # Good practice: save your training arguments together with the trained model
+        output_args_file = os.path.join(args.output_dir, 'training_args.bin')
+        torch.save(args, output_args_file)
+    else:
+        model = BertForSequenceClassification.from_pretrained(args.bert_model)
+
+    model.to(device)
+
+    ### Evaluation
+    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
+        eval_task_names = ("mnli", "mnli-mm") if task_name == "mnli" else (task_name,)
+        eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if task_name == "mnli" else (args.output_dir,)
+        for eval_task, output_dir in zip(eval_task_names, eval_outputs_dirs):
+            if os.path.exists(output_dir) and os.listdir(output_dir) and args.do_train:
+                raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
+            if not os.path.exists(output_dir):
+                os.makedirs(output_dir)
+
+            # Load and cache data
+            processor = processors[eval_task]()
+            eval_examples = processor.get_dev_examples(args.data_dir)
+            cached_eval_features_file = os.path.join(args.data_dir, 'dev_{0}_{1}_{2}'.format(
+                list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(eval_task)))
+            if os.path.exists(cached_eval_features_file):
+                eval_features = torch.load(cached_eval_features_file)
+            else:
+                eval_features = convert_examples_to_features(
+                    eval_examples, label_list, args.max_seq_length, tokenizer, output_mode)
+                if args.local_rank == -1 or torch.distributed.get_rank() == 0:
+                    logger.info("  Saving eval features into cached file %s", cached_eval_features_file)
+                torch.save(eval_features, cached_eval_features_file)
+
+            # Convert in tensors and build dataloader
+            all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
+            all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
+            all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
+            if output_mode == "classification":
+                all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
+            elif output_mode == "regression":
+                all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float)
+
+            eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
+            # Note that DistributedSampler samples randomly
+            eval_sampler = SequentialSampler(eval_data) if args.local_rank == -1 else DistributedSampler(eval_data)
+            eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
+
+            # Eval!
+            logger.info("***** Running evaluation *****")
+            logger.info("  Num examples = %d", len(eval_examples))
+            logger.info("  Batch size = %d", args.eval_batch_size)
+            model.eval()
+            eval_loss = 0
+            nb_eval_steps = 0
+            preds = None
+            out_label_ids = None
+            for batch in tqdm(eval_dataloader, desc="Evaluating"):
+                batch = tuple(t.to(device) for t in batch)
+                input_ids, input_mask, segment_ids, label_ids = batch
+
+                with torch.no_grad():
+                    outputs = model(input_ids,
+                                    token_type_ids=segment_ids,
+                                    attention_mask=input_mask,
+                                    labels=label_ids)
+                    tmp_eval_loss, logits = outputs[:2]
+
+                eval_loss += tmp_eval_loss.mean().item()
+                nb_eval_steps += 1
+                if preds is None:
+                    preds = logits.detach().cpu().numpy()
+                    out_label_ids = label_ids.detach().cpu().numpy()
+                else:
+                    preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
+                    out_label_ids = np.append(out_label_ids, label_ids.detach().cpu().numpy(), axis=0)
+
+            eval_loss = eval_loss / nb_eval_steps
+            if output_mode == "classification":
+                preds = np.argmax(preds, axis=1)
+            elif output_mode == "regression":
+                preds = np.squeeze(preds)
+            result = compute_metrics(eval_task, preds, out_label_ids)
+
+            loss = tr_loss/global_step if args.do_train else None
+
+            result['eval_loss'] = eval_loss
+            result['global_step'] = global_step
+            result['loss'] = loss
+
+            output_eval_file = os.path.join(output_dir, "eval_results.txt")
+            with open(output_eval_file, "w") as writer:
+                logger.info("***** Eval results *****")
+                for key in sorted(result.keys()):
+                    logger.info("  %s = %s", key, str(result[key]))
+                    writer.write("%s = %s\n" % (key, str(result[key])))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/utils_glue.py b/examples/utils_glue.py
index e3e4179fae..18e733567d 100644
--- a/examples/utils_glue.py
+++ b/examples/utils_glue.py
@@ -583,6 +583,7 @@ processors = {
 output_modes = {
     "cola": "classification",
     "mnli": "classification",
+    "mnli-mm": "classification",
     "mrpc": "classification",
     "sst-2": "classification",
     "sts-b": "regression",
diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py
index c6f08c41ae..9004315657 100644
--- a/pytorch_transformers/tokenization_utils.py
+++ b/pytorch_transformers/tokenization_utils.py
@@ -110,6 +110,24 @@ class PreTrainedTokenizer(object):
 
         return tokenizer
 
+    def tokenize(self, text):
+        raise NotImplementedError
+
+    def convert_tokens_to_ids(self, tokens):
+        raise NotImplementedError
+
+    def convert_ids_to_tokens(self, ids):
+        raise NotImplementedError
+
+    def encode(self, text):
+        raise NotImplementedError
+
+    def decode(self, token_ids, *input, **kwargs):
+        raise NotImplementedError
+
+    def save_vocabulary(self, vocab_path):
+        raise NotImplementedError
+
 
 def clean_up_tokenization(out_string):
     out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ','

From 99b90edab1c144ba92ff59e50a2811936a09550c Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 5 Jul 2019 17:09:35 +0200
Subject: [PATCH 09/17] cleaning up run_glue example

---
 examples/run_glue.py | 413 +++++++++++++++++++++----------------------
 1 file changed, 204 insertions(+), 209 deletions(-)

diff --git a/examples/run_glue.py b/examples/run_glue.py
index da1e8d8123..1c2e921ef7 100644
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -20,7 +20,6 @@ from __future__ import absolute_import, division, print_function
 import argparse
 import logging
 import os
-import sys
 import random
 from tqdm import tqdm, trange
 
@@ -30,7 +29,6 @@ import torch
 from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                               TensorDataset)
 from torch.utils.data.distributed import DistributedSampler
-from torch.nn import CrossEntropyLoss, MSELoss
 
 from tensorboardX import SummaryWriter
 
@@ -45,6 +43,186 @@ from utils_glue import processors, output_modes, convert_examples_to_features, c
 logger = logging.getLogger(__name__)
 
 
+def train(args, train_features, model):
+    """ Train the model """
+    if args.local_rank in [-1, 0]:
+        tb_writer = SummaryWriter()
+
+    # Convert in tensors and build dataloader
+    all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
+    all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
+    all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
+    if args.output_mode == "classification":
+        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
+    elif args.output_mode == "regression":
+        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)
+
+    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
+
+    train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
+    train_sampler = RandomSampler(train_data) if args.local_rank == -1 else DistributedSampler(train_data)
+    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
+
+    num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
+
+    # Prepare optimizer
+    param_optimizer = list(model.named_parameters())
+    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
+    optimizer_grouped_parameters = [
+        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
+        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+        ]
+    if args.fp16:
+        try:
+            from apex.optimizers import FP16_Optimizer, FusedAdam
+        except ImportError:
+            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
+        optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0)
+        if args.loss_scale == 0:
+            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
+        else:
+            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
+        warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion, t_total=num_train_optimization_steps)
+
+    else:
+        optimizer = BertAdam(optimizer_grouped_parameters,
+                                lr=args.learning_rate,
+                                warmup=args.warmup_proportion,
+                                t_total=num_train_optimization_steps)
+
+    # Train!
+    logger.info("***** Running training *****")
+    logger.info("  Num examples = %d", len(train_features))
+    logger.info("  Batch size = %d", args.train_batch_size)
+    logger.info("  Num steps = %d", num_train_optimization_steps)
+
+    global_step = 0
+    tr_loss = 0
+    model.train()
+    for _ in trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]):
+        for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
+            batch = tuple(t.to(args.device) for t in batch)
+            input_ids, input_mask, segment_ids, label_ids = batch
+
+            ouputs = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids)
+            loss = ouputs[0]
+
+            if args.n_gpu > 1:
+                loss = loss.mean() # mean() to average on multi-gpu parallel training
+            if args.gradient_accumulation_steps > 1:
+                loss = loss / args.gradient_accumulation_steps
+
+            loss.backward() if not args.fp16 else optimizer.backward(loss)
+
+            tr_loss += loss.item()
+            if (step + 1) % args.gradient_accumulation_steps == 0:
+                if args.fp16:
+                    # modify learning rate with special warm up BERT uses
+                    # if args.fp16 is False, BertAdam is used that handles this automatically
+                    lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)
+                    for param_group in optimizer.param_groups:
+                        param_group['lr'] = lr_this_step
+                optimizer.step()
+                optimizer.zero_grad()
+                global_step += 1
+                if args.local_rank in [-1, 0]:
+                    if not args.fp16:
+                        tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)
+                    tb_writer.add_scalar('loss', loss.item(), global_step)
+
+    return global_step, tr_loss / global_step
+
+
+def evalutate(args, eval_task, eval_output_dir, eval_features, model):
+    """ Evaluate the model """
+    if os.path.exists(eval_output_dir) and os.listdir(eval_output_dir) and args.do_train and not args.overwrite_output_dir:
+        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(eval_output_dir))
+    if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
+        os.makedirs(eval_output_dir)
+
+    # Convert in tensors and build dataloader
+    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
+    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
+    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
+    if args.output_mode == "classification":
+        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
+    elif args.output_mode == "regression":
+        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float)
+
+    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
+    # Note that DistributedSampler samples randomly
+    eval_sampler = SequentialSampler(eval_data) if args.local_rank == -1 else DistributedSampler(eval_data)
+    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
+
+    # Eval!
+    logger.info("***** Running evaluation *****")
+    logger.info("  Num examples = %d", len(eval_examples))
+    logger.info("  Batch size = %d", args.eval_batch_size)
+    model.eval()
+    eval_loss = 0
+    nb_eval_steps = 0
+    preds = None
+    out_label_ids = None
+    for batch in tqdm(eval_dataloader, desc="Evaluating"):
+        batch = tuple(t.to(args.device) for t in batch)
+        input_ids, input_mask, segment_ids, label_ids = batch
+
+        with torch.no_grad():
+            outputs = model(input_ids,
+                            token_type_ids=segment_ids,
+                            attention_mask=input_mask,
+                            labels=label_ids)
+            tmp_eval_loss, logits = outputs[:2]
+
+        eval_loss += tmp_eval_loss.mean().item()
+        nb_eval_steps += 1
+        if preds is None:
+            preds = logits.detach().cpu().numpy()
+            out_label_ids = label_ids.detach().cpu().numpy()
+        else:
+            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
+            out_label_ids = np.append(out_label_ids, label_ids.detach().cpu().numpy(), axis=0)
+
+    eval_loss = eval_loss / nb_eval_steps
+    if args.output_mode == "classification":
+        preds = np.argmax(preds, axis=1)
+    elif args.output_mode == "regression":
+        preds = np.squeeze(preds)
+    result = compute_metrics(eval_task, preds, out_label_ids)
+
+    output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
+    with open(output_eval_file, "w") as writer:
+        logger.info("***** Eval results *****")
+        for key in sorted(result.keys()):
+            logger.info("  %s = %s", key, str(result[key]))
+            writer.write("%s = %s\n" % (key, str(result[key])))
+
+
+def load_and_cache_examples(args, task, tokenizer, eval=False):
+    processor = processors[task]()
+    output_mode = output_modes[task]
+    label_list = processor.get_labels()
+
+    # Load and cache data
+    processor = processors[task]()
+    examples = processor.get_dev_examples(args.data_dir)
+    cached_features_file = os.path.join(args.data_dir, '{}_{}_{}_{}'.format(
+        'dev' if eval else 'train',
+        list(filter(None, args.bert_model.split('/'))).pop(),
+        str(args.max_seq_length),
+        str(task)))
+
+    if os.path.exists(cached_features_file):
+        features = torch.load(cached_features_file)
+    else:
+        features = convert_examples_to_features(examples, label_list, args.max_seq_length, tokenizer, output_mode)
+        if args.local_rank == -1 or torch.distributed.get_rank() == 0:
+            logger.info("  Saving eval features into cached file %s", cached_features_file)
+            torch.save(features, cached_features_file)
+
+    return features
+
+
 def main():
     parser = argparse.ArgumentParser()
 
@@ -118,40 +296,32 @@ def main():
     # Setup CUDA, GPU & distributed training
     if args.local_rank == -1 or args.no_cuda:
         device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        n_gpu = torch.cuda.device_count()
+        args.n_gpu = torch.cuda.device_count()
     else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
         torch.cuda.set_device(args.local_rank)
         device = torch.device("cuda", args.local_rank)
         torch.distributed.init_process_group(backend='nccl')
-        n_gpu = 1
+        args.n_gpu = 1
     args.device = device
 
     # Setup logging
     logging.basicConfig(level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
     logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
-                device, n_gpu, bool(args.local_rank != -1), args.fp16))
+        device, args.n_gpu, bool(args.local_rank != -1), args.fp16))
 
     # Setup seeds
     random.seed(args.seed)
     np.random.seed(args.seed)
     torch.manual_seed(args.seed)
-    if n_gpu > 0:
+    if args.n_gpu > 0:
         torch.cuda.manual_seed_all(args.seed)
 
-    # Safety checks and create output directory
-    if not args.do_train and not args.do_eval:
-        raise ValueError("At least one of `do_train` or `do_eval` must be True.")
-    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
-        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
-    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
-        os.makedirs(args.output_dir)
-
     # Prepare GLUE task
-    task_name = args.task_name.lower()
-    if task_name not in processors:
-        raise ValueError("Task not found: %s" % (task_name))
-    processor = processors[task_name]()
-    output_mode = output_modes[task_name]
+    args.task_name = args.task_name.lower()
+    if args.task_name not in processors:
+        raise ValueError("Task not found: %s" % (args.task_name))
+    processor = processors[args.task_name]()
+    args.output_mode = output_modes[args.task_name]
     label_list = processor.get_labels()
     num_labels = len(label_list)
 
@@ -169,122 +339,23 @@ def main():
     # Distributed, parrallel and fp16 model
     if args.fp16:
         model.half()
-    model.to(device)
+    model.to(args.device)
     if args.local_rank != -1:
         model = torch.nn.parallel.DistributedDataParallel(model,
                                                           device_ids=[args.local_rank],
                                                           output_device=args.local_rank,
                                                           find_unused_parameters=True)
-    elif n_gpu > 1:
+    elif args.n_gpu > 1:
         model = torch.nn.DataParallel(model)
 
-    global_step = 0
-    tr_loss = 0
+    # Training
     if args.do_train:
-        if args.local_rank in [-1, 0]:
-            tb_writer = SummaryWriter()
+        train_features = load_and_cache_examples(args, args.task_name, tokenizer, eval=False)
+        global_step, tr_loss = train(args, train_features, model)
+        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
 
-        # Load and cache data
-        train_examples = processor.get_train_examples(args.data_dir)
-        cached_train_features_file = os.path.join(args.data_dir, 'train_{0}_{1}_{2}'.format(
-            list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(task_name)))
-        if os.path.exists(cached_train_features_file):
-            train_features = torch.load(cached_train_features_file)
-        else:
-            train_features = convert_examples_to_features(
-                train_examples, label_list, args.max_seq_length, tokenizer, output_mode)
-            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
-                logger.info("  Saving train features into cached file %s", cached_train_features_file)
-                torch.save(train_features, cached_train_features_file)
 
-        # Convert in tensors and build dataloader
-        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
-        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
-        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
-        if output_mode == "classification":
-            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
-        elif output_mode == "regression":
-            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)
-
-        args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
-
-        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
-        train_sampler = RandomSampler(train_data) if args.local_rank == -1 else DistributedSampler(train_data)
-        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
-
-        num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
-
-        # Prepare optimizer
-        param_optimizer = list(model.named_parameters())
-        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
-        optimizer_grouped_parameters = [
-            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
-            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-            ]
-        if args.fp16:
-            try:
-                from apex.optimizers import FP16_Optimizer, FusedAdam
-            except ImportError:
-                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-            optimizer = FusedAdam(optimizer_grouped_parameters,
-                                  lr=args.learning_rate,
-                                  bias_correction=False,
-                                  max_grad_norm=1.0)
-            if args.loss_scale == 0:
-                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
-            else:
-                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
-            warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
-                                                 t_total=num_train_optimization_steps)
-
-        else:
-            optimizer = BertAdam(optimizer_grouped_parameters,
-                                 lr=args.learning_rate,
-                                 warmup=args.warmup_proportion,
-                                 t_total=num_train_optimization_steps)
-
-        # Train!
-        logger.info("***** Running training *****")
-        logger.info("  Num examples = %d", len(train_examples))
-        logger.info("  Batch size = %d", args.train_batch_size)
-        logger.info("  Num steps = %d", num_train_optimization_steps)
-        model.train()
-        for _ in trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]):
-            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
-                batch = tuple(t.to(device) for t in batch)
-                input_ids, input_mask, segment_ids, label_ids = batch
-
-                ouputs = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids)
-                loss = ouputs[0]
-
-                if n_gpu > 1:
-                    loss = loss.mean() # mean() to average on multi-gpu parallel training
-                if args.gradient_accumulation_steps > 1:
-                    loss = loss / args.gradient_accumulation_steps
-
-                if args.fp16:
-                    optimizer.backward(loss)
-                else:
-                    loss.backward()
-
-                tr_loss += loss.item()
-                if (step + 1) % args.gradient_accumulation_steps == 0:
-                    if args.fp16:
-                        # modify learning rate with special warm up BERT uses
-                        # if args.fp16 is False, BertAdam is used that handles this automatically
-                        lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)
-                        for param_group in optimizer.param_groups:
-                            param_group['lr'] = lr_this_step
-                    optimizer.step()
-                    optimizer.zero_grad()
-                    global_step += 1
-                    if args.local_rank in [-1, 0]:
-                        if not args.fp16:
-                            tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)
-                        tb_writer.add_scalar('loss', loss.item(), global_step)
-
-    ### Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
-    ### Example:
+    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
     if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
         # Save a trained model, configuration and tokenizer
         model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
@@ -307,94 +378,18 @@ def main():
     else:
         model = BertForSequenceClassification.from_pretrained(args.bert_model)
 
-    model.to(device)
+    model.to(args.device)
 
-    ### Evaluation
+    # Evaluation
     if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
-        eval_task_names = ("mnli", "mnli-mm") if task_name == "mnli" else (task_name,)
-        eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if task_name == "mnli" else (args.output_dir,)
-        for eval_task, output_dir in zip(eval_task_names, eval_outputs_dirs):
-            if os.path.exists(output_dir) and os.listdir(output_dir) and args.do_train:
-                raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
-            if not os.path.exists(output_dir):
-                os.makedirs(output_dir)
+        # Handle MNLI double evaluation
+        eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
+        eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,)
 
-            # Load and cache data
-            processor = processors[eval_task]()
-            eval_examples = processor.get_dev_examples(args.data_dir)
-            cached_eval_features_file = os.path.join(args.data_dir, 'dev_{0}_{1}_{2}'.format(
-                list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(eval_task)))
-            if os.path.exists(cached_eval_features_file):
-                eval_features = torch.load(cached_eval_features_file)
-            else:
-                eval_features = convert_examples_to_features(
-                    eval_examples, label_list, args.max_seq_length, tokenizer, output_mode)
-                if args.local_rank == -1 or torch.distributed.get_rank() == 0:
-                    logger.info("  Saving eval features into cached file %s", cached_eval_features_file)
-                torch.save(eval_features, cached_eval_features_file)
+        for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
+            eval_features = load_and_cache_examples(args, eval_task, tokenizer, eval=True)
 
-            # Convert in tensors and build dataloader
-            all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
-            all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
-            all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
-            if output_mode == "classification":
-                all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
-            elif output_mode == "regression":
-                all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float)
-
-            eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
-            # Note that DistributedSampler samples randomly
-            eval_sampler = SequentialSampler(eval_data) if args.local_rank == -1 else DistributedSampler(eval_data)
-            eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
-
-            # Eval!
-            logger.info("***** Running evaluation *****")
-            logger.info("  Num examples = %d", len(eval_examples))
-            logger.info("  Batch size = %d", args.eval_batch_size)
-            model.eval()
-            eval_loss = 0
-            nb_eval_steps = 0
-            preds = None
-            out_label_ids = None
-            for batch in tqdm(eval_dataloader, desc="Evaluating"):
-                batch = tuple(t.to(device) for t in batch)
-                input_ids, input_mask, segment_ids, label_ids = batch
-
-                with torch.no_grad():
-                    outputs = model(input_ids,
-                                    token_type_ids=segment_ids,
-                                    attention_mask=input_mask,
-                                    labels=label_ids)
-                    tmp_eval_loss, logits = outputs[:2]
-
-                eval_loss += tmp_eval_loss.mean().item()
-                nb_eval_steps += 1
-                if preds is None:
-                    preds = logits.detach().cpu().numpy()
-                    out_label_ids = label_ids.detach().cpu().numpy()
-                else:
-                    preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
-                    out_label_ids = np.append(out_label_ids, label_ids.detach().cpu().numpy(), axis=0)
-
-            eval_loss = eval_loss / nb_eval_steps
-            if output_mode == "classification":
-                preds = np.argmax(preds, axis=1)
-            elif output_mode == "regression":
-                preds = np.squeeze(preds)
-            result = compute_metrics(eval_task, preds, out_label_ids)
-
-            loss = tr_loss/global_step if args.do_train else None
-
-            result['eval_loss'] = eval_loss
-            result['global_step'] = global_step
-            result['loss'] = loss
-
-            output_eval_file = os.path.join(output_dir, "eval_results.txt")
-            with open(output_eval_file, "w") as writer:
-                logger.info("***** Eval results *****")
-                for key in sorted(result.keys()):
-                    logger.info("  %s = %s", key, str(result[key]))
-                    writer.write("%s = %s\n" % (key, str(result[key])))
+            evalutate(args, eval_task, eval_output_dir, eval_features, model)
 
 
 if __name__ == "__main__":

From 3d5f2913864de28a57a339c4c0c9f7b6000a7d03 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 5 Jul 2019 17:22:15 +0200
Subject: [PATCH 10/17] updates to run_glue

---
 examples/run_glue.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/run_glue.py b/examples/run_glue.py
index 1c2e921ef7..8dd845a553 100644
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -213,11 +213,12 @@ def load_and_cache_examples(args, task, tokenizer, eval=False):
         str(task)))
 
     if os.path.exists(cached_features_file):
+        logger.info("Loading features from cached file %s", cached_features_file)
         features = torch.load(cached_features_file)
     else:
         features = convert_examples_to_features(examples, label_list, args.max_seq_length, tokenizer, output_mode)
         if args.local_rank == -1 or torch.distributed.get_rank() == 0:
-            logger.info("  Saving eval features into cached file %s", cached_features_file)
+            logger.info("Saving features into cached file %s", cached_features_file)
             torch.save(features, cached_features_file)
 
     return features

From b19786985d2dde9f91e20f5ce01f78a0cf7b6d0c Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 9 Jul 2019 10:25:18 +0200
Subject: [PATCH 11/17] unified tokenizer api and serialization + tests

---
 examples/run_glue.py                          |  78 ++--
 examples/run_xlnet_classifier.py              |   8 +-
 examples/utils_glue.py                        |   5 +-
 pytorch_transformers/__init__.py              |  18 +-
 ...onvert_transfo_xl_checkpoint_to_pytorch.py |   5 +-
 .../convert_xlm_checkpoint_to_pytorch.py      |   4 +-
 pytorch_transformers/modeling_bert.py         |   8 +-
 pytorch_transformers/modeling_gpt2.py         |   8 +-
 pytorch_transformers/modeling_openai.py       |   8 +-
 pytorch_transformers/modeling_transfo_xl.py   |   8 +-
 pytorch_transformers/modeling_utils.py        |  16 +
 pytorch_transformers/modeling_xlm.py          |   8 +-
 pytorch_transformers/modeling_xlnet.py        |   8 +-
 .../tests/modeling_bert_test.py               |   4 +-
 .../tests/modeling_tests_commons.py           |   2 +-
 .../tests/modeling_transfo_xl_test.py         |   4 +-
 .../tests/modeling_utils_test.py              |   4 +-
 .../tests/modeling_xlm_test.py                |   4 +-
 .../tests/modeling_xlnet_test.py              |   4 +-
 .../tests/tokenization_bert_test.py           |  20 +-
 .../tests/tokenization_gpt2_test.py           |  41 +-
 .../tests/tokenization_openai_test.py         |  43 +-
 .../tests/tokenization_tests_commons.py       |  63 ++-
 .../tests/tokenization_transfo_xl_test.py     |  28 +-
 .../tests/tokenization_utils_test.py          |  10 +
 .../tests/tokenization_xlm_test.py            |  43 +-
 .../tests/tokenization_xlnet_test.py          |  54 +--
 pytorch_transformers/tokenization_bert.py     | 141 ++-----
 pytorch_transformers/tokenization_gpt2.py     | 113 ++----
 pytorch_transformers/tokenization_openai.py   | 125 ++----
 .../tokenization_transfo_xl.py                |  51 +--
 pytorch_transformers/tokenization_utils.py    | 383 ++++++++++++++++--
 pytorch_transformers/tokenization_xlm.py      | 132 ++----
 pytorch_transformers/tokenization_xlnet.py    | 128 ++----
 34 files changed, 824 insertions(+), 755 deletions(-)

diff --git a/examples/run_glue.py b/examples/run_glue.py
index 8dd845a553..59583ed712 100644
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -32,9 +32,11 @@ from torch.utils.data.distributed import DistributedSampler
 
 from tensorboardX import SummaryWriter
 
-from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
-from pytorch_transformers.modeling_bert import BertForSequenceClassification
-from pytorch_transformers.tokenization_bert import BertTokenizer
+from pytorch_transformers import (BertForSequenceClassification, XLNetForSequenceClassification,
+                                  XLMForSequenceClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+                                  XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, XLM_PRETRAINED_MODEL_ARCHIVE_MAP)
+from pytorch_transformers import (BertTokenizer, XLNetTokenizer,
+                                  XLMTokenizer)
 from pytorch_transformers.optimization import BertAdam, WarmupLinearSchedule
 
 from utils_glue import processors, output_modes, convert_examples_to_features, compute_metrics
@@ -42,6 +44,21 @@ from utils_glue import processors, output_modes, convert_examples_to_features, c
 
 logger = logging.getLogger(__name__)
 
+ALL_MODELS = sum((tuple(m.keys()) for m in (BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+                                            XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
+                                            XLM_PRETRAINED_MODEL_ARCHIVE_MAP)), ())
+
+MODEL_CLASSES = {
+    'bert': BertForSequenceClassification,
+    'xlnet': XLNetForSequenceClassification,
+    'xlm': XLMForSequenceClassification,
+}
+
+TOKENIZER_CLASSES = {
+    'bert': BertTokenizer,
+    'xlnet': XLNetTokenizer,
+    'xlm': XLMTokenizer,
+}
 
 def train(args, train_features, model):
     """ Train the model """
@@ -156,7 +173,7 @@ def evalutate(args, eval_task, eval_output_dir, eval_features, model):
 
     # Eval!
     logger.info("***** Running evaluation *****")
-    logger.info("  Num examples = %d", len(eval_examples))
+    logger.info("  Num examples = %d", len(eval_features))
     logger.info("  Batch size = %d", args.eval_batch_size)
     model.eval()
     eval_loss = 0
@@ -208,7 +225,7 @@ def load_and_cache_examples(args, task, tokenizer, eval=False):
     examples = processor.get_dev_examples(args.data_dir)
     cached_features_file = os.path.join(args.data_dir, '{}_{}_{}_{}'.format(
         'dev' if eval else 'train',
-        list(filter(None, args.bert_model.split('/'))).pop(),
+        list(filter(None, args.model_name.split('/'))).pop(),
         str(args.max_seq_length),
         str(task)))
 
@@ -217,6 +234,11 @@ def load_and_cache_examples(args, task, tokenizer, eval=False):
         features = torch.load(cached_features_file)
     else:
         features = convert_examples_to_features(examples, label_list, args.max_seq_length, tokenizer, output_mode)
+        features = convert_examples_to_features(examples, label_list, args.max_seq_length, tokenizer, output_mode,
+            cls_token_at_end=bool(args.model_type not in ['bert', 'xlm']),
+            cls_token=tokenizer.cls_token,
+            sep_token=tokenizer.sep_token, cls_token_segment_id=2,
+            pad_on_left=True, pad_token_segment_id=4)
         if args.local_rank == -1 or torch.distributed.get_rank() == 0:
             logger.info("Saving features into cached file %s", cached_features_file)
             torch.save(features, cached_features_file)
@@ -230,12 +252,10 @@ def main():
     ## Required parameters
     parser.add_argument("--data_dir", default=None, type=str, required=True,
                         help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
-    parser.add_argument("--bert_model", default=None, type=str, required=True,
-                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
-                        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
-                        "bert-base-multilingual-cased, bert-base-chinese.")
+    parser.add_argument("--model_name", default=None, type=str, required=True,
+                        help="Bert/XLNet/XLM pre-trained model selected in the list: " + ", ".join(ALL_MODELS))
     parser.add_argument("--task_name", default=None, type=str, required=True,
-                        help="The name of the task to train.")
+                        help="The name of the task to train selected in the list: " + ", ".join(processors.keys()))
     parser.add_argument("--output_dir", default=None, type=str, required=True,
                         help="The output directory where the model predictions and checkpoints will be written.")
 
@@ -243,9 +263,8 @@ def main():
     parser.add_argument("--cache_dir", default="", type=str,
                         help="Where do you want to store the pre-trained models downloaded from s3")
     parser.add_argument("--max_seq_length", default=128, type=int,
-                        help="The maximum total input sequence length after WordPiece tokenization. \n"
-                             "Sequences longer than this will be truncated, and sequences shorter \n"
-                             "than this will be padded.")
+                        help="The maximum total input sequence length after tokenization. Sequences longer "
+                             "than this will be truncated, sequences shorter will be padded.")
     parser.add_argument("--do_train", action='store_true',
                         help="Whether to run training.")
     parser.add_argument("--do_eval", action='store_true',
@@ -263,8 +282,7 @@ def main():
     parser.add_argument("--num_train_epochs", default=3.0, type=float,
                         help="Total number of training epochs to perform.")
     parser.add_argument("--warmup_proportion", default=0.1, type=float,
-                        help="Proportion of training to perform linear learning rate warmup for. "
-                             "E.g., 0.1 = 10%% of training.")
+                        help="Proportion of training with linear learning rate warmup (0.1 = 10%% of training).")
     parser.add_argument("--no_cuda", action='store_true',
                         help="Avoid using CUDA when available")
     parser.add_argument('--overwrite_output_dir', action='store_true',
@@ -331,8 +349,11 @@ def main():
         # Make sure only the first process in distributed training will download model & vocab
         torch.distributed.barrier()
 
-    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
-    model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels)
+    args.model_type = args.model_name.lower().split('-')[0]
+    args.tokenizer_class = TOKENIZER_CLASSES[args.model_type]
+    args.model_class = MODEL_CLASSES[args.model_type]
+    tokenizer = args.tokenizer_class.from_pretrained(args.model_name, do_lower_case=args.do_lower_case)
+    model = args.model_class.from_pretrained(args.model_name, num_labels=num_labels)
 
     if args.local_rank == 0:
         torch.distributed.barrier()
@@ -359,27 +380,16 @@ def main():
     # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
     if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
         # Save a trained model, configuration and tokenizer
-        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
-
-        # If we save using the predefined names, we can load using `from_pretrained`
-        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
-        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
-
-        torch.save(model_to_save.state_dict(), output_model_file)
-        model_to_save.config.to_json_file(output_config_file)
+        model.save_pretrained(args.output_dir)
         tokenizer.save_vocabulary(args.output_dir)
 
-        # Load a trained model and vocabulary that you have fine-tuned
-        model = BertForSequenceClassification.from_pretrained(args.output_dir)
-        tokenizer = BertTokenizer.from_pretrained(args.output_dir)
-
         # Good practice: save your training arguments together with the trained model
-        output_args_file = os.path.join(args.output_dir, 'training_args.bin')
-        torch.save(args, output_args_file)
-    else:
-        model = BertForSequenceClassification.from_pretrained(args.bert_model)
+        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
 
-    model.to(args.device)
+        # Load a trained model and vocabulary that you have fine-tuned
+        model = args.model_class.from_pretrained(args.output_dir)
+        tokenizer = args.tokenizer_class.from_pretrained(args.output_dir)
+        model.to(args.device)
 
     # Evaluation
     if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
diff --git a/examples/run_xlnet_classifier.py b/examples/run_xlnet_classifier.py
index 7cf8a8d877..35b0ebfbd1 100644
--- a/examples/run_xlnet_classifier.py
+++ b/examples/run_xlnet_classifier.py
@@ -211,8 +211,8 @@ def main():
             logger.info("No cache file at %s, preparing train features", cached_train_features_file)
             train_features = convert_examples_to_features(
                 train_examples, label_list, args.max_seq_length, tokenizer, output_mode,
-                cls_token_at_end=True, cls_token=tokenizer.CLS_TOKEN,
-                sep_token=tokenizer.SEP_TOKEN, cls_token_segment_id=2,
+                cls_token_at_end=True, cls_token=tokenizer.cls_token,
+                sep_token=tokenizer.sep_token, cls_token_segment_id=2,
                 pad_on_left=True, pad_token_segment_id=4)
             if args.local_rank == -1 or torch.distributed.get_rank() == 0:
                 logger.info("  Saving train features into cached file %s", cached_train_features_file)
@@ -369,8 +369,8 @@ def main():
             logger.info("No cache file at %s, preparing eval features", cached_eval_features_file)
             eval_features = convert_examples_to_features(
                 eval_examples, label_list, args.max_seq_length, tokenizer, output_mode,
-                cls_token_at_end=True, cls_token=tokenizer.CLS_TOKEN,
-                sep_token=tokenizer.SEP_TOKEN, cls_token_segment_id=2,
+                cls_token_at_end=True, cls_token=tokenizer.cls_token,
+                sep_token=tokenizer.sep_token, cls_token_segment_id=2,
                 pad_on_left=True, pad_token_segment_id=4)
             if args.local_rank == -1 or torch.distributed.get_rank() == 0:
                 logger.info("  Saving eval features into cached file %s", cached_eval_features_file)
diff --git a/examples/utils_glue.py b/examples/utils_glue.py
index 18e733567d..4750592957 100644
--- a/examples/utils_glue.py
+++ b/examples/utils_glue.py
@@ -396,7 +396,7 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
                                  mask_padding_with_zero=True):
     """ Loads a data file into a list of `InputBatch`s
         `cls_token_at_end` define the location of the CLS token:
-            - False (BERT pattern): [CLS] + A + [SEP] + B + [SEP]
+            - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
             - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
         `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)
     """
@@ -489,8 +489,7 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
                     [str(x) for x in tokens]))
             logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
             logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
-            logger.info(
-                    "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
+            logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
             logger.info("label: %s (id = %d)" % (example.label, label_id))
 
         features.append(
diff --git a/pytorch_transformers/__init__.py b/pytorch_transformers/__init__.py
index 6dd78dfd02..c8f64a07de 100644
--- a/pytorch_transformers/__init__.py
+++ b/pytorch_transformers/__init__.py
@@ -11,22 +11,28 @@ from .modeling_bert import (BertConfig, BertModel, BertForPreTraining,
                        BertForMaskedLM, BertForNextSentencePrediction,
                        BertForSequenceClassification, BertForMultipleChoice,
                        BertForTokenClassification, BertForQuestionAnswering,
-                       load_tf_weights_in_bert)
+                       load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+                       BERT_PRETRAINED_CONFIG_ARCHIVE_MAP)
 from .modeling_openai import (OpenAIGPTConfig, OpenAIGPTModel,
                               OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel,
-                              load_tf_weights_in_openai_gpt)
+                              load_tf_weights_in_openai_gpt, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+                              OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP)
 from .modeling_transfo_xl import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel,
-                                  load_tf_weights_in_transfo_xl)
+                                  load_tf_weights_in_transfo_xl, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP,
+                                  TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP)
 from .modeling_gpt2 import (GPT2Config, GPT2Model,
                             GPT2LMHeadModel, GPT2DoubleHeadsModel,
-                            load_tf_weights_in_gpt2)
+                            load_tf_weights_in_gpt2, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,
+                            GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
 from .modeling_xlnet import (XLNetConfig,
                              XLNetPreTrainedModel, XLNetModel, XLNetLMHeadModel,
                              XLNetForSequenceClassification, XLNetForQuestionAnswering,
-                             load_tf_weights_in_xlnet)
+                             load_tf_weights_in_xlnet, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
+                             XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
 from .modeling_xlm import (XLMConfig, XLMModel,
                            XLMWithLMHeadModel, XLMForSequenceClassification,
-                           XLMForQuestionAnswering)
+                           XLMForQuestionAnswering, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP,
+                           XLM_PRETRAINED_MODEL_ARCHIVE_MAP)
 from .modeling_utils import (WEIGHTS_NAME, CONFIG_NAME, TF_WEIGHTS_NAME,
                           PretrainedConfig, PreTrainedModel, prune_layer, Conv1D)
 
diff --git a/pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py b/pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py
index 2d666a1f03..db23e5bffe 100755
--- a/pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py
+++ b/pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py
@@ -29,8 +29,7 @@ from pytorch_transformers.modeling_transfo_xl import (CONFIG_NAME,
                                                          TransfoXLConfig,
                                                          TransfoXLLMHeadModel,
                                                          load_tf_weights_in_transfo_xl)
-from pytorch_transformers.tokenization_transfo_xl import (CORPUS_NAME,
-                                                             VOCAB_NAME)
+from pytorch_transformers.tokenization_transfo_xl import (CORPUS_NAME, VOCAB_FILES_NAMES)
 
 if sys.version_info[0] == 2:
     import cPickle as pickle
@@ -53,7 +52,7 @@ def convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path,
         with open(transfo_xl_dataset_file, "rb") as fp:
             corpus = pickle.load(fp, encoding="latin1")
         # Save vocabulary and dataset cache as Dictionaries (should be better than pickles for the long-term)
-        pytorch_vocab_dump_path = pytorch_dump_folder_path + '/' + VOCAB_NAME
+        pytorch_vocab_dump_path = pytorch_dump_folder_path + '/' + VOCAB_FILES_NAMES['pretrained_vocab_file']
         print("Save vocabulary to {}".format(pytorch_vocab_dump_path))
         corpus_vocab_dict = corpus.vocab.__dict__
         torch.save(corpus_vocab_dict, pytorch_vocab_dump_path)
diff --git a/pytorch_transformers/convert_xlm_checkpoint_to_pytorch.py b/pytorch_transformers/convert_xlm_checkpoint_to_pytorch.py
index 0cbe962cea..e5815252f1 100755
--- a/pytorch_transformers/convert_xlm_checkpoint_to_pytorch.py
+++ b/pytorch_transformers/convert_xlm_checkpoint_to_pytorch.py
@@ -24,7 +24,7 @@ import torch
 import numpy
 
 from pytorch_transformers.modeling_xlm import (CONFIG_NAME, WEIGHTS_NAME, XLMConfig, XLMModel)
-from pytorch_transformers.tokenization_xlm import MERGES_NAME, VOCAB_NAME
+from pytorch_transformers.tokenization_xlm import VOCAB_FILES_NAMES
 
 
 def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_path):
@@ -42,7 +42,7 @@ def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_p
     # Save pytorch-model
     pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME
     pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME
-    pytorch_vocab_dump_path = pytorch_dump_folder_path + '/' + VOCAB_NAME
+    pytorch_vocab_dump_path = pytorch_dump_folder_path + '/' +  VOCAB_FILES_NAMES['vocab_file']
 
     print("Save PyTorch model to {}".format(pytorch_weights_dump_path))
     torch.save(model, pytorch_weights_dump_path)
diff --git a/pytorch_transformers/modeling_bert.py b/pytorch_transformers/modeling_bert.py
index b2a456209d..0dd72b2969 100644
--- a/pytorch_transformers/modeling_bert.py
+++ b/pytorch_transformers/modeling_bert.py
@@ -33,7 +33,7 @@ from .modeling_utils import WEIGHTS_NAME, CONFIG_NAME, PretrainedConfig, PreTrai
 
 logger = logging.getLogger(__name__)
 
-PRETRAINED_MODEL_ARCHIVE_MAP = {
+BERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
     'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-pytorch_model.bin",
     'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-pytorch_model.bin",
     'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-pytorch_model.bin",
@@ -49,7 +49,7 @@ PRETRAINED_MODEL_ARCHIVE_MAP = {
     'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin",
 }
 
-PRETRAINED_CONFIG_ARCHIVE_MAP = {
+BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
     'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json",
     'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json",
     'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json",
@@ -152,7 +152,7 @@ ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish}
 class BertConfig(PretrainedConfig):
     """Configuration class to store the configuration of a `BertModel`.
     """
-    pretrained_config_archive_map = PRETRAINED_CONFIG_ARCHIVE_MAP
+    pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
 
     def __init__(self,
                  vocab_size_or_config_json_file=30522,
@@ -543,7 +543,7 @@ class BertPreTrainedModel(PreTrainedModel):
         a simple interface for dowloading and loading pretrained models.
     """
     config_class = BertConfig
-    pretrained_model_archive_map = PRETRAINED_MODEL_ARCHIVE_MAP
+    pretrained_model_archive_map = BERT_PRETRAINED_MODEL_ARCHIVE_MAP
     load_tf_weights = load_tf_weights_in_bert
     base_model_prefix = "bert"
 
diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py
index 090763cda1..9340ce8489 100644
--- a/pytorch_transformers/modeling_gpt2.py
+++ b/pytorch_transformers/modeling_gpt2.py
@@ -37,9 +37,9 @@ from .modeling_bert import BertLayerNorm as LayerNorm
 
 logger = logging.getLogger(__name__)
 
-PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin",
+GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin",
                                 "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-pytorch_model.bin"}
-PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json",
+GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json",
                                  "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json"}
 
 def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
@@ -103,7 +103,7 @@ def gelu(x):
 class GPT2Config(PretrainedConfig):
     """Configuration class to store the configuration of a `GPT2Model`.
     """
-    pretrained_config_archive_map = PRETRAINED_CONFIG_ARCHIVE_MAP
+    pretrained_config_archive_map = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
 
     def __init__(
         self,
@@ -358,7 +358,7 @@ class GPT2PreTrainedModel(PreTrainedModel):
         a simple interface for dowloading and loading pretrained models.
     """
     config_class = GPT2Config
-    pretrained_model_archive_map = PRETRAINED_MODEL_ARCHIVE_MAP
+    pretrained_model_archive_map = GPT2_PRETRAINED_MODEL_ARCHIVE_MAP
     load_tf_weights = load_tf_weights_in_gpt2
     base_model_prefix = "transformer"
 
diff --git a/pytorch_transformers/modeling_openai.py b/pytorch_transformers/modeling_openai.py
index b715b18371..4a3ff732f6 100644
--- a/pytorch_transformers/modeling_openai.py
+++ b/pytorch_transformers/modeling_openai.py
@@ -37,8 +37,8 @@ from .modeling_bert import BertLayerNorm as LayerNorm
 
 logger = logging.getLogger(__name__)
 
-PRETRAINED_MODEL_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-pytorch_model.bin"}
-PRETRAINED_CONFIG_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-config.json"}
+OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-pytorch_model.bin"}
+OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-config.json"}
 
 
 def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path):
@@ -130,7 +130,7 @@ ACT_FNS = {"relu": nn.ReLU, "swish": swish, "gelu": gelu}
 class OpenAIGPTConfig(PretrainedConfig):
     """Configuration class to store the configuration of a `OpenAIGPTModel`.
     """
-    pretrained_config_archive_map = PRETRAINED_CONFIG_ARCHIVE_MAP
+    pretrained_config_archive_map = OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP
 
     def __init__(
         self,
@@ -384,7 +384,7 @@ class OpenAIGPTPreTrainedModel(PreTrainedModel):
         a simple interface for dowloading and loading pretrained models.
     """
     config_class = OpenAIGPTConfig
-    pretrained_model_archive_map = PRETRAINED_MODEL_ARCHIVE_MAP
+    pretrained_model_archive_map = OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP
     load_tf_weights = load_tf_weights_in_openai_gpt
     base_model_prefix = "transformer"
 
diff --git a/pytorch_transformers/modeling_transfo_xl.py b/pytorch_transformers/modeling_transfo_xl.py
index 465577b002..35a1b635f9 100644
--- a/pytorch_transformers/modeling_transfo_xl.py
+++ b/pytorch_transformers/modeling_transfo_xl.py
@@ -41,10 +41,10 @@ from .modeling_utils import CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTrai
 
 logger = logging.getLogger(__name__)
 
-PRETRAINED_MODEL_ARCHIVE_MAP = {
+TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP = {
     'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-pytorch_model.bin",
 }
-PRETRAINED_CONFIG_ARCHIVE_MAP = {
+TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP = {
     'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-config.json",
 }
 
@@ -179,7 +179,7 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path):
 class TransfoXLConfig(PretrainedConfig):
     """Configuration class to store the configuration of a `TransfoXLModel`.
     """
-    pretrained_config_archive_map = PRETRAINED_CONFIG_ARCHIVE_MAP
+    pretrained_config_archive_map = TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
 
     def __init__(self,
                  vocab_size_or_config_json_file=267735,
@@ -838,7 +838,7 @@ class TransfoXLPreTrainedModel(PreTrainedModel):
         a simple interface for dowloading and loading pretrained models.
     """
     config_class = TransfoXLConfig
-    pretrained_model_archive_map = PRETRAINED_MODEL_ARCHIVE_MAP
+    pretrained_model_archive_map = TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
     load_tf_weights = load_tf_weights_in_transfo_xl
     base_model_prefix = "transformer"
 
diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py
index 96558704ea..b9be1a3813 100644
--- a/pytorch_transformers/modeling_utils.py
+++ b/pytorch_transformers/modeling_utils.py
@@ -169,6 +169,22 @@ class PreTrainedModel(nn.Module):
         model_to_prune = getattr(self, self.base_model_prefix, self)  # get the base model if needed
         model_to_prune._prune_heads(heads_to_prune)
 
+    def save_pretrained(self, save_directory):
+        """ Save a model with its configuration file to a directory, so that it
+            can be re-loaded using the `from_pretrained(save_directory)` class method.
+        """
+        assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved"
+
+        # Only save the model it-self if we are using distributed training
+        model_to_save = self.module if hasattr(self, 'module') else self
+
+        # If we save using the predefined names, we can load using `from_pretrained`
+        output_model_file = os.path.join(save_directory, WEIGHTS_NAME)
+        output_config_file = os.path.join(save_directory, CONFIG_NAME)
+
+        torch.save(model_to_save.state_dict(), output_model_file)
+        model_to_save.config.to_json_file(output_config_file)
+
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
         """
diff --git a/pytorch_transformers/modeling_xlm.py b/pytorch_transformers/modeling_xlm.py
index 14f8848a42..c7ea294dbd 100644
--- a/pytorch_transformers/modeling_xlm.py
+++ b/pytorch_transformers/modeling_xlm.py
@@ -40,10 +40,10 @@ from .modeling_utils import (CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTra
 
 logger = logging.getLogger(__name__)
 
-PRETRAINED_MODEL_ARCHIVE_MAP = {
+XLM_PRETRAINED_MODEL_ARCHIVE_MAP = {
     'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-pytorch_model.bin",
 }
-PRETRAINED_CONFIG_ARCHIVE_MAP = {
+XLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
     'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-config.json",
 }
 
@@ -51,7 +51,7 @@ PRETRAINED_CONFIG_ARCHIVE_MAP = {
 class XLMConfig(PretrainedConfig):
     """Configuration class to store the configuration of a `XLMModel`.
     """
-    pretrained_config_archive_map = PRETRAINED_CONFIG_ARCHIVE_MAP
+    pretrained_config_archive_map = XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
 
     def __init__(self,
                  vocab_size_or_config_json_file=30145,
@@ -357,7 +357,7 @@ class XLMPreTrainedModel(PreTrainedModel):
         a simple interface for dowloading and loading pretrained models.
     """
     config_class = XLMConfig
-    pretrained_model_archive_map = PRETRAINED_MODEL_ARCHIVE_MAP
+    pretrained_model_archive_map = XLM_PRETRAINED_MODEL_ARCHIVE_MAP
     load_tf_weights = None
     base_model_prefix = "transformer"
 
diff --git a/pytorch_transformers/modeling_xlnet.py b/pytorch_transformers/modeling_xlnet.py
index 289dcbd9db..628dbe7450 100644
--- a/pytorch_transformers/modeling_xlnet.py
+++ b/pytorch_transformers/modeling_xlnet.py
@@ -38,10 +38,10 @@ from .modeling_utils import (CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTra
 
 logger = logging.getLogger(__name__)
 
-PRETRAINED_MODEL_ARCHIVE_MAP = {
+XLNET_PRETRAINED_MODEL_ARCHIVE_MAP = {
     'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-pytorch_model.bin",
 }
-PRETRAINED_CONFIG_ARCHIVE_MAP = {
+XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
     'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-config.json",
 }
 
@@ -195,7 +195,7 @@ ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish}
 class XLNetConfig(PretrainedConfig):
     """Configuration class to store the configuration of a `XLNetModel`.
     """
-    pretrained_config_archive_map = PRETRAINED_CONFIG_ARCHIVE_MAP
+    pretrained_config_archive_map = XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP
 
     def __init__(self,
                  vocab_size_or_config_json_file=32000,
@@ -593,7 +593,7 @@ class XLNetPreTrainedModel(PreTrainedModel):
         a simple interface for dowloading and loading pretrained models.
     """
     config_class = XLNetConfig
-    pretrained_model_archive_map = PRETRAINED_MODEL_ARCHIVE_MAP
+    pretrained_model_archive_map = XLNET_PRETRAINED_MODEL_ARCHIVE_MAP
     load_tf_weights = load_tf_weights_in_xlnet
     base_model_prefix = "transformer"
 
diff --git a/pytorch_transformers/tests/modeling_bert_test.py b/pytorch_transformers/tests/modeling_bert_test.py
index 2ba59317be..fbdce29366 100644
--- a/pytorch_transformers/tests/modeling_bert_test.py
+++ b/pytorch_transformers/tests/modeling_bert_test.py
@@ -24,7 +24,7 @@ from pytorch_transformers import (BertConfig, BertModel, BertForMaskedLM,
                                      BertForNextSentencePrediction, BertForPreTraining,
                                      BertForQuestionAnswering, BertForSequenceClassification,
                                      BertForTokenClassification, BertForMultipleChoice)
-from pytorch_transformers.modeling_bert import PRETRAINED_MODEL_ARCHIVE_MAP
+from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
 
 from .modeling_tests_commons import (create_and_check_commons, ConfigTester, ids_tensor)
 
@@ -267,7 +267,7 @@ class BertModelTest(unittest.TestCase):
     @pytest.mark.slow
     def test_model_from_pretrained(self):
         cache_dir = "/tmp/pytorch_transformers_test/"
-        for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+        for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
             model = BertModel.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
             self.assertIsNotNone(model)
diff --git a/pytorch_transformers/tests/modeling_tests_commons.py b/pytorch_transformers/tests/modeling_tests_commons.py
index b831f85552..db79b017c1 100644
--- a/pytorch_transformers/tests/modeling_tests_commons.py
+++ b/pytorch_transformers/tests/modeling_tests_commons.py
@@ -413,7 +413,7 @@ class GPTModelTester(object):
 
     def create_and_check_model_from_pretrained(self):
         cache_dir = "/tmp/pytorch_transformers_test/"
-        for model_name in list(self.base_model_class.PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+        for model_name in list(self.base_model_class.pretrained_model_archive_map.keys())[:1]:
             model = self.base_model_class.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
             self.parent.assertIsNotNone(model)
diff --git a/pytorch_transformers/tests/modeling_transfo_xl_test.py b/pytorch_transformers/tests/modeling_transfo_xl_test.py
index f2906d879f..49ba1addf1 100644
--- a/pytorch_transformers/tests/modeling_transfo_xl_test.py
+++ b/pytorch_transformers/tests/modeling_transfo_xl_test.py
@@ -26,7 +26,7 @@ import pytest
 import torch
 
 from pytorch_transformers import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel)
-from pytorch_transformers.modeling_transfo_xl import PRETRAINED_MODEL_ARCHIVE_MAP
+from pytorch_transformers.modeling_transfo_xl import TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
 
 from .modeling_tests_commons import ConfigTester, create_and_check_commons, ids_tensor
 
@@ -185,7 +185,7 @@ class TransfoXLModelTest(unittest.TestCase):
     @pytest.mark.slow
     def test_model_from_pretrained(self):
         cache_dir = "/tmp/pytorch_transformers_test/"
-        for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+        for model_name in list(TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
             model = TransfoXLModel.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
             self.assertIsNotNone(model)
diff --git a/pytorch_transformers/tests/modeling_utils_test.py b/pytorch_transformers/tests/modeling_utils_test.py
index 5e3b8e676a..a168c24611 100644
--- a/pytorch_transformers/tests/modeling_utils_test.py
+++ b/pytorch_transformers/tests/modeling_utils_test.py
@@ -20,12 +20,12 @@ import unittest
 import logging
 
 from pytorch_transformers import PretrainedConfig, PreTrainedModel
-from pytorch_transformers.modeling_bert import BertModel, BertConfig, PRETRAINED_MODEL_ARCHIVE_MAP
+from pytorch_transformers.modeling_bert import BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP
 
 class ModelUtilsTest(unittest.TestCase):
     def test_model_from_pretrained(self):
         logging.basicConfig(level=logging.INFO)
-        for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+        for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
             config = BertConfig.from_pretrained(model_name)
             self.assertIsNotNone(config)
             self.assertIsInstance(config, PretrainedConfig)
diff --git a/pytorch_transformers/tests/modeling_xlm_test.py b/pytorch_transformers/tests/modeling_xlm_test.py
index 9c511f21a8..6e2e082d19 100644
--- a/pytorch_transformers/tests/modeling_xlm_test.py
+++ b/pytorch_transformers/tests/modeling_xlm_test.py
@@ -21,7 +21,7 @@ import shutil
 import pytest
 
 from pytorch_transformers import (XLMConfig, XLMModel, XLMWithLMHeadModel, XLMForQuestionAnswering, XLMForSequenceClassification)
-from pytorch_transformers.modeling_xlm import PRETRAINED_MODEL_ARCHIVE_MAP
+from pytorch_transformers.modeling_xlm import XLM_PRETRAINED_MODEL_ARCHIVE_MAP
 
 from .modeling_tests_commons import (create_and_check_commons, ConfigTester, ids_tensor)
 
@@ -251,7 +251,7 @@ class XLMModelTest(unittest.TestCase):
     @pytest.mark.slow
     def test_model_from_pretrained(self):
         cache_dir = "/tmp/pytorch_transformers_test/"
-        for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+        for model_name in list(XLM_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
             model = XLMModel.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
             self.assertIsNotNone(model)
diff --git a/pytorch_transformers/tests/modeling_xlnet_test.py b/pytorch_transformers/tests/modeling_xlnet_test.py
index b762426d2c..e167e2d2e8 100644
--- a/pytorch_transformers/tests/modeling_xlnet_test.py
+++ b/pytorch_transformers/tests/modeling_xlnet_test.py
@@ -26,7 +26,7 @@ import pytest
 import torch
 
 from pytorch_transformers import (XLNetConfig, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering)
-from pytorch_transformers.modeling_xlnet import PRETRAINED_MODEL_ARCHIVE_MAP
+from pytorch_transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_MAP
 
 from .modeling_tests_commons import ConfigTester, create_and_check_commons, ids_tensor
 
@@ -279,7 +279,7 @@ class XLNetModelTest(unittest.TestCase):
     @pytest.mark.slow
     def test_model_from_pretrained(self):
         cache_dir = "/tmp/pytorch_transformers_test/"
-        for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+        for model_name in list(XLNET_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
             model = XLNetModel.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
             self.assertIsNotNone(model)
diff --git a/pytorch_transformers/tests/tokenization_bert_test.py b/pytorch_transformers/tests/tokenization_bert_test.py
index 37e20cc286..220bf45346 100644
--- a/pytorch_transformers/tests/tokenization_bert_test.py
+++ b/pytorch_transformers/tests/tokenization_bert_test.py
@@ -17,14 +17,12 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 import os
 import unittest
 from io import open
-import shutil
-import pytest
 
 from pytorch_transformers.tokenization_bert import (BasicTokenizer,
-                                                  BertTokenizer,
-                                                  WordpieceTokenizer,
-                                                  _is_control, _is_punctuation,
-                                                  _is_whitespace)
+                                                    BertTokenizer,
+                                                    WordpieceTokenizer,
+                                                    _is_control, _is_punctuation,
+                                                    _is_whitespace, VOCAB_FILES_NAMES)
 
 from .tokenization_tests_commons import create_and_check_tokenizer_commons
 
@@ -33,13 +31,15 @@ class TokenizationTest(unittest.TestCase):
     def test_full_tokenizer(self):
         vocab_tokens = [
             "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
-            "##ing", ","
+            "##ing", ",", "low", "lowest",
         ]
-        with open("/tmp/bert_tokenizer_test.txt", "w", encoding='utf-8') as vocab_writer:
+        vocab_directory = "/tmp/"
+        vocab_file = os.path.join(vocab_directory, VOCAB_FILES_NAMES['vocab_file'])
+        with open(vocab_file, "w", encoding='utf-8') as vocab_writer:
             vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
             vocab_file = vocab_writer.name
 
-        create_and_check_tokenizer_commons(self, BertTokenizer, vocab_file)
+        create_and_check_tokenizer_commons(self, BertTokenizer, pretrained_model_name_or_path=vocab_directory)
 
         tokenizer = BertTokenizer(vocab_file)
 
@@ -80,7 +80,7 @@ class TokenizationTest(unittest.TestCase):
         vocab = {}
         for (i, token) in enumerate(vocab_tokens):
             vocab[token] = i
-        tokenizer = WordpieceTokenizer(vocab=vocab)
+        tokenizer = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]")
 
         self.assertListEqual(tokenizer.tokenize(""), [])
 
diff --git a/pytorch_transformers/tests/tokenization_gpt2_test.py b/pytorch_transformers/tests/tokenization_gpt2_test.py
index 8b06161b53..30959ceed1 100644
--- a/pytorch_transformers/tests/tokenization_gpt2_test.py
+++ b/pytorch_transformers/tests/tokenization_gpt2_test.py
@@ -17,8 +17,9 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 import os
 import unittest
 import json
+import tempfile
 
-from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer
+from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer, VOCAB_FILES_NAMES
 
 from .tokenization_tests_commons import create_and_check_tokenizer_commons
 
@@ -28,31 +29,31 @@ class GPT2TokenizationTest(unittest.TestCase):
         """ Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt """
         vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
                  "lo", "low", "er",
-                 "low", "lowest", "newer", "wider"]
+                 "low", "lowest", "newer", "wider", "<unk>"]
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
         merges = ["#version: 0.2", "l o", "lo w", "e r", ""]
-        with open("/tmp/openai_tokenizer_vocab_test.json", "w") as fp:
-            fp.write(json.dumps(vocab_tokens))
-            vocab_file = fp.name
-        with open("/tmp/openai_tokenizer_merges_test.txt", "w") as fp:
-            fp.write("\n".join(merges))
-            merges_file = fp.name
+        special_tokens_map = {"unk_token": "<unk>"}
 
-        create_and_check_tokenizer_commons(self, GPT2Tokenizer, vocab_file, merges_file, special_tokens=["<unk>", "<pad>"])
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            vocab_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
+            merges_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['merges_file'])
+            with open(vocab_file, "w") as fp:
+                fp.write(json.dumps(vocab_tokens))
+            with open(merges_file, "w") as fp:
+                fp.write("\n".join(merges))
 
-        tokenizer = GPT2Tokenizer(vocab_file, merges_file, special_tokens=["<unk>", "<pad>"])
-        text = "lower"
-        bpe_tokens = ["low", "er"]
-        tokens = tokenizer.tokenize(text)
-        self.assertListEqual(tokens, bpe_tokens)
+            create_and_check_tokenizer_commons(self, GPT2Tokenizer, tmpdirname, **special_tokens_map)
 
-        input_tokens = tokens + ["<unk>"]
-        input_bpe_tokens = [13, 12, 16]
-        self.assertListEqual(
-            tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
+            tokenizer = GPT2Tokenizer(vocab_file, merges_file, **special_tokens_map)
+            text = "lower"
+            bpe_tokens = ["low", "er"]
+            tokens = tokenizer.tokenize(text)
+            self.assertListEqual(tokens, bpe_tokens)
 
-        os.remove(vocab_file)
-        os.remove(merges_file)
+            input_tokens = tokens + [tokenizer.unk_token]
+            input_bpe_tokens = [13, 12, 17]
+            self.assertListEqual(
+                tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
 
 
 if __name__ == '__main__':
diff --git a/pytorch_transformers/tests/tokenization_openai_test.py b/pytorch_transformers/tests/tokenization_openai_test.py
index 3f8c49f888..22f7d70017 100644
--- a/pytorch_transformers/tests/tokenization_openai_test.py
+++ b/pytorch_transformers/tests/tokenization_openai_test.py
@@ -17,10 +17,9 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 import os
 import unittest
 import json
-import shutil
-import pytest
+import tempfile
 
-from pytorch_transformers.tokenization_openai import OpenAIGPTTokenizer
+from pytorch_transformers.tokenization_openai import OpenAIGPTTokenizer, VOCAB_FILES_NAMES
 
 from.tokenization_tests_commons import create_and_check_tokenizer_commons
 
@@ -32,31 +31,31 @@ class OpenAIGPTTokenizationTest(unittest.TestCase):
         vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
                  "w</w>", "r</w>", "t</w>",
                  "lo", "low", "er</w>",
-                 "low</w>", "lowest</w>", "newer</w>", "wider</w>"]
+                 "low</w>", "lowest</w>", "newer</w>", "wider</w>", "<unk>"]
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
         merges = ["#version: 0.2", "l o", "lo w", "e r</w>", ""]
-        with open("/tmp/openai_tokenizer_vocab_test.json", "w") as fp:
-            fp.write(json.dumps(vocab_tokens))
-            vocab_file = fp.name
-        with open("/tmp/openai_tokenizer_merges_test.txt", "w") as fp:
-            fp.write("\n".join(merges))
-            merges_file = fp.name
 
-        create_and_check_tokenizer_commons(self, OpenAIGPTTokenizer, vocab_file, merges_file, special_tokens=["<unk>", "<pad>"])
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            vocab_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
+            merges_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['merges_file'])
+            with open(vocab_file, "w") as fp:
+                fp.write(json.dumps(vocab_tokens))
+            with open(merges_file, "w") as fp:
+                fp.write("\n".join(merges))
 
-        tokenizer = OpenAIGPTTokenizer(vocab_file, merges_file, special_tokens=["<unk>", "<pad>"])
-        os.remove(vocab_file)
-        os.remove(merges_file)
+            create_and_check_tokenizer_commons(self, OpenAIGPTTokenizer, tmpdirname)
 
-        text = "lower"
-        bpe_tokens = ["low", "er</w>"]
-        tokens = tokenizer.tokenize(text)
-        self.assertListEqual(tokens, bpe_tokens)
+            tokenizer = OpenAIGPTTokenizer(vocab_file, merges_file)
 
-        input_tokens = tokens + ["<unk>"]
-        input_bpe_tokens = [14, 15, 20]
-        self.assertListEqual(
-            tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
+            text = "lower"
+            bpe_tokens = ["low", "er</w>"]
+            tokens = tokenizer.tokenize(text)
+            self.assertListEqual(tokens, bpe_tokens)
+
+            input_tokens = tokens + ["<unk>"]
+            input_bpe_tokens = [14, 15, 20]
+            self.assertListEqual(
+                tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
 
 
 if __name__ == '__main__':
diff --git a/pytorch_transformers/tests/tokenization_tests_commons.py b/pytorch_transformers/tests/tokenization_tests_commons.py
index 876f7747be..07f962bcab 100644
--- a/pytorch_transformers/tests/tokenization_tests_commons.py
+++ b/pytorch_transformers/tests/tokenization_tests_commons.py
@@ -17,6 +17,7 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 import os
 import sys
 from io import open
+import tempfile
 
 if sys.version_info[0] == 3:
     unicode = str
@@ -28,22 +29,19 @@ else:
 
 
 def create_and_check_save_and_load_tokenizer(tester, tokenizer_class, *inputs, **kwargs):
-    tokenizer = tokenizer_class(*inputs, **kwargs)
+    tokenizer = tokenizer_class.from_pretrained(*inputs, **kwargs)
 
     before_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running")
 
-    vocab_path="/tmp/"
-    output_files = tokenizer.save_vocabulary(vocab_path=vocab_path)
-    tokenizer = tokenizer.from_pretrained(vocab_path)
-
-    for f in output_files:
-        os.remove(f)
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        tokenizer.save_pretrained(tmpdirname)
+        tokenizer = tokenizer.from_pretrained(tmpdirname)
 
     after_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running")
     tester.assertListEqual(before_tokens, after_tokens)
 
 def create_and_check_pickle_tokenizer(tester, tokenizer_class, *inputs, **kwargs):
-    tokenizer = tokenizer_class(*inputs, **kwargs)
+    tokenizer = tokenizer_class.from_pretrained(*inputs, **kwargs)
 
     text = u"Munich and Berlin are nice cities"
     filename = u"/tmp/tokenizer.bin"
@@ -58,8 +56,54 @@ def create_and_check_pickle_tokenizer(tester, tokenizer_class, *inputs, **kwargs
     tester.assertListEqual(subwords, subwords_loaded)
 
 
+def create_and_check_add_tokens_tokenizer(tester, tokenizer_class, *inputs, **kwargs):
+    tokenizer = tokenizer_class.from_pretrained(*inputs, **kwargs)
+
+    vocab_size = tokenizer.vocab_size
+    all_size = len(tokenizer)
+
+    tester.assertNotEqual(vocab_size, 0)
+    tester.assertEqual(vocab_size, all_size)
+
+    new_toks = ["aaaaabbbbbb", "cccccccccdddddddd"]
+    added_toks = tokenizer.add_tokens(new_toks)
+    vocab_size_2 = tokenizer.vocab_size
+    all_size_2 = len(tokenizer)
+
+    tester.assertNotEqual(vocab_size_2, 0)
+    tester.assertEqual(vocab_size, vocab_size_2)
+    tester.assertEqual(added_toks, len(new_toks))
+    tester.assertEqual(all_size_2, all_size + len(new_toks))
+
+    tokens = tokenizer.encode("aaaaabbbbbb low cccccccccdddddddd l")
+    tester.assertGreaterEqual(len(tokens), 4)
+    tester.assertGreater(tokens[0], tokenizer.vocab_size - 1)
+    tester.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
+
+    new_toks_2 = {'eos_token': ">>>>|||<||<<|<<",
+                  'pad_token': "<<<<<|||>|>>>>|>"}
+    added_toks_2 = tokenizer.add_special_tokens(new_toks_2)
+    vocab_size_3 = tokenizer.vocab_size
+    all_size_3 = len(tokenizer)
+
+    tester.assertNotEqual(vocab_size_3, 0)
+    tester.assertEqual(vocab_size, vocab_size_3)
+    tester.assertEqual(added_toks_2, len(new_toks_2))
+    tester.assertEqual(all_size_3, all_size_2 + len(new_toks_2))
+
+    tokens = tokenizer.encode(">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l")
+
+    tester.assertGreaterEqual(len(tokens), 6)
+    tester.assertGreater(tokens[0], tokenizer.vocab_size - 1)
+    tester.assertGreater(tokens[0], tokens[1])
+    tester.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
+    tester.assertGreater(tokens[-2], tokens[-3])
+    tester.assertEqual(tokens[0], tokenizer.convert_tokens_to_ids(tokenizer.eos_token))
+    tester.assertEqual(tokens[-2], tokenizer.convert_tokens_to_ids(tokenizer.pad_token))
+
+
 def create_and_check_required_methods_tokenizer(tester, tokenizer_class, *inputs, **kwargs):
-    tokenizer = tokenizer_class(*inputs, **kwargs)
+    tokenizer = tokenizer_class.from_pretrained(*inputs, **kwargs)
 
     text = u"He is very happy, UNwant\u00E9d,running"
     tokens = tokenizer.tokenize(text)
@@ -75,5 +119,6 @@ def create_and_check_required_methods_tokenizer(tester, tokenizer_class, *inputs
 
 def create_and_check_tokenizer_commons(tester, tokenizer_class, *inputs, **kwargs):
     create_and_check_required_methods_tokenizer(tester, tokenizer_class, *inputs, **kwargs)
+    create_and_check_add_tokens_tokenizer(tester, tokenizer_class, *inputs, **kwargs)
     create_and_check_save_and_load_tokenizer(tester, tokenizer_class, *inputs, **kwargs)
     create_and_check_pickle_tokenizer(tester, tokenizer_class, *inputs, **kwargs)
diff --git a/pytorch_transformers/tests/tokenization_transfo_xl_test.py b/pytorch_transformers/tests/tokenization_transfo_xl_test.py
index f583e30b56..a4ddd357b9 100644
--- a/pytorch_transformers/tests/tokenization_transfo_xl_test.py
+++ b/pytorch_transformers/tests/tokenization_transfo_xl_test.py
@@ -17,10 +17,9 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 import os
 import unittest
 from io import open
-import shutil
-import pytest
+import tempfile
 
-from pytorch_transformers.tokenization_transfo_xl import TransfoXLTokenizer
+from pytorch_transformers.tokenization_transfo_xl import TransfoXLTokenizer, VOCAB_FILES_NAMES
 
 from.tokenization_tests_commons import create_and_check_tokenizer_commons
 
@@ -28,22 +27,23 @@ class TransfoXLTokenizationTest(unittest.TestCase):
 
     def test_full_tokenizer(self):
         vocab_tokens = [
-            "<unk>", "[CLS]", "[SEP]", "want", "unwanted", "wa", "un", "running", ","
+            "<unk>", "[CLS]", "[SEP]", "want", "unwanted", "wa", "un",
+            "running", ",", "low", "l",
         ]
-        with open("/tmp/transfo_xl_tokenizer_test.txt", "w", encoding='utf-8') as vocab_writer:
-            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
-            vocab_file = vocab_writer.name
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            vocab_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
+            with open(vocab_file, "w", encoding='utf-8') as vocab_writer:
+                vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
 
-        create_and_check_tokenizer_commons(self, TransfoXLTokenizer, vocab_file=vocab_file, lower_case=True)
+            create_and_check_tokenizer_commons(self, TransfoXLTokenizer, tmpdirname, lower_case=True)
 
-        tokenizer = TransfoXLTokenizer(vocab_file=vocab_file, lower_case=True)
-        os.remove(vocab_file)
+            tokenizer = TransfoXLTokenizer(vocab_file=vocab_file, lower_case=True)
 
-        tokens = tokenizer.tokenize(u"<unk> UNwanted , running")
-        self.assertListEqual(tokens, ["<unk>", "unwanted", ",", "running"])
+            tokens = tokenizer.tokenize(u"<unk> UNwanted , running")
+            self.assertListEqual(tokens, ["<unk>", "unwanted", ",", "running"])
 
-        self.assertListEqual(
-            tokenizer.convert_tokens_to_ids(tokens), [0, 4, 8, 7])
+            self.assertListEqual(
+                tokenizer.convert_tokens_to_ids(tokens), [0, 4, 8, 7])
 
     def test_full_tokenizer_lower(self):
         tokenizer = TransfoXLTokenizer(lower_case=True)
diff --git a/pytorch_transformers/tests/tokenization_utils_test.py b/pytorch_transformers/tests/tokenization_utils_test.py
index e8856d50c2..26ec2d7a39 100644
--- a/pytorch_transformers/tests/tokenization_utils_test.py
+++ b/pytorch_transformers/tests/tokenization_utils_test.py
@@ -17,6 +17,7 @@ from __future__ import division
 from __future__ import print_function
 
 import unittest
+import six
 
 from pytorch_transformers import PreTrainedTokenizer
 from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer
@@ -27,8 +28,17 @@ class TokenizerUtilsTest(unittest.TestCase):
         for model_name in s3_models[:1]:
             tokenizer = tokenizer_class.from_pretrained(model_name)
             self.assertIsNotNone(tokenizer)
+            self.assertIsInstance(tokenizer, tokenizer_class)
             self.assertIsInstance(tokenizer, PreTrainedTokenizer)
 
+            for special_tok in tokenizer.all_special_tokens:
+                if six.PY2:
+                    self.assertIsInstance(special_tok, unicode)
+                else:
+                    self.assertIsInstance(special_tok, str)
+                special_tok_id = tokenizer.convert_tokens_to_ids(special_tok)
+                self.assertIsInstance(special_tok_id, int)
+
     def test_pretrained_tokenizers(self):
         self.check_tokenizer_from_pretrained(GPT2Tokenizer)
 
diff --git a/pytorch_transformers/tests/tokenization_xlm_test.py b/pytorch_transformers/tests/tokenization_xlm_test.py
index 00d273a628..b543ed23f8 100644
--- a/pytorch_transformers/tests/tokenization_xlm_test.py
+++ b/pytorch_transformers/tests/tokenization_xlm_test.py
@@ -17,10 +17,9 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 import os
 import unittest
 import json
-import shutil
-import pytest
+import tempfile
 
-from pytorch_transformers.tokenization_xlm import XLMTokenizer
+from pytorch_transformers.tokenization_xlm import XLMTokenizer, VOCAB_FILES_NAMES
 
 from .tokenization_tests_commons import create_and_check_tokenizer_commons
 
@@ -31,31 +30,31 @@ class XLMTokenizationTest(unittest.TestCase):
         vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
                  "w</w>", "r</w>", "t</w>",
                  "lo", "low", "er</w>",
-                 "low</w>", "lowest</w>", "newer</w>", "wider</w>"]
+                 "low</w>", "lowest</w>", "newer</w>", "wider</w>", "<unk>"]
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
         merges = ["l o 123", "lo w 1456", "e r</w> 1789", ""]
-        with open("/tmp/openai_tokenizer_vocab_test.json", "w") as fp:
-            fp.write(json.dumps(vocab_tokens))
-            vocab_file = fp.name
-        with open("/tmp/openai_tokenizer_merges_test.txt", "w") as fp:
-            fp.write("\n".join(merges))
-            merges_file = fp.name
 
-        create_and_check_tokenizer_commons(self, XLMTokenizer, vocab_file, merges_file, special_tokens=["<unk>", "<pad>"])
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            vocab_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
+            merges_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['merges_file'])
+            with open(vocab_file, "w") as fp:
+                fp.write(json.dumps(vocab_tokens))
+            with open(merges_file, "w") as fp:
+                fp.write("\n".join(merges))
 
-        tokenizer = XLMTokenizer(vocab_file, merges_file, special_tokens=["<unk>", "<pad>"])
-        os.remove(vocab_file)
-        os.remove(merges_file)
+            create_and_check_tokenizer_commons(self, XLMTokenizer, tmpdirname)
 
-        text = "lower"
-        bpe_tokens = ["low", "er</w>"]
-        tokens = tokenizer.tokenize(text)
-        self.assertListEqual(tokens, bpe_tokens)
+            tokenizer = XLMTokenizer(vocab_file, merges_file)
 
-        input_tokens = tokens + ["<unk>"]
-        input_bpe_tokens = [14, 15, 20]
-        self.assertListEqual(
-            tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
+            text = "lower"
+            bpe_tokens = ["low", "er</w>"]
+            tokens = tokenizer.tokenize(text)
+            self.assertListEqual(tokens, bpe_tokens)
+
+            input_tokens = tokens + ["<unk>"]
+            input_bpe_tokens = [14, 15, 20]
+            self.assertListEqual(
+                tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
 
 
 if __name__ == '__main__':
diff --git a/pytorch_transformers/tests/tokenization_xlnet_test.py b/pytorch_transformers/tests/tokenization_xlnet_test.py
index 6e81f214b7..8fc98209ba 100644
--- a/pytorch_transformers/tests/tokenization_xlnet_test.py
+++ b/pytorch_transformers/tests/tokenization_xlnet_test.py
@@ -16,10 +16,9 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 
 import os
 import unittest
-import shutil
-import pytest
+import tempfile
 
-from pytorch_transformers.tokenization_xlnet import (XLNetTokenizer, SPIECE_UNDERLINE)
+from pytorch_transformers.tokenization_xlnet import (XLNetTokenizer, SPIECE_UNDERLINE, VOCAB_FILES_NAMES)
 
 from.tokenization_tests_commons import create_and_check_tokenizer_commons
 
@@ -29,34 +28,37 @@ SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)),
 class XLNetTokenizationTest(unittest.TestCase):
 
     def test_full_tokenizer(self):
-        create_and_check_tokenizer_commons(self, XLNetTokenizer, SAMPLE_VOCAB)
-
         tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True)
 
-        tokens = tokenizer.tokenize(u'This is a test')
-        self.assertListEqual(tokens, [u'▁This', u'▁is', u'▁a', u'▁t', u'est'])
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            tokenizer.save_pretrained(tmpdirname)
 
-        self.assertListEqual(
-            tokenizer.convert_tokens_to_ids(tokens), [285, 46, 10, 170, 382])
+            create_and_check_tokenizer_commons(self, XLNetTokenizer, tmpdirname)
 
-        tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.")
-        self.assertListEqual(tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b',
-                                      u'or', u'n', SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'',
-                                      u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this',
-                                      SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's', u'é', u'.'])
-        ids = tokenizer.convert_tokens_to_ids(tokens)
-        self.assertListEqual(
-            ids, [8, 21, 84, 55, 24, 19, 7, 0,
-                            602, 347, 347, 347, 3, 12, 66,
-                            46, 72, 80, 6, 0, 4])
+            tokens = tokenizer.tokenize(u'This is a test')
+            self.assertListEqual(tokens, [u'▁This', u'▁is', u'▁a', u'▁t', u'est'])
 
-        back_tokens = tokenizer.convert_ids_to_tokens(ids)
-        self.assertListEqual(back_tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b',
-                                           u'or', u'n', SPIECE_UNDERLINE + u'in',
-                                           SPIECE_UNDERLINE + u'', u'<unk>', u'2', u'0', u'0', u'0', u',',
-                                           SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this',
-                                           SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's',
-                                           u'<unk>', u'.'])
+            self.assertListEqual(
+                tokenizer.convert_tokens_to_ids(tokens), [285, 46, 10, 170, 382])
+
+            tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.")
+            self.assertListEqual(tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b',
+                                        u'or', u'n', SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'',
+                                        u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this',
+                                        SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's', u'é', u'.'])
+            ids = tokenizer.convert_tokens_to_ids(tokens)
+            self.assertListEqual(
+                ids, [8, 21, 84, 55, 24, 19, 7, 0,
+                    602, 347, 347, 347, 3, 12, 66,
+                    46, 72, 80, 6, 0, 4])
+
+            back_tokens = tokenizer.convert_ids_to_tokens(ids)
+            self.assertListEqual(back_tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b',
+                                            u'or', u'n', SPIECE_UNDERLINE + u'in',
+                                            SPIECE_UNDERLINE + u'', u'<unk>', u'2', u'0', u'0', u'0', u',',
+                                            SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this',
+                                            SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's',
+                                            u'<unk>', u'.'])
 
     def test_tokenizer_lower(self):
         tokenizer = XLNetTokenizer(SAMPLE_VOCAB, do_lower_case=True)
diff --git a/pytorch_transformers/tokenization_bert.py b/pytorch_transformers/tokenization_bert.py
index b26e5066e9..3e14673f46 100644
--- a/pytorch_transformers/tokenization_bert.py
+++ b/pytorch_transformers/tokenization_bert.py
@@ -22,7 +22,6 @@ import os
 import unicodedata
 from io import open
 
-from .file_utils import cached_path
 from .tokenization_utils import PreTrainedTokenizer, clean_up_tokenization
 
 logger = logging.getLogger(__name__)
@@ -32,20 +31,21 @@ VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'}
 PRETRAINED_VOCAB_FILES_MAP = {
     'vocab_file':
     {
-    'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
-    'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
-    'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt",
-    'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt",
-    'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt",
-    'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt",
-    'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt",
-    'bert-base-german-cased': "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-vocab.txt",
-    'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-vocab.txt",
-    'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-vocab.txt",
-    'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-vocab.txt",
-    'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-vocab.txt",
-    'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-vocab.txt",
-}}
+        'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
+        'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
+        'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt",
+        'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt",
+        'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt",
+        'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt",
+        'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt",
+        'bert-base-german-cased': "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-vocab.txt",
+        'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-vocab.txt",
+        'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-vocab.txt",
+        'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-vocab.txt",
+        'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-vocab.txt",
+        'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-vocab.txt",
+    }
+}
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
     'bert-base-uncased': 512,
@@ -93,8 +93,9 @@ class BertTokenizer(PreTrainedTokenizer):
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
 
-    def __init__(self, vocab_file, do_lower_case=True, max_len=None, do_basic_tokenize=True,
-                 never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
+    def __init__(self, vocab_file, do_lower_case=True, do_basic_tokenize=True, never_split=None,
+                 unk_token="[UNK]", sep_token="[SEP]", pad_token="[PAD]", cls_token="[CLS]",
+                 mask_token="[MASK]", **kwargs):
         """Constructs a BertTokenizer.
 
         Args:
@@ -102,17 +103,18 @@ class BertTokenizer(PreTrainedTokenizer):
           do_lower_case: Whether to lower case the input
                          Only has an effect when do_wordpiece_only=False
           do_basic_tokenize: Whether to do basic tokenization before wordpiece.
-          max_len: An artificial maximum length to truncate tokenized sequences to;
-                         Effective maximum length is always the minimum of this
-                         value (if specified) and the underlying BERT model's
-                         sequence length.
           never_split: List of tokens which will never be split during tokenization.
                          Only has an effect when do_wordpiece_only=False
         """
+        super(BertTokenizer, self).__init__(unk_token=unk_token, sep_token=sep_token,
+                                            pad_token=pad_token, cls_token=cls_token,
+                                            mask_token=mask_token, **kwargs)
         if not os.path.isfile(vocab_file):
             raise ValueError(
                 "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
                 "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file))
+        if never_split is None:
+            never_split = self.all_special_tokens
         self.vocab = load_vocab(vocab_file)
         self.ids_to_tokens = collections.OrderedDict(
             [(ids, tok) for tok, ids in self.vocab.items()])
@@ -120,90 +122,34 @@ class BertTokenizer(PreTrainedTokenizer):
         if do_basic_tokenize:
           self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case,
                                                 never_split=never_split)
-        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
-        self.max_len = max_len if max_len is not None else int(1e12)
+        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
 
     @property
-    def UNK_TOKEN(self):
-        return "[UNK]"
+    def vocab_size(self):
+        return len(self.vocab)
 
-    @property
-    def SEP_TOKEN(self):
-        return "[SEP]"
-
-    @property
-    def PAD_TOKEN(self):
-        return "[PAD]"
-
-    @property
-    def CLS_TOKEN(self):
-        return "[CLS]"
-
-    @property
-    def MASK_TOKEN(self):
-        return "[MASK]"
-
-    @property
-    def UNK_ID(self):
-        return self.vocab["[UNK]"]
-
-    @property
-    def SEP_ID(self):
-        return self.vocab["[SEP]"]
-
-    @property
-    def PAD_ID(self):
-        return self.vocab["[PAD]"]
-
-    @property
-    def CLS_ID(self):
-        return self.vocab["[CLS]"]
-
-    @property
-    def MASK_ID(self):
-        return self.vocab["[MASK]"]
-
-    def tokenize(self, text):
+    def _tokenize(self, text):
         split_tokens = []
         if self.do_basic_tokenize:
-            for token in self.basic_tokenizer.tokenize(text):
+            for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens):
                 for sub_token in self.wordpiece_tokenizer.tokenize(token):
                     split_tokens.append(sub_token)
         else:
             split_tokens = self.wordpiece_tokenizer.tokenize(text)
         return split_tokens
 
-    def convert_tokens_to_ids(self, tokens):
-        """Converts a sequence of tokens into ids using the vocab."""
-        ids = []
-        for token in tokens:
-            ids.append(self.vocab[token])
-        if len(ids) > self.max_len:
-            logger.warning(
-                "Token indices sequence length is longer than the specified maximum "
-                " sequence length for this BERT model ({} > {}). Running this"
-                " sequence through BERT will result in indexing errors".format(len(ids), self.max_len)
-            )
-        return ids
+    def _convert_token_to_id(self, token):
+        """ Converts a token (str/unicode) in an id using the vocab. """
+        return self.vocab.get(token, self.vocab.get(self.unk_token))
 
-    def convert_ids_to_tokens(self, ids):
-        """Converts a sequence of ids in wordpiece tokens using the vocab."""
-        tokens = []
-        for i in ids:
-            tokens.append(self.ids_to_tokens[i])
-        return tokens
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+        return self.ids_to_tokens.get(index, self.unk_token)
 
-    def encode(self, text):
-        return self.convert_tokens_to_ids(self.tokenize(text))
-
-    def decode(self, token_ids, clean_up_tokenization_spaces=True):
+    def _convert_ids_to_string(self, tokens_ids):
         """Converts a sequence of ids in a string."""
-        tokens = self.convert_ids_to_tokens(token_ids)
+        tokens = self.convert_ids_to_tokens(tokens_ids)
         out_string = ''.join(tokens).replace(' ##', '').strip()
-        if clean_up_tokenization_spaces:
-            for special_tok in (self.UNK_TOKEN, self.SEP_TOKEN, self.PAD_TOKEN, self.CLS_TOKEN, self.MASK_TOKEN):
-                out_string = out_string.replace(special_tok, '')
-            out_string = clean_up_tokenization(out_string)
         return out_string
 
     def save_vocabulary(self, vocab_path):
@@ -245,17 +191,20 @@ class BasicTokenizer(object):
 
     def __init__(self,
                  do_lower_case=True,
-                 never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
+                 never_split=None):
         """Constructs a BasicTokenizer.
 
         Args:
           do_lower_case: Whether to lower case the input.
         """
+        if never_split is None:
+            never_split = []
         self.do_lower_case = do_lower_case
         self.never_split = never_split
 
-    def tokenize(self, text):
+    def tokenize(self, text, never_split=None):
         """Tokenizes a piece of text."""
+        never_split = self.never_split + (never_split if never_split is not None else [])
         text = self._clean_text(text)
         # This was added on November 1st, 2018 for the multilingual and Chinese
         # models. This is also applied to the English models now, but it doesn't
@@ -267,7 +216,7 @@ class BasicTokenizer(object):
         orig_tokens = whitespace_tokenize(text)
         split_tokens = []
         for token in orig_tokens:
-            if self.do_lower_case and token not in self.never_split:
+            if self.do_lower_case and token not in never_split:
                 token = token.lower()
                 token = self._run_strip_accents(token)
             split_tokens.extend(self._run_split_on_punc(token))
@@ -286,9 +235,9 @@ class BasicTokenizer(object):
             output.append(char)
         return "".join(output)
 
-    def _run_split_on_punc(self, text):
+    def _run_split_on_punc(self, text, never_split=None):
         """Splits punctuation on a piece of text."""
-        if text in self.never_split:
+        if never_split is not None and text in never_split:
             return [text]
         chars = list(text)
         i = 0
@@ -360,7 +309,7 @@ class BasicTokenizer(object):
 class WordpieceTokenizer(object):
     """Runs WordPiece tokenization."""
 
-    def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
+    def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
         self.vocab = vocab
         self.unk_token = unk_token
         self.max_input_chars_per_word = max_input_chars_per_word
diff --git a/pytorch_transformers/tokenization_gpt2.py b/pytorch_transformers/tokenization_gpt2.py
index abdfe39c1c..af1ad2cf8f 100644
--- a/pytorch_transformers/tokenization_gpt2.py
+++ b/pytorch_transformers/tokenization_gpt2.py
@@ -38,7 +38,6 @@ logger = logging.getLogger(__name__)
 VOCAB_FILES_NAMES = {
     'vocab_file': 'vocab.json',
     'merges_file': 'merges.txt',
-    'special_tokens_file': 'special_tokens.txt'
 }
 
 PRETRAINED_VOCAB_FILES_MAP = {
@@ -52,11 +51,6 @@ PRETRAINED_VOCAB_FILES_MAP = {
         'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt",
         'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-merges.txt",
     },
-    'special_tokens_file':
-    {
-        'gpt2': None,
-        'gpt2-medium': None,
-    }
 }
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
@@ -108,8 +102,10 @@ class GPT2Tokenizer(PreTrainedTokenizer):
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
 
-    def __init__(self, vocab_file, merges_file, special_tokens_file=None, special_tokens=None, errors='replace', max_len=None):
-        self.max_len = max_len if max_len is not None else int(1e12)
+    def __init__(self, vocab_file, merges_file, errors='replace',
+                 bos_token="<|endoftext|>", eos_token="<|endoftext|>", **kwargs):
+        super(GPT2Tokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, **kwargs)
+
         self.encoder = json.load(open(vocab_file))
         self.decoder = {v:k for k,v in self.encoder.items()}
         self.errors = errors # how to handle errors in decoding
@@ -123,32 +119,9 @@ class GPT2Tokenizer(PreTrainedTokenizer):
         # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
         self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
 
-        all_special_tokens = []
-        if special_tokens_file is not None:
-            special_tokens_to_add = open(special_tokens_file, encoding='utf-8').read().split('\n')[:-1]
-            all_special_tokens.extend(special_tokens_to_add)
-        if special_tokens is not None and special_tokens:
-            all_special_tokens.extend(special_tokens)
-
-        self.special_tokens = {}
-        self.special_tokens_decoder = {}
-        self.set_special_tokens(all_special_tokens)
-
-    def __len__(self):
-        return len(self.encoder) + len(self.special_tokens)
-
-    def set_special_tokens(self, special_tokens):
-        """ Add a list of additional tokens to the encoder.
-            The additional tokens are indexed starting from the last index of the
-            current vocabulary in the order of the `special_tokens` list.
-        """
-        if not special_tokens:
-            self.special_tokens = {}
-            self.special_tokens_decoder = {}
-            return
-        self.special_tokens = dict((tok, len(self.encoder) + i) for i, tok in enumerate(special_tokens))
-        self.special_tokens_decoder = {v:k for k, v in self.special_tokens.items()}
-        logger.info("Special tokens {}".format(self.special_tokens))
+    @property
+    def vocab_size(self):
+        return len(self.encoder)
 
     def bpe(self, token):
         if token in self.cache:
@@ -191,7 +164,7 @@ class GPT2Tokenizer(PreTrainedTokenizer):
         self.cache[token] = word
         return word
 
-    def tokenize(self, text):
+    def _tokenize(self, text):
         """ Tokenize a string. """
         bpe_tokens = []
         for token in re.findall(self.pat, text):
@@ -202,57 +175,27 @@ class GPT2Tokenizer(PreTrainedTokenizer):
             bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' '))
         return bpe_tokens
 
-    def convert_tokens_to_ids(self, tokens):
-        """ Converts a sequence of tokens into ids using the vocab. """
-        ids = []
-        if isinstance(tokens, str) or (sys.version_info[0] == 2 and isinstance(tokens, unicode)):
-            if tokens in self.special_tokens:
-                return self.special_tokens[tokens]
-            else:
-                return self.encoder.get(tokens, 0)
-        for token in tokens:
-            if token in self.special_tokens:
-                ids.append(self.special_tokens[token])
-            else:
-                ids.append(self.encoder.get(token, 0))
-        if len(ids) > self.max_len:
-            logger.warning(
-                "Token indices sequence length is longer than the specified maximum "
-                " sequence length for this OpenAI GPT model ({} > {}). Running this"
-                " sequence through the model will result in indexing errors".format(len(ids), self.max_len)
-            )
-        return ids
+    def _convert_token_to_id(self, token):
+        """ Converts a token (str/unicode) in an id using the vocab. """
+        return self.encoder.get(token, self.encoder.get(self.unk_token))
 
-    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
-        """Converts a sequence of ids in BPE tokens using the vocab."""
-        tokens = []
-        for i in ids:
-            if i in self.special_tokens_decoder:
-                if not skip_special_tokens:
-                    tokens.append(self.special_tokens_decoder[i])
-            else:
-                tokens.append(self.decoder[i])
-        return tokens
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+        return self.decoder.get(index, self.unk_token)
 
-    def encode(self, text):
-        return self.convert_tokens_to_ids(self.tokenize(text))
-
-    def decode(self, tokens, skip_special_tokens=False, clean_up_tokenization_spaces=True):
-        text = ''.join(self.convert_ids_to_tokens(tokens, skip_special_tokens=skip_special_tokens))
+    def _convert_ids_to_string(self, tokens_ids):
+        """Converts a sequence of ids in a string."""
+        text = ''.join(tokens_ids)
         text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
-        if clean_up_tokenization_spaces:
-            text = text.replace('<unk>', '')
-            text = clean_up_tokenization(text)
         return text
 
-    def save_vocabulary(self, vocab_path):
+    def save_vocabulary(self, save_directory):
         """Save the tokenizer vocabulary and merge files to a directory."""
-        if not os.path.isdir(vocab_path):
-            logger.error("Vocabulary path ({}) should be a directory".format(vocab_path))
+        if not os.path.isdir(save_directory):
+            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
             return
-        vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['vocab_file'])
-        merge_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['merges_file'])
-        special_tokens_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['special_tokens_file'])
+        vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])
+        merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES['merges_file'])
 
         with open(vocab_file, 'w', encoding='utf-8') as f:
             f.write(json.dumps(self.encoder, ensure_ascii=False))
@@ -268,14 +211,4 @@ class GPT2Tokenizer(PreTrainedTokenizer):
                 writer.write(' '.join(bpe_tokens) + u'\n')
                 index += 1
 
-        index = len(self.encoder)
-        with open(special_tokens_file, 'w', encoding='utf-8') as writer:
-            for token, token_index in sorted(self.special_tokens.items(), key=lambda kv: kv[1]):
-                if index != token_index:
-                    logger.warning("Saving special tokens vocabulary to {}: BPE indices are not consecutive."
-                                   " Please check that the tokenizer is not corrupted!".format(special_tokens_file))
-                    index = token_index
-                writer.write(token + u'\n')
-                index += 1
-
-        return vocab_file, merge_file, special_tokens_file
+        return vocab_file, merge_file
diff --git a/pytorch_transformers/tokenization_openai.py b/pytorch_transformers/tokenization_openai.py
index 419dfdad92..16d355c57d 100644
--- a/pytorch_transformers/tokenization_openai.py
+++ b/pytorch_transformers/tokenization_openai.py
@@ -20,13 +20,9 @@ import json
 import logging
 import os
 import re
-import sys
 from io import open
 
-from tqdm import tqdm
-
-from .file_utils import cached_path
-from .tokenization_utils import PreTrainedTokenizer, clean_up_tokenization
+from .tokenization_utils import PreTrainedTokenizer
 from .tokenization_bert import BasicTokenizer
 
 logger = logging.getLogger(__name__)
@@ -34,7 +30,6 @@ logger = logging.getLogger(__name__)
 VOCAB_FILES_NAMES = {
     'vocab_file': 'vocab.json',
     'merges_file': 'merges.txt',
-    'special_tokens_file': 'special_tokens.txt'
 }
 
 PRETRAINED_VOCAB_FILES_MAP = {
@@ -46,10 +41,6 @@ PRETRAINED_VOCAB_FILES_MAP = {
     {
         'openai-gpt': "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt",
     },
-    'special_tokens_file':
-    {
-        'openai-gpt': None,
-    }
 }
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
@@ -88,14 +79,14 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
     BPE tokenizer. Peculiarities:
         - lower case all inputs
         - uses SpaCy tokenizer and ftfy for pre-BPE tokenization if they are installed, fallback to BERT's BasicTokenizer if not.
-        - argument special_tokens and function set_special_tokens:
-            can be used to add additional symbols (ex: "__classify__") to a vocabulary.
     """
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
 
-    def __init__(self, vocab_file, merges_file, special_tokens_file=None, special_tokens=None, max_len=None):
+    def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
+        super(OpenAIGPTTokenizer, self).__init__(unk_token=unk_token, **kwargs)
+
         try:
             import ftfy
             import spacy
@@ -103,11 +94,9 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
             self.fix_text = ftfy.fix_text
         except ImportError:
             logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
-            self.nlp = BasicTokenizer(do_lower_case=True,
-                                      never_split=special_tokens if special_tokens is not None else [])
+            self.nlp = BasicTokenizer(do_lower_case=True)
             self.fix_text = None
 
-        self.max_len = max_len if max_len is not None else int(1e12)
         self.encoder = json.load(open(vocab_file, encoding="utf-8"))
         self.decoder = {v:k for k,v in self.encoder.items()}
         merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
@@ -115,35 +104,9 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
         self.bpe_ranks = dict(zip(merges, range(len(merges))))
         self.cache = {}
 
-        all_special_tokens = []
-        if special_tokens_file is not None:
-            special_tokens_to_add = open(special_tokens_file, encoding='utf-8').read().split('\n')[:-1]
-            all_special_tokens.extend(special_tokens_to_add)
-        if special_tokens is not None and special_tokens:
-            all_special_tokens.extend(special_tokens)
-
-        self.special_tokens = {}
-        self.special_tokens_decoder = {}
-        self.set_special_tokens(all_special_tokens)
-
-    def __len__(self):
-        return len(self.encoder) + len(self.special_tokens)
-
-    def set_special_tokens(self, special_tokens):
-        """ Add a list of additional tokens to the encoder.
-            The additional tokens are indexed starting from the last index of the
-            current vocabulary in the order of the `special_tokens` list.
-        """
-        if not special_tokens:
-            self.special_tokens = {}
-            self.special_tokens_decoder = {}
-            return
-        self.special_tokens = dict((tok, len(self.encoder) + i) for i, tok in enumerate(special_tokens))
-        self.special_tokens_decoder = {v:k for k, v in self.special_tokens.items()}
-        if self.fix_text is None:
-            # Using BERT's BasicTokenizer: we can update the tokenizer
-            self.nlp.never_split = special_tokens
-        logger.info("Special tokens {}".format(self.special_tokens))
+    @property
+    def vocab_size(self):
+        return len(self.encoder)
 
     def bpe(self, token):
         word = tuple(token[:-1]) + (token[-1] + '</w>',)
@@ -188,7 +151,7 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
         self.cache[token] = word
         return word
 
-    def tokenize(self, text):
+    def _tokenize(self, text):
         """ Tokenize a string. """
         split_tokens = []
         if self.fix_text is None:
@@ -203,58 +166,26 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
                 split_tokens.extend([t for t in self.bpe(token.text.lower()).split(' ')])
         return split_tokens
 
-    def convert_tokens_to_ids(self, tokens):
-        """ Converts a sequence of tokens into ids using the vocab. """
-        ids = []
-        if isinstance(tokens, str) or (sys.version_info[0] == 2 and isinstance(tokens, unicode)):
-            if tokens in self.special_tokens:
-                return self.special_tokens[tokens]
-            else:
-                return self.encoder.get(tokens, 0)
-        for token in tokens:
-            if token in self.special_tokens:
-                ids.append(self.special_tokens[token])
-            else:
-                ids.append(self.encoder.get(token, 0))
-        if len(ids) > self.max_len:
-            logger.warning(
-                "Token indices sequence length is longer than the specified maximum "
-                " sequence length for this OpenAI GPT model ({} > {}). Running this"
-                " sequence through the model will result in indexing errors".format(len(ids), self.max_len)
-            )
-        return ids
+    def _convert_token_to_id(self, token):
+        """ Converts a token (str/unicode) in an id using the vocab. """
+        return self.encoder.get(token, self.encoder.get(self.unk_token))
 
-    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
-        """Converts a sequence of ids in BPE tokens using the vocab."""
-        tokens = []
-        for i in ids:
-            if i in self.special_tokens_decoder:
-                if not skip_special_tokens:
-                    tokens.append(self.special_tokens_decoder[i])
-            else:
-                tokens.append(self.decoder[i])
-        return tokens
+    def _convert_id_to_token(self, index):
+        """Converts an id in a token (BPE) using the vocab."""
+        return self.decoder.get(index, self.unk_token)
 
-    def encode(self, text):
-        return self.convert_tokens_to_ids(self.tokenize(text))
-
-    def decode(self, ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
+    def _convert_ids_to_string(self, tokens_ids):
         """Converts a sequence of ids in a string."""
-        tokens = self.convert_ids_to_tokens(ids, skip_special_tokens=skip_special_tokens)
-        out_string = ''.join(tokens).replace('</w>', ' ').strip()
-        if clean_up_tokenization_spaces:
-            out_string = out_string.replace('<unk>', '')
-            out_string = clean_up_tokenization(out_string)
+        out_string = ''.join(tokens_ids).replace('</w>', ' ').strip()
         return out_string
 
-    def save_vocabulary(self, vocab_path):
+    def save_vocabulary(self, save_directory):
         """Save the tokenizer vocabulary and merge files to a directory."""
-        if not os.path.isdir(vocab_path):
-            logger.error("Vocabulary path ({}) should be a directory".format(vocab_path))
+        if not os.path.isdir(save_directory):
+            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
             return
-        vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['vocab_file'])
-        merge_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['merges_file'])
-        special_tokens_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['special_tokens_file'])
+        vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])
+        merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES['merges_file'])
 
         with open(vocab_file, 'w', encoding='utf-8') as f:
             f.write(json.dumps(self.encoder, ensure_ascii=False))
@@ -270,14 +201,4 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
                 writer.write(' '.join(bpe_tokens) + u'\n')
                 index += 1
 
-        index = len(self.encoder)
-        with open(special_tokens_file, 'w', encoding='utf-8') as writer:
-            for token, token_index in sorted(self.special_tokens.items(), key=lambda kv: kv[1]):
-                if index != token_index:
-                    logger.warning("Saving special tokens vocabulary to {}: BPE indices are not consecutive."
-                                   " Please check that the tokenizer is not corrupted!".format(special_tokens_file))
-                    index = token_index
-                writer.write(token + u'\n')
-                index += 1
-
-        return vocab_file, merge_file, special_tokens_file
+        return vocab_file, merge_file
diff --git a/pytorch_transformers/tokenization_transfo_xl.py b/pytorch_transformers/tokenization_transfo_xl.py
index a86c8fe460..0b4e8c0ca5 100644
--- a/pytorch_transformers/tokenization_transfo_xl.py
+++ b/pytorch_transformers/tokenization_transfo_xl.py
@@ -41,7 +41,7 @@ else:
 
 logger = logging.getLogger(__name__)
 
-VOCAB_FILES_NAMES = {'pretrained_vocab_file': 'vocab.bin'}
+VOCAB_FILES_NAMES = {'pretrained_vocab_file': 'vocab.bin', 'vocab_file': 'vocab.txt'}
 
 PRETRAINED_VOCAB_FILES_MAP = {
     'pretrained_vocab_file':
@@ -67,9 +67,17 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
 
-    def __init__(self, special=[], min_freq=0, max_size=None, lower_case=False,
+    def __init__(self, special=None, min_freq=0, max_size=None, lower_case=False,
                  delimiter=None, vocab_file=None, pretrained_vocab_file=None,
-                 never_split=("<unk>", "<eos>", "<formula>")):
+                 never_split=None, unk_token="<unk>", eos_token="<eos>",
+                 additional_special_tokens=["<formula>"], **kwargs):
+        super(TransfoXLTokenizer, self).__init__(unk_token=unk_token, eos_token=eos_token,
+                                                 additional_special_tokens=additional_special_tokens,
+                                                 **kwargs)
+        if never_split is None:
+            never_split = self.all_special_tokens
+        if special is None:
+            special = []
         self.counter = Counter()
         self.special = special
         self.min_freq = min_freq
@@ -200,11 +208,13 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
             self.idx2sym.append(sym)
             self.sym2idx[sym] = len(self.idx2sym) - 1
 
-    def get_sym(self, idx):
+    def _convert_id_to_token(self, idx):
+        """Converts an id in a token (BPE) using the vocab."""
         assert 0 <= idx < len(self), 'Index {} out of vocabulary range'.format(idx)
         return self.idx2sym[idx]
 
-    def get_idx(self, sym):
+    def _convert_token_to_id(self, sym):
+        """ Converts a token (str/unicode) in an id using the vocab. """
         if sym in self.sym2idx:
             return self.sym2idx[sym]
         else:
@@ -220,36 +230,19 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
             else:
                 raise ValueError('Token not in vocabulary and no <unk> token in vocabulary for replacement')
 
-    def convert_ids_to_tokens(self, indices):
-        """Converts a sequence of indices in symbols using the vocab."""
-        return [self.get_sym(idx) for idx in indices]
-
-    def convert_tokens_to_ids(self, symbols):
-        """Converts a sequence of symbols into ids using the vocab."""
-        return [self.get_idx(sym) for sym in symbols]
+    def _convert_ids_to_string(self, tokens_ids):
+        """Converts a sequence of ids in a string."""
+        out_string = ' '.join(tokens_ids).strip()
+        return out_string
 
     def convert_to_tensor(self, symbols):
         return torch.LongTensor(self.convert_tokens_to_ids(symbols))
 
-    def encode(self, text):
-        return self.convert_tokens_to_ids(self.tokenize(text))
-
-    def decode(self, indices, exclude=None, clean_up_tokenization_spaces=True):
-        """Converts a sequence of indices in a string."""
-        if exclude is None:
-            out_string = ' '.join([self.get_sym(idx) for idx in indices])
-        else:
-            out_string = ' '.join([self.get_sym(idx) for idx in indices if idx not in exclude])
-
-        if clean_up_tokenization_spaces:
-            out_string = clean_up_tokenization(out_string)
-
-        return out_string
-
-    def __len__(self):
+    @property
+    def vocab_size(self):
         return len(self.idx2sym)
 
-    def tokenize(self, line, add_eos=False, add_double_eos=False):
+    def _tokenize(self, line, add_eos=False, add_double_eos=False):
         line = line.strip()
         # convert to lower case
         if self.lower_case:
diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py
index 9004315657..b191dd22e6 100644
--- a/pytorch_transformers/tokenization_utils.py
+++ b/pytorch_transformers/tokenization_utils.py
@@ -16,37 +16,145 @@
 from __future__ import (absolute_import, division, print_function,
                         unicode_literals)
 
-import sys
-import json
 import logging
 import os
-import regex as re
+import json
+import six
 from io import open
 
-try:
-    from functools import lru_cache
-except ImportError:
-    # Just a dummy decorator to get the checks to run on python2
-    # because honestly I don't want to support a byte-level unicode BPE tokenizer on python 2 right now.
-    def lru_cache():
-        return lambda func: func
-
 from .file_utils import cached_path
 
 logger = logging.getLogger(__name__)
 
+SPECIAL_TOKENS_MAP_FILE = 'special_tokens_map.json'
+ADDED_TOKENS_FILE = 'added_tokens.json'
 
 class PreTrainedTokenizer(object):
-    """ An abstract class to handle dowloading and loading pretrained tokenizers.
+    """ An abstract class to handle dowloading and loading pretrained tokenizers and adding tokens to the vocabulary.
+
+        Derived class can set up a few special tokens to be used in common scripts and internals:
+            bos_token, eos_token, EOP_TOKEN, EOD_TOKEN, unk_token, sep_token, pad_token, cls_token, mask_token
+            additional_special_tokens = []
+
+        We defined an added_tokens_encoder to add new tokens to the vocabulary without having to handle the
+            specific vocabulary augmentation methods of the various underlying dictionnary structures (BPE, sentencepiece...).
     """
     vocab_files_names = {}
     pretrained_vocab_files_map = {}
     max_model_input_sizes = {}
 
+    SPECIAL_TOKENS_ATTRIBUTES = ["bos_token", "eos_token", "unk_token", "sep_token",
+                                 "pad_token", "cls_token", "mask_token",
+                                 "additional_special_tokens"]
+
+    @property
+    def bos_token(self):
+        if self._bos_token is None:
+            logger.error("Using bos_token, but it is not set yet.")
+        return self._bos_token
+
+    @property
+    def eos_token(self):
+        if self._eos_token is None:
+            logger.error("Using eos_token, but it is not set yet.")
+        return self._eos_token
+
+    @property
+    def unk_token(self):
+        if self._unk_token is None:
+            logger.error("Using unk_token, but it is not set yet.")
+        return self._unk_token
+
+    @property
+    def sep_token(self):
+        if self._sep_token is None:
+            logger.error("Using sep_token, but it is not set yet.")
+        return self._sep_token
+
+    @property
+    def pad_token(self):
+        if self._pad_token is None:
+            logger.error("Using pad_token, but it is not set yet.")
+        return self._pad_token
+
+    @property
+    def cls_token(self):
+        if self._cls_token is None:
+            logger.error("Using cls_token, but it is not set yet.")
+        return self._cls_token
+
+    @property
+    def mask_token(self):
+        if self._mask_token is None:
+            logger.error("Using mask_token, but it is not set yet.")
+        return self._mask_token
+
+    @property
+    def additional_special_tokens(self):
+        if self._additional_special_tokens is None:
+            logger.error("Using additional_special_tokens, but it is not set yet.")
+        return self._additional_special_tokens
+
+    @bos_token.setter
+    def bos_token(self, value):
+        self._bos_token = value
+
+    @eos_token.setter
+    def eos_token(self, value):
+        self._eos_token = value
+
+    @unk_token.setter
+    def unk_token(self, value):
+        self._unk_token = value
+
+    @sep_token.setter
+    def sep_token(self, value):
+        self._sep_token = value
+
+    @pad_token.setter
+    def pad_token(self, value):
+        self._pad_token = value
+
+    @cls_token.setter
+    def cls_token(self, value):
+        self._cls_token = value
+
+    @mask_token.setter
+    def mask_token(self, value):
+        self._mask_token = value
+
+    @additional_special_tokens.setter
+    def additional_special_tokens(self, value):
+        self._additional_special_tokens = value
+
+    def __init__(self, max_len=None, **kwargs):
+        self._bos_token = None
+        self._eos_token = None
+        self._unk_token = None
+        self._sep_token = None
+        self._pad_token = None
+        self._cls_token = None
+        self._mask_token = None
+        self._additional_special_tokens = []
+
+        self.max_len = max_len if max_len is not None else int(1e12)
+        self.added_tokens_encoder = {}
+        self.added_tokens_decoder = {}
+
+        for key, value in kwargs.items():
+            if key not in self.SPECIAL_TOKENS_ATTRIBUTES:
+                raise ValueError(
+                    "PreTrainedTokenizer.__init__() argument {} should be in {}".format(
+                        key, ', '.join(self.SPECIAL_TOKENS_ATTRIBUTES)))
+            else:
+                setattr(self, key, value)
+
+
     @classmethod
     def from_pretrained(cls, *inputs, **kwargs):
         return cls._from_pretrained(*inputs, **kwargs)
 
+
     @classmethod
     def _from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
         """
@@ -59,16 +167,20 @@ class PreTrainedTokenizer(object):
             for file_id, map_list in cls.pretrained_vocab_files_map.items():
                 vocab_files[file_id] = map_list[pretrained_model_name_or_path]
         else:
-            for file_id, file_name in cls.vocab_files_names.items():
+            all_vocab_files_names = {'added_tokens_file': ADDED_TOKENS_FILE,
+                                     'special_tokens_map_file': SPECIAL_TOKENS_MAP_FILE}
+            all_vocab_files_names.update(cls.vocab_files_names)
+            for file_id, file_name in all_vocab_files_names.items():
                 if os.path.isdir(pretrained_model_name_or_path):
                     full_file_name = os.path.join(pretrained_model_name_or_path, file_name)
                 else:
                     full_file_name = pretrained_model_name_or_path
                 if not os.path.exists(full_file_name):
-                    logger.info("Didn't find file {}. We don't load it.".format(full_file_name))
+                    logger.info("Didn't find file {}. We won't load it.".format(full_file_name))
                     full_file_name = None
                 vocab_files[file_id] = full_file_name
-        # redirect to the cache, if necessary
+
+        # Get files from url, cache, or disk depending on the case
         try:
             resolved_vocab_files = {}
             for file_id, file_path in vocab_files.items():
@@ -95,6 +207,7 @@ class PreTrainedTokenizer(object):
                 logger.info("loading file {} from cache at {}".format(
                     file_path, resolved_vocab_files[file_id]))
 
+        # Set max length if needed
         if pretrained_model_name_or_path in cls.max_model_input_sizes:
             # if we're using a pretrained model, ensure the tokenizer
             # wont index sequences longer than the number of positional embeddings
@@ -102,31 +215,255 @@ class PreTrainedTokenizer(object):
             kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
 
         # Merge resolved_vocab_files arguments in kwargs.
+        added_tokens_file = resolved_vocab_files.pop('added_tokens_file', None)
+        special_tokens_map_file = resolved_vocab_files.pop('special_tokens_map_file', None)
         for args_name, file_path in resolved_vocab_files.items():
-            kwargs[args_name] = file_path
+            if args_name not in kwargs:
+                kwargs[args_name] = file_path
+        if special_tokens_map_file is not None:
+            special_tokens_map = json.load(open(special_tokens_map_file, encoding="utf-8"))
+            for key, value in special_tokens_map.items():
+                if key not in kwargs:
+                    kwargs[key] = value
 
         # Instantiate tokenizer.
         tokenizer = cls(*inputs, **kwargs)
 
+        # Add supplementary tokens.
+        if added_tokens_file is not None:
+            added_tokens = json.load(open(added_tokens_file, encoding="utf-8"))
+            added_tok_encoder = dict((tok, len(tokenizer) + i) for i, tok in enumerate(added_tokens))
+            added_tok_decoder = {v:k for k, v in added_tok_encoder.items()}
+            tokenizer.added_tokens_encoder.update(added_tok_encoder)
+            tokenizer.added_tokens_decoder.update(added_tok_decoder)
+
         return tokenizer
 
-    def tokenize(self, text):
+
+    def save_pretrained(self, save_directory):
+        """ Save the tokenizer vocabulary files (with added tokens) and the
+            special-tokens-to-class-attributes-mapping to a directory, so that it
+            can be re-loaded using the `from_pretrained(save_directory)` class method.
+        """
+        if not os.path.isdir(save_directory):
+            logger.error("Saving directory ({}) should be a directory".format(save_directory))
+            return
+
+        special_tokens_map_file = os.path.join(save_directory, SPECIAL_TOKENS_MAP_FILE)
+        added_tokens_file = os.path.join(save_directory, ADDED_TOKENS_FILE)
+
+        with open(special_tokens_map_file, 'w', encoding='utf-8') as f:
+            f.write(json.dumps(self.special_tokens_map, ensure_ascii=False))
+
+        with open(added_tokens_file, 'w', encoding='utf-8') as f:
+            f.write(json.dumps(self.added_tokens_decoder, ensure_ascii=False))
+
+        vocab_files = self.save_vocabulary(save_directory)
+
+        return vocab_files + (special_tokens_map_file, added_tokens_file)
+
+
+    def save_vocabulary(self, save_directory):
+        """ Save the tokenizer vocabulary to a directory. This method doesn't save added tokens
+            and special token mappings.
+            
+            Please use `save_pretrained()` to save the full Tokenizer state so that it can be
+            reloaded using the `from_pretrained(save_directory)` class method.
+        """
+        raise NotImplementedError
+
+
+    def vocab_size(self):
+        raise NotImplementedError
+
+
+    def __len__(self):
+        return self.vocab_size + len(self.added_tokens_encoder)
+
+
+    def add_tokens(self, new_tokens):
+        """ Add a list of new tokens to the tokenizer class. If the new tokens are not in the
+            vocabulary, they are added to the added_tokens_encoder with indices starting from
+            the last index of the current vocabulary.
+
+            Returns:
+                Number of tokens added to the vocabulary which can be used to correspondingly
+                    increase the size of the associated model embedding matrices.
+        """
+        if not new_tokens:
+            return 0
+
+        to_add_tokens = []
+        for token in new_tokens:
+            if self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token):
+                to_add_tokens.append(token)
+                logger.info("Adding %s to the vocabulary", token)
+
+        added_tok_encoder = dict((tok, len(self) + i) for i, tok in enumerate(to_add_tokens))
+        added_tok_decoder = {v:k for k, v in added_tok_encoder.items()}
+        self.added_tokens_encoder.update(added_tok_encoder)
+        self.added_tokens_decoder.update(added_tok_decoder)
+
+        return len(to_add_tokens)
+
+
+    def add_special_tokens(self, special_tokens_dict):
+        """ Add a dictionnary of special tokens (eos, pad, cls...) to the encoder and link them
+            to class attributes. If the special tokens are not in the vocabulary, they are added
+            to it and indexed starting from the last index of the current vocabulary.
+
+            Returns:
+                Number of tokens added to the vocabulary which can be used to correspondingly
+                    increase the size of the associated model embedding matrices.
+        """
+        if not special_tokens_dict:
+            return 0
+
+        added_special_tokens = self.add_tokens(special_tokens_dict.values())
+        for key, value in special_tokens_dict.items():
+            logger.info("Assigning %s to the %s key of the tokenizer", value, key)
+            setattr(self, key, value)
+
+        return added_special_tokens
+
+
+    def tokenize(self, text, **kwargs):
+        """ Converts a string in a sequence of tokens (string), using the tokenizer.
+            Split in words for word-based vocabulary or sub-words for sub-word-based
+            vocabularies (BPE/SentencePieces/WordPieces).
+
+            Take care of added tokens.
+        """
+        def split_on_tokens(tok_list, text):
+            if not text:
+                return []
+            if not tok_list:
+                return self._tokenize(text, **kwargs)
+            tok = tok_list[0]
+            split_text = text.split(tok)
+            return sum((split_on_tokens(tok_list[1:], sub_text.strip()) + [tok] \
+                        for sub_text in split_text), [])[:-1]
+
+        added_tokens = list(self.added_tokens_encoder.keys())
+        tokenized_text = split_on_tokens(added_tokens, text)
+        return tokenized_text
+
+    def _tokenize(self, text, **kwargs):
+        """ Converts a string in a sequence of tokens (string), using the tokenizer.
+            Split in words for word-based vocabulary or sub-words for sub-word-based
+            vocabularies (BPE/SentencePieces/WordPieces).
+
+            Don't take care of added tokens.
+        """
         raise NotImplementedError
 
     def convert_tokens_to_ids(self, tokens):
+        """ Converts a single token or a sequence of tokens (str/unicode) in a integer id
+            (resp.) a sequence of ids, using the vocabulary.
+        """
+        if isinstance(tokens, str) or (six.PY2 and isinstance(tokens, unicode)):
+            return self.convert_token_to_id_with_added_voc(tokens)
+
+        ids = []
+        for token in tokens:
+            ids.append(self.convert_token_to_id_with_added_voc(token))
+        if len(ids) > self.max_len:
+            logger.warning("Token indices sequence length is longer than the specified maximum sequence length "
+                           "for this model ({} > {}). Running this sequence through the model will result in "
+                           "indexing errors".format(len(ids), self.max_len))
+        return ids
+
+
+    def convert_token_to_id_with_added_voc(self, token):
+        if token in self.added_tokens_encoder:
+            return self.added_tokens_encoder[token]
+        return self._convert_token_to_id(token)
+
+
+    def _convert_token_to_id(self, token):
         raise NotImplementedError
 
-    def convert_ids_to_tokens(self, ids):
+
+    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
+        """ Converts a single index or a sequence of indices (integers) in a token "
+            (resp.) a sequence of tokens (str/unicode), using the vocabulary and added tokens.
+
+            Args:
+                skip_special_tokens: Don't decode special tokens (self.all_special_tokens). Default: False
+        """
+        if isinstance(ids, int):
+            return self.convert_id_to_token(ids)
+        tokens = []
+        for index in ids:
+            if index in self.all_special_ids and skip_special_tokens:
+                continue
+            if index in self.added_tokens_decoder:
+                tokens.append(self.added_tokens_decoder[index])
+            else:
+                tokens.append(self._convert_id_to_token(index))
+        return tokens
+
+
+    def _convert_id_to_token(self, index):
         raise NotImplementedError
 
+
     def encode(self, text):
-        raise NotImplementedError
+        """ Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary.
+            same as self.convert_tokens_to_ids(self.tokenize(text)).
+        """
+        return self.convert_tokens_to_ids(self.tokenize(text))
 
-    def decode(self, token_ids, *input, **kwargs):
-        raise NotImplementedError
 
-    def save_vocabulary(self, vocab_path):
-        raise NotImplementedError
+    def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
+        """ Converts a sequence of ids (integer) in a string, using the tokenizer and vocabulary
+            with options to remove special tokens and clean up tokenization spaces.
+        """
+        filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
+        text = self._convert_ids_to_string(filtered_tokens)
+        if clean_up_tokenization_spaces:
+            text = clean_up_tokenization(text)
+        return text
+
+    def _convert_ids_to_string(self, tokens_ids):
+        """ Converts a sequence of ids (integer) in a string, using the tokenizer and vocabulary.
+            roughtly same as ' '.join(self.convert_ids_to_tokens(token_ids)).
+        """
+        return ' '.join(self.convert_ids_to_tokens(tokens_ids))
+
+    @property
+    def special_tokens_map(self):
+        """ A dictionary mapping special token class attribute (cls_token, unk_token...) to their
+            values ('<unk>', '<cls>'...)
+        """
+        set_attr = {}
+        for attr in self.SPECIAL_TOKENS_ATTRIBUTES:
+            attr_value = getattr(self, "_" + attr)
+            if attr_value:
+                set_attr[attr] = attr_value
+        return set_attr
+
+    @property
+    def all_special_tokens(self):
+        """ List all the special tokens ('<unk>', '<cls>'...) mapped to class attributes
+            (cls_token, unk_token...).
+        """
+        all_toks = []
+        set_attr = self.special_tokens_map
+        for attr_value in set_attr.values():
+            all_toks = all_toks + (attr_value if isinstance(attr_value, (list, tuple)) else [attr_value])
+        all_toks = list(set(all_toks))
+        return all_toks
+
+    @property
+    def all_special_ids(self):
+        """ List the vocabulary indices of the special tokens ('<unk>', '<cls>'...) mapped to
+            class attributes (cls_token, unk_token...).
+        """
+        all_toks = self.all_special_tokens
+        all_ids = list(self.convert_tokens_to_ids(t) for t in all_toks)
+        return all_ids
+
 
 
 def clean_up_tokenization(out_string):
diff --git a/pytorch_transformers/tokenization_xlm.py b/pytorch_transformers/tokenization_xlm.py
index e37f3888a3..8a11a84f8c 100644
--- a/pytorch_transformers/tokenization_xlm.py
+++ b/pytorch_transformers/tokenization_xlm.py
@@ -34,7 +34,6 @@ logger = logging.getLogger(__name__)
 VOCAB_FILES_NAMES = {
     'vocab_file': 'vocab.json',
     'merges_file': 'merges.txt',
-    'special_tokens_file': 'special_tokens.txt'
 }
 
 PRETRAINED_VOCAB_FILES_MAP = {
@@ -46,24 +45,12 @@ PRETRAINED_VOCAB_FILES_MAP = {
     {
         'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-merges.txt",
     },
-    'special_tokens_file':
-    {
-        'xlm-mlm-en-2048': None,
-    }
 }
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
     'xlm-mlm-en-2048': 512,
 }
 
-INDEX = {
-    "bos_index": 0,
-    "eos_index": 1,
-    "pad_index": 2,
-    "unk_index": 3,
-    "mask_index": 5
-}
-
 def get_pairs(word):
     """
     Return set of symbol pairs in a word.
@@ -103,7 +90,16 @@ class XLMTokenizer(PreTrainedTokenizer):
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
 
-    def __init__(self, vocab_file, merges_file, special_tokens_file=None, special_tokens=None, max_len=None):
+    def __init__(self, vocab_file, merges_file, unk_token="<unk>", bos_token="<s>",
+                 sep_token="</s>", pad_token="<pad>", cls_token="</s>",
+                 mask_token="<special1>", additional_special_tokens=["<special0>",
+                 "<special1>", "<special2>", "<special3>", "<special4>", "<special5>",
+                 "<special6>", "<special7>", "<special8>", "<special9>"], **kwargs):
+        super(XLMTokenizer, self).__init__(unk_token=unk_token, bos_token=bos_token,
+                                           sep_token=sep_token, pad_token=pad_token,
+                                           cls_token=cls_token, mask_token=mask_token,
+                                           additional_special_tokens=additional_special_tokens,
+                                           **kwargs)
         try:
             import ftfy
             import spacy
@@ -111,11 +107,9 @@ class XLMTokenizer(PreTrainedTokenizer):
             self.fix_text = ftfy.fix_text
         except ImportError:
             logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
-            self.nlp = BasicTokenizer(do_lower_case=True,
-                                      never_split=special_tokens if special_tokens is not None else [])
+            self.nlp = BasicTokenizer(do_lower_case=True)
             self.fix_text = None
 
-        self.max_len = max_len if max_len is not None else int(1e12)
         self.encoder = json.load(open(vocab_file, encoding="utf-8"))
         self.decoder = {v:k for k,v in self.encoder.items()}
         merges = open(merges_file, encoding='utf-8').read().split('\n')[:-1]
@@ -123,35 +117,9 @@ class XLMTokenizer(PreTrainedTokenizer):
         self.bpe_ranks = dict(zip(merges, range(len(merges))))
         self.cache = {}
 
-        all_special_tokens = []
-        if special_tokens_file is not None:
-            special_tokens_to_add = open(special_tokens_file, encoding='utf-8').read().split('\n')[:-1]
-            all_special_tokens.extend(special_tokens_to_add)
-        if special_tokens is not None and special_tokens:
-            all_special_tokens.extend(special_tokens)
-
-        self.special_tokens = {}
-        self.special_tokens_decoder = {}
-        self.set_special_tokens(all_special_tokens)
-
-    def __len__(self):
-        return len(self.encoder) + len(self.special_tokens)
-
-    def set_special_tokens(self, special_tokens):
-        """ Add a list of additional tokens to the encoder.
-            The additional tokens are indexed starting from the last index of the
-            current vocabulary in the order of the `special_tokens` list.
-        """
-        if not special_tokens:
-            self.special_tokens = {}
-            self.special_tokens_decoder = {}
-            return
-        self.special_tokens = dict((tok, len(self.encoder) + i) for i, tok in enumerate(special_tokens))
-        self.special_tokens_decoder = {v:k for k, v in self.special_tokens.items()}
-        if self.fix_text is None:
-            # Using BERT's BasicTokenizer: we can update the tokenizer
-            self.nlp.never_split = special_tokens
-        logger.info("Special tokens {}".format(self.special_tokens))
+    @property
+    def vocab_size(self):
+        return len(self.encoder)
 
     def bpe(self, token):
         word = tuple(token[:-1]) + (token[-1] + '</w>',)
@@ -196,7 +164,7 @@ class XLMTokenizer(PreTrainedTokenizer):
         self.cache[token] = word
         return word
 
-    def tokenize(self, text):
+    def _tokenize(self, text):
         """ Tokenize a string. """
         split_tokens = []
         if self.fix_text is None:
@@ -211,58 +179,26 @@ class XLMTokenizer(PreTrainedTokenizer):
                 split_tokens.extend([t for t in self.bpe(token.text.lower()).split(' ')])
         return split_tokens
 
-    def convert_tokens_to_ids(self, tokens):
-        """ Converts a sequence of tokens into ids using the vocab. """
-        ids = []
-        if isinstance(tokens, str) or (sys.version_info[0] == 2 and isinstance(tokens, unicode)):
-            if tokens in self.special_tokens:
-                return self.special_tokens[tokens]
-            else:
-                return self.encoder.get(tokens, 0)
-        for token in tokens:
-            if token in self.special_tokens:
-                ids.append(self.special_tokens[token])
-            else:
-                ids.append(self.encoder.get(token, 0))
-        if len(ids) > self.max_len:
-            logger.warning(
-                "Token indices sequence length is longer than the specified maximum "
-                " sequence length for this OpenAI GPT model ({} > {}). Running this"
-                " sequence through the model will result in indexing errors".format(len(ids), self.max_len)
-            )
-        return ids
+    def _convert_token_to_id(self, token):
+        """ Converts a token (str/unicode) in an id using the vocab. """
+        return self.encoder.get(token, self.encoder.get(self.unk_token))
 
-    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
-        """Converts a sequence of ids in BPE tokens using the vocab."""
-        tokens = []
-        for i in ids:
-            if i in self.special_tokens_decoder:
-                if not skip_special_tokens:
-                    tokens.append(self.special_tokens_decoder[i])
-            else:
-                tokens.append(self.decoder[i])
-        return tokens
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+        return self.decoder.get(index, self.unk_token)
 
-    def encode(self, text):
-        return self.convert_tokens_to_ids(self.tokenize(text))
-
-    def decode(self, ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
+    def _convert_ids_to_string(self, tokens_ids):
         """Converts a sequence of ids in a string."""
-        tokens = self.convert_ids_to_tokens(ids, skip_special_tokens=skip_special_tokens)
-        out_string = ''.join(tokens).replace('</w>', ' ').strip()
-        if clean_up_tokenization_spaces:
-            out_string = out_string.replace('<unk>', '')
-            out_string = clean_up_tokenization(out_string)
+        out_string = ''.join(tokens_ids).replace('</w>', ' ').strip()
         return out_string
 
-    def save_vocabulary(self, vocab_path):
+    def save_vocabulary(self, save_directory):
         """Save the tokenizer vocabulary and merge files to a directory."""
-        if not os.path.isdir(vocab_path):
-            logger.error("Vocabulary path ({}) should be a directory".format(vocab_path))
+        if not os.path.isdir(save_directory):
+            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
             return
-        vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['vocab_file'])
-        merge_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['merges_file'])
-        special_tokens_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['special_tokens_file'])
+        vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])
+        merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES['merges_file'])
 
         with open(vocab_file, 'w', encoding='utf-8') as f:
             f.write(json.dumps(self.encoder, ensure_ascii=False))
@@ -277,14 +213,4 @@ class XLMTokenizer(PreTrainedTokenizer):
                 writer.write(' '.join(bpe_tokens) + u'\n')
                 index += 1
 
-        index = len(self.encoder)
-        with open(special_tokens_file, 'w', encoding='utf-8') as writer:
-            for token, token_index in sorted(self.special_tokens.items(), key=lambda kv: kv[1]):
-                if index != token_index:
-                    logger.warning("Saving special tokens vocabulary to {}: BPE indices are not consecutive."
-                                   " Please check that the tokenizer is not corrupted!".format(special_tokens_file))
-                    index = token_index
-                writer.write(token + u'\n')
-                index += 1
-
-        return vocab_file, merge_file, special_tokens_file
+        return vocab_file, merge_file
diff --git a/pytorch_transformers/tokenization_xlnet.py b/pytorch_transformers/tokenization_xlnet.py
index a30e6db8da..942b532ec6 100644
--- a/pytorch_transformers/tokenization_xlnet.py
+++ b/pytorch_transformers/tokenization_xlnet.py
@@ -16,17 +16,13 @@
 from __future__ import (absolute_import, division, print_function,
                         unicode_literals)
 
-import json
 import logging
 import os
-import sys
 from shutil import copyfile
-from io import open
 
 import unicodedata
 import six
 
-from .file_utils import cached_path
 from .tokenization_utils import PreTrainedTokenizer, clean_up_tokenization
 
 logger = logging.getLogger(__name__)
@@ -44,8 +40,6 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
     'xlnet-large-cased': 512,
 }
 
-VOCAB_NAME = 'spiece.model'
-
 SPIECE_UNDERLINE = u'▁'
 
 # Segments (not really needed)
@@ -60,31 +54,26 @@ class XLNetTokenizer(PreTrainedTokenizer):
         SentencePiece based tokenizer. Peculiarities:
             - requires SentencePiece: https://github.com/google/sentencepiece
     """
-    # Tokens
-    special_symbols = {
-        "<unk>"  : 0,
-        "<s>"    : 1,
-        "</s>"   : 2,
-        "<cls>"  : 3,
-        "<sep>"  : 4,
-        "<pad>"  : 5,
-        "<mask>" : 6,
-        "<eod>"  : 7,
-        "<eop>"  : 8,
-    }
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
 
     def __init__(self, vocab_file, max_len=None,
-                 do_lower_case=False, remove_space=True, keep_accents=False):
+                 do_lower_case=False, remove_space=True, keep_accents=False,
+                 bos_token="<s>", eos_token="</s>", unk_token="<unk>", sep_token="<sep>",
+                 pad_token="<pad>", cls_token="<cls>", mask_token="<mask>",
+                 additional_special_tokens=["<eop>", "<eod>"], **kwargs):
+        super(XLNetTokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token,
+                                             unk_token=unk_token, sep_token=sep_token,
+                                             pad_token=pad_token, cls_token=cls_token,
+                                             mask_token=mask_token, additional_special_tokens=
+                                             additional_special_tokens, **kwargs)
         try:
             import sentencepiece as spm
         except ImportError:
             logger.warning("You need to install SentencePiece to use XLNetTokenizer: https://github.com/google/sentencepiece"
                            "pip install sentencepiece")
 
-        self.max_len = max_len if max_len is not None else int(1e12)
         self.do_lower_case = do_lower_case
         self.remove_space = remove_space
         self.keep_accents = keep_accents
@@ -94,46 +83,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
         self.sp_model.Load(vocab_file)
 
     @property
-    def UNK_TOKEN(self):
-        return "<unk>"
-
-    @property
-    def SEP_TOKEN(self):
-        return "<sep>"
-
-    @property
-    def PAD_TOKEN(self):
-        return "<pad>"
-
-    @property
-    def CLS_TOKEN(self):
-        return "<cls>"
-
-    @property
-    def MASK_TOKEN(self):
-        return "<mask>"
-
-    @property
-    def UNK_ID(self):
-        return self.special_symbols["<unk>"]
-
-    @property
-    def SEP_ID(self):
-        return self.special_symbols["<sep>"]
-
-    @property
-    def PAD_ID(self):
-        return self.special_symbols["<pad>"]
-
-    @property
-    def CLS_ID(self):
-        return self.special_symbols["<cls>"]
-
-    @property
-    def MASK_ID(self):
-        return self.special_symbols["<mask>"]
-
-    def __len__(self):
+    def vocab_size(self):
         return len(self.sp_model)
 
     def __getstate__(self):
@@ -169,7 +119,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
 
         return outputs
 
-    def tokenize(self, text, return_unicode=True, sample=False):
+    def _tokenize(self, text, return_unicode=True, sample=False):
         """ Tokenize a string.
             return_unicode is used only for py2
         """
@@ -208,56 +158,30 @@ class XLNetTokenizer(PreTrainedTokenizer):
 
         return new_pieces
 
-    def convert_tokens_to_ids(self, tokens, sample=False):
-        """ Converts a sequence of tokens into ids using the vocab. """
-        ids = []
-        if isinstance(tokens, str) or (sys.version_info[0] == 2 and isinstance(tokens, unicode)):
-            return self.sp_model.PieceToId(tokens)
-        for token in tokens:
-            ids.append(self.sp_model.PieceToId(token))
-        if len(ids) > self.max_len:
-            logger.warning(
-                "Token indices sequence length is longer than the specified maximum "
-                " sequence length for this XLNet model ({} > {}). Running this"
-                " sequence through the model will result in indexing errors".format(len(ids), self.max_len)
-            )
-        return ids
+    def _convert_token_to_id(self, token):
+        """ Converts a token (str/unicode) in an id using the vocab. """
+        return self.sp_model.PieceToId(token)
 
-    def convert_ids_to_tokens(self, ids, return_unicode=True):
-        """Converts a sequence of ids in tokens."""
-        tokens = []
-        for i in ids:
-            tokens.append(self.sp_model.IdToPiece(i))
+    def _convert_id_to_token(self, index, return_unicode=True):
+        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+        token = self.sp_model.IdToPiece(index)
+        if six.PY2 and return_unicode and isinstance(token, str):
+            token = token.decode('utf-8')
+        return token
 
-        if six.PY2 and return_unicode:
-            ret_pieces = []
-            for piece in tokens:
-                if isinstance(piece, str):
-                    piece = piece.decode('utf-8')
-                ret_pieces.append(piece)
-            tokens = ret_pieces
-        return tokens
-
-    def encode(self, text, sample=False):
-        return self.convert_tokens_to_ids(self.tokenize(text, sample=sample))
-
-    def decode(self, ids, clean_up_tokenization_spaces=True):
+    def _convert_ids_to_string(self, tokens_ids):
         """Converts a sequence of ids in a string."""
-        tokens = self.convert_ids_to_tokens(ids)
-        out_string = ''.join(tokens)
-        if clean_up_tokenization_spaces:
-            out_string = out_string.strip().replace('<unk>', '')
-            out_string = clean_up_tokenization(out_string)
+        out_string = ''.join(tokens_ids)
         return out_string
 
-    def save_vocabulary(self, vocab_path):
+    def save_vocabulary(self, save_directory):
         """ Save the sentencepiece vocabulary (copy original file) and special tokens file
             to a directory.
         """
-        if not os.path.isdir(vocab_path):
-            logger.error("Vocabulary path ({}) should be a directory".format(vocab_path))
+        if not os.path.isdir(save_directory):
+            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
             return
-        out_vocab_file = os.path.join(vocab_path, VOCAB_NAME)
+        out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])
 
         copyfile(self.vocab_file, out_vocab_file)
 

From c079d7ddff7eeb653842f33f1f3fecd8b210e616 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 9 Jul 2019 10:40:59 +0200
Subject: [PATCH 12/17] fix python 2 tests

---
 .../tests/tokenization_bert_test.py           | 23 ++++++++-----------
 .../tests/tokenization_gpt2_test.py           |  5 ++--
 .../tests/tokenization_openai_test.py         |  5 ++--
 .../tests/tokenization_tests_commons.py       | 17 ++++++++++----
 .../tests/tokenization_transfo_xl_test.py     |  5 ++--
 .../tests/tokenization_xlm_test.py            |  5 ++--
 .../tests/tokenization_xlnet_test.py          |  7 +++---
 pytorch_transformers/tokenization_utils.py    |  9 +++++---
 8 files changed, 39 insertions(+), 37 deletions(-)

diff --git a/pytorch_transformers/tests/tokenization_bert_test.py b/pytorch_transformers/tests/tokenization_bert_test.py
index 220bf45346..dbbe9ac5ea 100644
--- a/pytorch_transformers/tests/tokenization_bert_test.py
+++ b/pytorch_transformers/tests/tokenization_bert_test.py
@@ -24,7 +24,7 @@ from pytorch_transformers.tokenization_bert import (BasicTokenizer,
                                                     _is_control, _is_punctuation,
                                                     _is_whitespace, VOCAB_FILES_NAMES)
 
-from .tokenization_tests_commons import create_and_check_tokenizer_commons
+from .tokenization_tests_commons import create_and_check_tokenizer_commons, TemporaryDirectory
 
 class TokenizationTest(unittest.TestCase):
 
@@ -33,21 +33,18 @@ class TokenizationTest(unittest.TestCase):
             "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
             "##ing", ",", "low", "lowest",
         ]
-        vocab_directory = "/tmp/"
-        vocab_file = os.path.join(vocab_directory, VOCAB_FILES_NAMES['vocab_file'])
-        with open(vocab_file, "w", encoding='utf-8') as vocab_writer:
-            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
-            vocab_file = vocab_writer.name
+        with TemporaryDirectory() as tmpdirname:
+            vocab_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
+            with open(vocab_file, "w", encoding='utf-8') as vocab_writer:
+                vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
 
-        create_and_check_tokenizer_commons(self, BertTokenizer, pretrained_model_name_or_path=vocab_directory)
+            create_and_check_tokenizer_commons(self, BertTokenizer, tmpdirname)
 
-        tokenizer = BertTokenizer(vocab_file)
+            tokenizer = BertTokenizer(vocab_file)
 
-        tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
-        self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
-        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
-
-        os.remove(vocab_file)
+            tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
+            self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
+            self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
 
     def test_chinese(self):
         tokenizer = BasicTokenizer()
diff --git a/pytorch_transformers/tests/tokenization_gpt2_test.py b/pytorch_transformers/tests/tokenization_gpt2_test.py
index 30959ceed1..8ae8896187 100644
--- a/pytorch_transformers/tests/tokenization_gpt2_test.py
+++ b/pytorch_transformers/tests/tokenization_gpt2_test.py
@@ -17,11 +17,10 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 import os
 import unittest
 import json
-import tempfile
 
 from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer, VOCAB_FILES_NAMES
 
-from .tokenization_tests_commons import create_and_check_tokenizer_commons
+from .tokenization_tests_commons import create_and_check_tokenizer_commons, TemporaryDirectory
 
 class GPT2TokenizationTest(unittest.TestCase):
 
@@ -34,7 +33,7 @@ class GPT2TokenizationTest(unittest.TestCase):
         merges = ["#version: 0.2", "l o", "lo w", "e r", ""]
         special_tokens_map = {"unk_token": "<unk>"}
 
-        with tempfile.TemporaryDirectory() as tmpdirname:
+        with TemporaryDirectory() as tmpdirname:
             vocab_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
             merges_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['merges_file'])
             with open(vocab_file, "w") as fp:
diff --git a/pytorch_transformers/tests/tokenization_openai_test.py b/pytorch_transformers/tests/tokenization_openai_test.py
index 22f7d70017..f5c99877d7 100644
--- a/pytorch_transformers/tests/tokenization_openai_test.py
+++ b/pytorch_transformers/tests/tokenization_openai_test.py
@@ -17,11 +17,10 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 import os
 import unittest
 import json
-import tempfile
 
 from pytorch_transformers.tokenization_openai import OpenAIGPTTokenizer, VOCAB_FILES_NAMES
 
-from.tokenization_tests_commons import create_and_check_tokenizer_commons
+from .tokenization_tests_commons import create_and_check_tokenizer_commons, TemporaryDirectory
 
 
 class OpenAIGPTTokenizationTest(unittest.TestCase):
@@ -35,7 +34,7 @@ class OpenAIGPTTokenizationTest(unittest.TestCase):
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
         merges = ["#version: 0.2", "l o", "lo w", "e r</w>", ""]
 
-        with tempfile.TemporaryDirectory() as tmpdirname:
+        with TemporaryDirectory() as tmpdirname:
             vocab_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
             merges_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['merges_file'])
             with open(vocab_file, "w") as fp:
diff --git a/pytorch_transformers/tests/tokenization_tests_commons.py b/pytorch_transformers/tests/tokenization_tests_commons.py
index 07f962bcab..4e5fe83706 100644
--- a/pytorch_transformers/tests/tokenization_tests_commons.py
+++ b/pytorch_transformers/tests/tokenization_tests_commons.py
@@ -14,18 +14,25 @@
 # limitations under the License.
 from __future__ import absolute_import, division, print_function, unicode_literals
 
-import os
 import sys
 from io import open
 import tempfile
-
-if sys.version_info[0] == 3:
-    unicode = str
+import shutil
 
 if sys.version_info[0] == 2:
     import cPickle as pickle
+
+    class TemporaryDirectory(object):
+        """Context manager for tempfile.mkdtemp() so it's usable with "with" statement."""
+        def __enter__(self):
+            self.name = tempfile.mkdtemp()
+            return self.name
+        def __exit__(self, exc_type, exc_value, traceback):
+            shutil.rmtree(self.name)
 else:
     import pickle
+    TemporaryDirectory = tempfile.TemporaryDirectory
+    unicode = str
 
 
 def create_and_check_save_and_load_tokenizer(tester, tokenizer_class, *inputs, **kwargs):
@@ -33,7 +40,7 @@ def create_and_check_save_and_load_tokenizer(tester, tokenizer_class, *inputs, *
 
     before_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running")
 
-    with tempfile.TemporaryDirectory() as tmpdirname:
+    with TemporaryDirectory() as tmpdirname:
         tokenizer.save_pretrained(tmpdirname)
         tokenizer = tokenizer.from_pretrained(tmpdirname)
 
diff --git a/pytorch_transformers/tests/tokenization_transfo_xl_test.py b/pytorch_transformers/tests/tokenization_transfo_xl_test.py
index a4ddd357b9..135f48b0ef 100644
--- a/pytorch_transformers/tests/tokenization_transfo_xl_test.py
+++ b/pytorch_transformers/tests/tokenization_transfo_xl_test.py
@@ -17,11 +17,10 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 import os
 import unittest
 from io import open
-import tempfile
 
 from pytorch_transformers.tokenization_transfo_xl import TransfoXLTokenizer, VOCAB_FILES_NAMES
 
-from.tokenization_tests_commons import create_and_check_tokenizer_commons
+from.tokenization_tests_commons import create_and_check_tokenizer_commons, TemporaryDirectory
 
 class TransfoXLTokenizationTest(unittest.TestCase):
 
@@ -30,7 +29,7 @@ class TransfoXLTokenizationTest(unittest.TestCase):
             "<unk>", "[CLS]", "[SEP]", "want", "unwanted", "wa", "un",
             "running", ",", "low", "l",
         ]
-        with tempfile.TemporaryDirectory() as tmpdirname:
+        with TemporaryDirectory() as tmpdirname:
             vocab_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
             with open(vocab_file, "w", encoding='utf-8') as vocab_writer:
                 vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
diff --git a/pytorch_transformers/tests/tokenization_xlm_test.py b/pytorch_transformers/tests/tokenization_xlm_test.py
index b543ed23f8..827ec1606e 100644
--- a/pytorch_transformers/tests/tokenization_xlm_test.py
+++ b/pytorch_transformers/tests/tokenization_xlm_test.py
@@ -17,11 +17,10 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 import os
 import unittest
 import json
-import tempfile
 
 from pytorch_transformers.tokenization_xlm import XLMTokenizer, VOCAB_FILES_NAMES
 
-from .tokenization_tests_commons import create_and_check_tokenizer_commons
+from .tokenization_tests_commons import create_and_check_tokenizer_commons, TemporaryDirectory
 
 class XLMTokenizationTest(unittest.TestCase):
 
@@ -34,7 +33,7 @@ class XLMTokenizationTest(unittest.TestCase):
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
         merges = ["l o 123", "lo w 1456", "e r</w> 1789", ""]
 
-        with tempfile.TemporaryDirectory() as tmpdirname:
+        with TemporaryDirectory() as tmpdirname:
             vocab_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
             merges_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['merges_file'])
             with open(vocab_file, "w") as fp:
diff --git a/pytorch_transformers/tests/tokenization_xlnet_test.py b/pytorch_transformers/tests/tokenization_xlnet_test.py
index 8fc98209ba..e50fe9243d 100644
--- a/pytorch_transformers/tests/tokenization_xlnet_test.py
+++ b/pytorch_transformers/tests/tokenization_xlnet_test.py
@@ -16,11 +16,10 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 
 import os
 import unittest
-import tempfile
 
-from pytorch_transformers.tokenization_xlnet import (XLNetTokenizer, SPIECE_UNDERLINE, VOCAB_FILES_NAMES)
+from pytorch_transformers.tokenization_xlnet import (XLNetTokenizer, SPIECE_UNDERLINE)
 
-from.tokenization_tests_commons import create_and_check_tokenizer_commons
+from .tokenization_tests_commons import create_and_check_tokenizer_commons, TemporaryDirectory
 
 SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                     'fixtures/test_sentencepiece.model')
@@ -30,7 +29,7 @@ class XLNetTokenizationTest(unittest.TestCase):
     def test_full_tokenizer(self):
         tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True)
 
-        with tempfile.TemporaryDirectory() as tmpdirname:
+        with TemporaryDirectory() as tmpdirname:
             tokenizer.save_pretrained(tmpdirname)
 
             create_and_check_tokenizer_commons(self, XLNetTokenizer, tmpdirname)
diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py
index b191dd22e6..60081893c8 100644
--- a/pytorch_transformers/tokenization_utils.py
+++ b/pytorch_transformers/tokenization_utils.py
@@ -231,8 +231,7 @@ class PreTrainedTokenizer(object):
 
         # Add supplementary tokens.
         if added_tokens_file is not None:
-            added_tokens = json.load(open(added_tokens_file, encoding="utf-8"))
-            added_tok_encoder = dict((tok, len(tokenizer) + i) for i, tok in enumerate(added_tokens))
+            added_tok_encoder = json.load(open(added_tokens_file, encoding="utf-8"))
             added_tok_decoder = {v:k for k, v in added_tok_encoder.items()}
             tokenizer.added_tokens_encoder.update(added_tok_encoder)
             tokenizer.added_tokens_decoder.update(added_tok_decoder)
@@ -256,7 +255,11 @@ class PreTrainedTokenizer(object):
             f.write(json.dumps(self.special_tokens_map, ensure_ascii=False))
 
         with open(added_tokens_file, 'w', encoding='utf-8') as f:
-            f.write(json.dumps(self.added_tokens_decoder, ensure_ascii=False))
+            if self.added_tokens_encoder:
+                out_str = json.dumps(self.added_tokens_decoder, ensure_ascii=False)
+            else:
+                out_str = u"{}"
+            f.write(out_str)
 
         vocab_files = self.save_vocabulary(save_directory)
 

From d5481cbe1bf9f02036cd054f1e09f93266b56d07 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 9 Jul 2019 15:29:42 +0200
Subject: [PATCH 13/17] adding tests to examples - updating summary module -
 coverage update

---
 .coveragerc                                 |   3 +
 .gitignore                                  |   1 +
 examples/run_glue.py                        | 124 ++++++++++----------
 examples/test_examples.py                   |  25 ++--
 examples/tests_samples/.gitignore           |   5 +
 examples/tests_samples/MRPC/dev.tsv         |   7 ++
 examples/tests_samples/MRPC/train.tsv       |   7 ++
 pytorch_transformers/modeling_bert.py       |   1 -
 pytorch_transformers/modeling_gpt2.py       |   7 +-
 pytorch_transformers/modeling_openai.py     |   9 +-
 pytorch_transformers/modeling_transfo_xl.py |   1 -
 pytorch_transformers/modeling_utils.py      |  25 ++--
 pytorch_transformers/modeling_xlm.py        |  19 ++-
 pytorch_transformers/modeling_xlnet.py      |   7 +-
 pytorch_transformers/tokenization_bert.py   |   2 -
 pytorch_transformers/tokenization_utils.py  |   6 +-
 pytorch_transformers/tokenization_xlm.py    |   6 +-
 17 files changed, 139 insertions(+), 116 deletions(-)
 create mode 100644 examples/tests_samples/.gitignore
 create mode 100644 examples/tests_samples/MRPC/dev.tsv
 create mode 100644 examples/tests_samples/MRPC/train.tsv

diff --git a/.coveragerc b/.coveragerc
index 9b8c40ecf1..e0d5674aa0 100644
--- a/.coveragerc
+++ b/.coveragerc
@@ -1,5 +1,8 @@
 [run]
 source=pytorch_transformers
+omit =
+    # skip convertion scripts from testing for now
+    */convert_*
 [report]
 exclude_lines =
     pragma: no cover
diff --git a/.gitignore b/.gitignore
index 05129fc402..6bbe32df6c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -126,4 +126,5 @@ models
 proc_data
 
 # examples
+runs
 examples/runs
\ No newline at end of file
diff --git a/examples/run_glue.py b/examples/run_glue.py
index 59583ed712..62d655ecc9 100644
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -60,25 +60,14 @@ TOKENIZER_CLASSES = {
     'xlm': XLMTokenizer,
 }
 
-def train(args, train_features, model):
+def train(args, train_dataset, model):
     """ Train the model """
     if args.local_rank in [-1, 0]:
         tb_writer = SummaryWriter()
 
-    # Convert in tensors and build dataloader
-    all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
-    all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
-    all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
-    if args.output_mode == "classification":
-        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
-    elif args.output_mode == "regression":
-        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)
-
     args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
-
-    train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
-    train_sampler = RandomSampler(train_data) if args.local_rank == -1 else DistributedSampler(train_data)
-    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
+    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
+    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
 
     num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
 
@@ -109,19 +98,24 @@ def train(args, train_features, model):
 
     # Train!
     logger.info("***** Running training *****")
-    logger.info("  Num examples = %d", len(train_features))
+    logger.info("  Num examples = %d", len(train_dataset))
+    logger.info("  Num Epochs = %d", args.num_train_epochs)
     logger.info("  Batch size = %d", args.train_batch_size)
-    logger.info("  Num steps = %d", num_train_optimization_steps)
+    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
+    logger.info("  Total optimization steps = %d", num_train_optimization_steps)
 
     global_step = 0
     tr_loss = 0
     model.train()
+    optimizer.zero_grad()
     for _ in trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]):
         for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
             batch = tuple(t.to(args.device) for t in batch)
-            input_ids, input_mask, segment_ids, label_ids = batch
-
-            ouputs = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids)
+            inputs = {'input_ids':      batch[0],
+                      'attention_mask': batch[1],
+                      'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,
+                      'labels':         batch[3]}
+            ouputs = model(**inputs)
             loss = ouputs[0]
 
             if args.n_gpu > 1:
@@ -150,30 +144,20 @@ def train(args, train_features, model):
     return global_step, tr_loss / global_step
 
 
-def evalutate(args, eval_task, eval_output_dir, eval_features, model):
+def evalutate(args, eval_task, eval_output_dir, dataset, model):
     """ Evaluate the model """
     if os.path.exists(eval_output_dir) and os.listdir(eval_output_dir) and args.do_train and not args.overwrite_output_dir:
         raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(eval_output_dir))
     if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
         os.makedirs(eval_output_dir)
 
-    # Convert in tensors and build dataloader
-    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
-    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
-    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
-    if args.output_mode == "classification":
-        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
-    elif args.output_mode == "regression":
-        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float)
-
-    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
     # Note that DistributedSampler samples randomly
-    eval_sampler = SequentialSampler(eval_data) if args.local_rank == -1 else DistributedSampler(eval_data)
-    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
+    eval_sampler = SequentialSampler(dataset) if args.local_rank == -1 else DistributedSampler(dataset)
+    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
 
     # Eval!
     logger.info("***** Running evaluation *****")
-    logger.info("  Num examples = %d", len(eval_features))
+    logger.info("  Num examples = %d", len(dataset))
     logger.info("  Batch size = %d", args.eval_batch_size)
     model.eval()
     eval_loss = 0
@@ -214,36 +198,47 @@ def evalutate(args, eval_task, eval_output_dir, eval_features, model):
             logger.info("  %s = %s", key, str(result[key]))
             writer.write("%s = %s\n" % (key, str(result[key])))
 
+    return result
 
-def load_and_cache_examples(args, task, tokenizer, eval=False):
+
+def load_and_cache_examples(args, task, tokenizer, evaluate=False):
     processor = processors[task]()
     output_mode = output_modes[task]
-    label_list = processor.get_labels()
-
-    # Load and cache data
-    processor = processors[task]()
-    examples = processor.get_dev_examples(args.data_dir)
-    cached_features_file = os.path.join(args.data_dir, '{}_{}_{}_{}'.format(
-        'dev' if eval else 'train',
+    # Load data features from cache or dataset file
+    cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}'.format(
+        'dev' if evaluate else 'train',
         list(filter(None, args.model_name.split('/'))).pop(),
         str(args.max_seq_length),
         str(task)))
-
     if os.path.exists(cached_features_file):
         logger.info("Loading features from cached file %s", cached_features_file)
         features = torch.load(cached_features_file)
     else:
-        features = convert_examples_to_features(examples, label_list, args.max_seq_length, tokenizer, output_mode)
+        logger.info("Creating features from dataset file at %s", args.data_dir)
+        label_list = processor.get_labels()
+        examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
         features = convert_examples_to_features(examples, label_list, args.max_seq_length, tokenizer, output_mode,
-            cls_token_at_end=bool(args.model_type not in ['bert', 'xlm']),
+            cls_token_at_end=bool(args.model_type in ['xlnet']),            # xlnet has a cls token at the end
             cls_token=tokenizer.cls_token,
-            sep_token=tokenizer.sep_token, cls_token_segment_id=2,
-            pad_on_left=True, pad_token_segment_id=4)
-        if args.local_rank == -1 or torch.distributed.get_rank() == 0:
+            sep_token=tokenizer.sep_token,
+            cls_token_segment_id=2 if args.model_type in ['xlnet'] else 1,
+            pad_on_left=bool(args.model_type in ['xlnet']),                 # pad on the left for xlnet
+            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0)
+        if args.local_rank in [-1, 0]:
             logger.info("Saving features into cached file %s", cached_features_file)
             torch.save(features, cached_features_file)
 
-    return features
+    # Convert to Tensors and build dataset
+    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
+    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
+    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
+    if output_mode == "classification":
+        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
+    elif output_mode == "regression":
+        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float)
+
+    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
+    return dataset
 
 
 def main():
@@ -350,10 +345,10 @@ def main():
         torch.distributed.barrier()
 
     args.model_type = args.model_name.lower().split('-')[0]
-    args.tokenizer_class = TOKENIZER_CLASSES[args.model_type]
-    args.model_class = MODEL_CLASSES[args.model_type]
-    tokenizer = args.tokenizer_class.from_pretrained(args.model_name, do_lower_case=args.do_lower_case)
-    model = args.model_class.from_pretrained(args.model_name, num_labels=num_labels)
+    tokenizer_class = TOKENIZER_CLASSES[args.model_type]
+    model_class = MODEL_CLASSES[args.model_type]
+    tokenizer = tokenizer_class.from_pretrained(args.model_name, do_lower_case=args.do_lower_case)
+    model = model_class.from_pretrained(args.model_name, num_labels=num_labels)
 
     if args.local_rank == 0:
         torch.distributed.barrier()
@@ -372,23 +367,30 @@ def main():
 
     # Training
     if args.do_train:
-        train_features = load_and_cache_examples(args, args.task_name, tokenizer, eval=False)
-        global_step, tr_loss = train(args, train_features, model)
+        train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
+        global_step, tr_loss = train(args, train_dataset, model)
         logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
 
 
     # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
     if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
-        # Save a trained model, configuration and tokenizer
+        # Create output directory if needed
+        if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
+            raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
+        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
+            os.makedirs(args.output_dir)
+
+        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
+        # They can then be reloaded using `from_pretrained()`
         model.save_pretrained(args.output_dir)
-        tokenizer.save_vocabulary(args.output_dir)
+        tokenizer.save_pretrained(args.output_dir)
 
         # Good practice: save your training arguments together with the trained model
         torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
 
         # Load a trained model and vocabulary that you have fine-tuned
-        model = args.model_class.from_pretrained(args.output_dir)
-        tokenizer = args.tokenizer_class.from_pretrained(args.output_dir)
+        model = model_class.from_pretrained(args.output_dir)
+        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
         model.to(args.device)
 
     # Evaluation
@@ -398,9 +400,11 @@ def main():
         eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,)
 
         for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
-            eval_features = load_and_cache_examples(args, eval_task, tokenizer, eval=True)
+            eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)
 
-            evalutate(args, eval_task, eval_output_dir, eval_features, model)
+            result = evalutate(args, eval_task, eval_output_dir, eval_dataset, model)
+
+        return result
 
 
 if __name__ == "__main__":
diff --git a/examples/test_examples.py b/examples/test_examples.py
index fada43dae2..8284858a12 100644
--- a/examples/test_examples.py
+++ b/examples/test_examples.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 import sys
 import unittest
 import argparse
+import logging
 
 try:
     # python 3.4+ can use builtin unittest.mock instead of mock package
@@ -26,7 +27,11 @@ try:
 except ImportError:
     from mock import patch
 
-import run_bert_squad as rbs
+import run_glue
+
+logging.basicConfig(level=logging.DEBUG)
+
+logger = logging.getLogger()
 
 def get_setup_file():
     parser = argparse.ArgumentParser()
@@ -36,12 +41,18 @@ def get_setup_file():
 
 class ExamplesTests(unittest.TestCase):
 
-    def test_run_squad(self):
-        testargs = ["prog", "-f", "/home/test/setup.py"]
-        with patch.object(sys, 'argv', testargs):
-            setup = get_setup_file()
-            assert setup == "/home/test/setup.py"
-            # rbs.main()
+    def test_run_glue(self):
+        stream_handler = logging.StreamHandler(sys.stdout)
+        logger.addHandler(stream_handler)
+
+        testargs = ["run_glue.py", "--data_dir=./examples/tests_samples/MRPC/",
+                    "--task_name=mrpc", "--do_train", "--do_eval", "--output_dir=./examples/tests_samples/temp_dir",
+                    "--train_batch_size=4", "--eval_batch_size=2", "--num_train_epochs=2.0", "--overwrite_output_dir"]
+        model_name = "--model_name=xlnet-large-cased"
+        with patch.object(sys, 'argv', testargs + [model_name]):
+            result = run_glue.main()
+            for value in result.values():
+                self.assertGreaterEqual(value, 0.75)
 
 
 if __name__ == "__main__":
diff --git a/examples/tests_samples/.gitignore b/examples/tests_samples/.gitignore
new file mode 100644
index 0000000000..1ac7520522
--- /dev/null
+++ b/examples/tests_samples/.gitignore
@@ -0,0 +1,5 @@
+*.*
+cache*
+temp*
+!*.tsv
+!.gitignore
\ No newline at end of file
diff --git a/examples/tests_samples/MRPC/dev.tsv b/examples/tests_samples/MRPC/dev.tsv
new file mode 100644
index 0000000000..5b814856c6
--- /dev/null
+++ b/examples/tests_samples/MRPC/dev.tsv
@@ -0,0 +1,7 @@
+﻿Quality	#1 ID	#2 ID	#1 String	#2 String
+1	1355540	1355592	He said the foodservice pie business doesn 't fit the company 's long-term growth strategy .	" The foodservice pie business does not fit our long-term growth strategy .
+0	2029631	2029565	Magnarelli said Racicot hated the Iraqi regime and looked forward to using his long years of training in the war .	His wife said he was " 100 percent behind George Bush " and looked forward to using his years of training in the war .
+0	487993	487952	The dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat .	The dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent .
+1	1989515	1989458	The AFL-CIO is waiting until October to decide if it will endorse a candidate .	The AFL-CIO announced Wednesday that it will decide in October whether to endorse a candidate before the primaries .
+0	1783137	1782659	No dates have been set for the civil or the criminal trial .	No dates have been set for the criminal or civil cases , but Shanley has pleaded not guilty .
+1	3039165	3039036	Wal-Mart said it would check all of its million-plus domestic workers to ensure they were legally employed .	It has also said it would review all of its domestic employees more than 1 million to ensure they have legal status .
diff --git a/examples/tests_samples/MRPC/train.tsv b/examples/tests_samples/MRPC/train.tsv
new file mode 100644
index 0000000000..5b814856c6
--- /dev/null
+++ b/examples/tests_samples/MRPC/train.tsv
@@ -0,0 +1,7 @@
+﻿Quality	#1 ID	#2 ID	#1 String	#2 String
+1	1355540	1355592	He said the foodservice pie business doesn 't fit the company 's long-term growth strategy .	" The foodservice pie business does not fit our long-term growth strategy .
+0	2029631	2029565	Magnarelli said Racicot hated the Iraqi regime and looked forward to using his long years of training in the war .	His wife said he was " 100 percent behind George Bush " and looked forward to using his years of training in the war .
+0	487993	487952	The dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat .	The dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent .
+1	1989515	1989458	The AFL-CIO is waiting until October to decide if it will endorse a candidate .	The AFL-CIO announced Wednesday that it will decide in October whether to endorse a candidate before the primaries .
+0	1783137	1782659	No dates have been set for the civil or the criminal trial .	No dates have been set for the criminal or civil cases , but Shanley has pleaded not guilty .
+1	3039165	3039036	Wal-Mart said it would check all of its million-plus domestic workers to ensure they were legally employed .	It has also said it would review all of its domestic employees more than 1 million to ensure they have legal status .
diff --git a/pytorch_transformers/modeling_bert.py b/pytorch_transformers/modeling_bert.py
index 0dd72b2969..ea9502d2ef 100644
--- a/pytorch_transformers/modeling_bert.py
+++ b/pytorch_transformers/modeling_bert.py
@@ -28,7 +28,6 @@ import torch
 from torch import nn
 from torch.nn import CrossEntropyLoss, MSELoss
 
-from .file_utils import cached_path
 from .modeling_utils import WEIGHTS_NAME, CONFIG_NAME, PretrainedConfig, PreTrainedModel, prune_linear_layer
 
 logger = logging.getLogger(__name__)
diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py
index 9340ce8489..7fefbefeae 100644
--- a/pytorch_transformers/modeling_gpt2.py
+++ b/pytorch_transformers/modeling_gpt2.py
@@ -30,7 +30,6 @@ import torch.nn as nn
 from torch.nn import CrossEntropyLoss
 from torch.nn.parameter import Parameter
 
-from .file_utils import cached_path
 from .modeling_utils import (Conv1D, CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig,
                           PreTrainedModel, prune_conv1d_layer, SequenceSummary)
 from .modeling_bert import BertLayerNorm as LayerNorm
@@ -122,9 +121,8 @@ class GPT2Config(PretrainedConfig):
         predict_special_tokens=True,
         summary_type='token_ids',
         summary_use_proj=True,
-        summary_num_classes=1,
         summary_activation=None,
-        summary_dropout=0.1,
+        summary_first_dropout=0.1,
         **kwargs
     ):
         """Constructs GPT2Config.
@@ -172,9 +170,8 @@ class GPT2Config(PretrainedConfig):
             self.predict_special_tokens = predict_special_tokens
             self.summary_type = summary_type
             self.summary_use_proj = summary_use_proj
-            self.summary_num_classes = summary_num_classes
             self.summary_activation = summary_activation
-            self.summary_dropout = summary_dropout
+            self.summary_first_dropout = summary_first_dropout
         else:
             raise ValueError(
                 "First argument must be either a vocabulary size (int)"
diff --git a/pytorch_transformers/modeling_openai.py b/pytorch_transformers/modeling_openai.py
index 4a3ff732f6..c99df42035 100644
--- a/pytorch_transformers/modeling_openai.py
+++ b/pytorch_transformers/modeling_openai.py
@@ -30,9 +30,8 @@ import torch.nn as nn
 from torch.nn import CrossEntropyLoss
 from torch.nn.parameter import Parameter
 
-from .file_utils import cached_path
 from .modeling_utils import (Conv1D, CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig,
-                          PreTrainedModel, prune_conv1d_layer, SequenceSummary)
+                             PreTrainedModel, prune_conv1d_layer, SequenceSummary)
 from .modeling_bert import BertLayerNorm as LayerNorm
 
 logger = logging.getLogger(__name__)
@@ -150,9 +149,8 @@ class OpenAIGPTConfig(PretrainedConfig):
         predict_special_tokens=True,
         summary_type='token_ids',
         summary_use_proj=True,
-        summary_num_classes=1,
         summary_activation=None,
-        summary_dropout=0.1,
+        summary_first_dropout=0.1,
         **kwargs
     ):
         """Constructs OpenAIGPTConfig.
@@ -203,9 +201,8 @@ class OpenAIGPTConfig(PretrainedConfig):
             self.predict_special_tokens = predict_special_tokens
             self.summary_type = summary_type
             self.summary_use_proj = summary_use_proj
-            self.summary_num_classes = summary_num_classes
             self.summary_activation = summary_activation
-            self.summary_dropout = summary_dropout
+            self.summary_first_dropout = summary_first_dropout
         else:
             raise ValueError(
                 "First argument must be either a vocabulary size (int)"
diff --git a/pytorch_transformers/modeling_transfo_xl.py b/pytorch_transformers/modeling_transfo_xl.py
index 35a1b635f9..0c5d127d62 100644
--- a/pytorch_transformers/modeling_transfo_xl.py
+++ b/pytorch_transformers/modeling_transfo_xl.py
@@ -36,7 +36,6 @@ from torch.nn.parameter import Parameter
 
 from .modeling_bert import BertLayerNorm as LayerNorm
 from .modeling_transfo_xl_utilities import ProjectedAdaptiveLogSoftmax, sample_logits
-from .file_utils import cached_path
 from .modeling_utils import CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTrainedModel
 
 logger = logging.getLogger(__name__)
diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py
index b9be1a3813..36b506da3b 100644
--- a/pytorch_transformers/modeling_utils.py
+++ b/pytorch_transformers/modeling_utils.py
@@ -25,7 +25,7 @@ from io import open
 
 import torch
 from torch import nn
-from torch.nn import CrossEntropyLoss, MSELoss, functional as F
+from torch.nn import CrossEntropyLoss, functional as F
 
 from .file_utils import cached_path
 
@@ -514,10 +514,10 @@ class SequenceSummary(nn.Module):
                 - 'token_ids' => supply a Tensor of classification token indices (GPT/GPT-2)
                 - 'attn' => Not implemented now, use multi-head attention
             summary_use_proj: Add a projection after the vector extraction
-            summary_num_classes: If > 0: the projection outputs to n classes (otherwise to hidden_size)
-            summary_activation:
-                'tanh' => add a tanh activation to the output
-                    None => no activation
+            summary_proj_to_labels: If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
+            summary_activation: 'tanh' => add a tanh activation to the output, Other => no activation. Default 
+            summary_first_dropout: Add a dropout before the projection and activation
+            summary_last_dropout: Add a dropout after the projection and activation
     """
     def __init__(self, config):
         super(SequenceSummary, self).__init__()
@@ -531,8 +531,8 @@ class SequenceSummary(nn.Module):
 
         self.summary = nn.Identity()
         if hasattr(config, 'summary_use_proj') and config.summary_use_proj:
-            if hasattr(config, 'summary_num_classes') and config.summary_num_classes > 0:
-                num_classes = config.summary_num_classes
+            if hasattr(config, 'summary_proj_to_labels') and config.summary_proj_to_labels and config.num_labels > 0:
+                num_classes = config.num_labels
             else:
                 num_classes = config.hidden_size
             self.summary = nn.Linear(config.hidden_size, num_classes)
@@ -541,7 +541,13 @@ class SequenceSummary(nn.Module):
         if hasattr(config, 'summary_activation') and config.summary_activation == 'tanh':
             self.activation = nn.Tanh()
 
-        self.dropout = nn.Dropout(config.summary_dropout)
+        self.first_dropout = nn.Identity()
+        if hasattr(config, 'summary_first_dropout') and config.summary_first_dropout > 0:
+            self.first_dropout = nn.Dropout(config.summary_first_dropout)
+
+        self.last_dropout = nn.Identity()
+        if hasattr(config, 'summary_last_dropout') and config.summary_last_dropout > 0:
+            self.last_dropout = nn.Dropout(config.summary_last_dropout)
 
     def forward(self, hidden_states, token_ids=None):
         """ hidden_states: float Tensor in shape [bsz, seq_len, hidden_size], the hidden-states of the last layer.
@@ -567,9 +573,10 @@ class SequenceSummary(nn.Module):
         elif self.summary_type == 'attn':
             raise NotImplementedError
 
+        output = self.first_dropout(output)
         output = self.summary(output)
         output = self.activation(output)
-        output = self.dropout(output)
+        output = self.last_dropout(output)
 
         return output
 
diff --git a/pytorch_transformers/modeling_xlm.py b/pytorch_transformers/modeling_xlm.py
index c7ea294dbd..65db9e7159 100644
--- a/pytorch_transformers/modeling_xlm.py
+++ b/pytorch_transformers/modeling_xlm.py
@@ -14,18 +14,14 @@
 # limitations under the License.
 """ PyTorch XLM model.
 """
-from __future__ import (absolute_import, division, print_function,
-                        unicode_literals)
 from __future__ import absolute_import, division, print_function, unicode_literals
 
 import json
 import logging
 import math
-import os
 import sys
 from io import open
 
-import math
 import itertools
 import numpy as np
 
@@ -34,9 +30,8 @@ from torch import nn
 from torch.nn import functional as F
 from torch.nn import CrossEntropyLoss, MSELoss
 
-from .file_utils import cached_path
-from .modeling_utils import (CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTrainedModel,
-                          prune_linear_layer, SequenceSummary, SQuADHead)
+from .modeling_utils import (PretrainedConfig, PreTrainedModel,
+                             prune_linear_layer, SequenceSummary, SQuADHead)
 
 logger = logging.getLogger(__name__)
 
@@ -79,10 +74,11 @@ class XLMConfig(PretrainedConfig):
 
                  finetuning_task=None,
                  num_labels=2,
-                 summary_type='last',
+                 summary_type='first',
                  summary_use_proj=True,
-                 summary_activation='tanh',
-                 summary_dropout=0.1,
+                 summary_activation=None,
+                 summary_proj_to_labels=True,
+                 summary_first_dropout=0.1,
                  start_n_top=5,
                  end_n_top=5,
                  **kwargs):
@@ -164,7 +160,8 @@ class XLMConfig(PretrainedConfig):
             self.summary_type = summary_type
             self.summary_use_proj = summary_use_proj
             self.summary_activation = summary_activation
-            self.summary_dropout = summary_dropout
+            self.summary_proj_to_labels = summary_proj_to_labels
+            self.summary_first_dropout = summary_first_dropout
             self.start_n_top = start_n_top
             self.end_n_top = end_n_top
         else:
diff --git a/pytorch_transformers/modeling_xlnet.py b/pytorch_transformers/modeling_xlnet.py
index 628dbe7450..e0b3fb0661 100644
--- a/pytorch_transformers/modeling_xlnet.py
+++ b/pytorch_transformers/modeling_xlnet.py
@@ -31,9 +31,8 @@ from torch import nn
 from torch.nn import functional as F
 from torch.nn import CrossEntropyLoss, MSELoss
 
-from .file_utils import cached_path
 from .modeling_utils import (CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTrainedModel,
-                          SequenceSummary, PoolerAnswerClass, PoolerEndLogits, PoolerStartLogits)
+                             SequenceSummary, PoolerAnswerClass, PoolerEndLogits, PoolerStartLogits)
 
 
 logger = logging.getLogger(__name__)
@@ -227,7 +226,7 @@ class XLNetConfig(PretrainedConfig):
                  summary_type='last',
                  summary_use_proj=True,
                  summary_activation='tanh',
-                 summary_dropout=0.1,
+                 summary_last_dropout=0.1,
                  start_n_top=5,
                  end_n_top=5,
                  **kwargs):
@@ -314,7 +313,7 @@ class XLNetConfig(PretrainedConfig):
             self.summary_type = summary_type
             self.summary_use_proj = summary_use_proj
             self.summary_activation = summary_activation
-            self.summary_dropout = summary_dropout
+            self.summary_last_dropout = summary_last_dropout
             self.start_n_top = start_n_top
             self.end_n_top = end_n_top
         else:
diff --git a/pytorch_transformers/tokenization_bert.py b/pytorch_transformers/tokenization_bert.py
index 3e14673f46..1235d6f3cf 100644
--- a/pytorch_transformers/tokenization_bert.py
+++ b/pytorch_transformers/tokenization_bert.py
@@ -113,8 +113,6 @@ class BertTokenizer(PreTrainedTokenizer):
             raise ValueError(
                 "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
                 "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file))
-        if never_split is None:
-            never_split = self.all_special_tokens
         self.vocab = load_vocab(vocab_file)
         self.ids_to_tokens = collections.OrderedDict(
             [(ids, tok) for tok, ids in self.vocab.items()])
diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py
index 60081893c8..a84b8d6f44 100644
--- a/pytorch_transformers/tokenization_utils.py
+++ b/pytorch_transformers/tokenization_utils.py
@@ -142,11 +142,7 @@ class PreTrainedTokenizer(object):
         self.added_tokens_decoder = {}
 
         for key, value in kwargs.items():
-            if key not in self.SPECIAL_TOKENS_ATTRIBUTES:
-                raise ValueError(
-                    "PreTrainedTokenizer.__init__() argument {} should be in {}".format(
-                        key, ', '.join(self.SPECIAL_TOKENS_ATTRIBUTES)))
-            else:
+            if key in self.SPECIAL_TOKENS_ATTRIBUTES:
                 setattr(self, key, value)
 
 
diff --git a/pytorch_transformers/tokenization_xlm.py b/pytorch_transformers/tokenization_xlm.py
index 8a11a84f8c..8851455829 100644
--- a/pytorch_transformers/tokenization_xlm.py
+++ b/pytorch_transformers/tokenization_xlm.py
@@ -20,13 +20,9 @@ import json
 import logging
 import os
 import re
-import sys
 from io import open
 
-from tqdm import tqdm
-
-from .file_utils import cached_path
-from .tokenization_utils import PreTrainedTokenizer, clean_up_tokenization
+from .tokenization_utils import PreTrainedTokenizer
 from .tokenization_bert import BasicTokenizer
 
 logger = logging.getLogger(__name__)

From d0efbd3cd1fa268a3f5c5235237ceae0bde69776 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 9 Jul 2019 15:46:43 +0200
Subject: [PATCH 14/17] update sequencesummary module

---
 .coveragerc                                          | 1 +
 examples/test_examples.py                            | 2 +-
 pytorch_transformers/modeling_gpt2.py                | 6 ++++++
 pytorch_transformers/modeling_openai.py              | 6 ++++++
 pytorch_transformers/tests/modeling_tests_commons.py | 2 +-
 5 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/.coveragerc b/.coveragerc
index e0d5674aa0..fa6c165a8a 100644
--- a/.coveragerc
+++ b/.coveragerc
@@ -3,6 +3,7 @@ source=pytorch_transformers
 omit =
     # skip convertion scripts from testing for now
     */convert_*
+    */__main__.py
 [report]
 exclude_lines =
     pragma: no cover
diff --git a/examples/test_examples.py b/examples/test_examples.py
index 8284858a12..56c30efae4 100644
--- a/examples/test_examples.py
+++ b/examples/test_examples.py
@@ -48,7 +48,7 @@ class ExamplesTests(unittest.TestCase):
         testargs = ["run_glue.py", "--data_dir=./examples/tests_samples/MRPC/",
                     "--task_name=mrpc", "--do_train", "--do_eval", "--output_dir=./examples/tests_samples/temp_dir",
                     "--train_batch_size=4", "--eval_batch_size=2", "--num_train_epochs=2.0", "--overwrite_output_dir"]
-        model_name = "--model_name=xlnet-large-cased"
+        model_name = "--model_name=bert-base-uncased"
         with patch.object(sys, 'argv', testargs + [model_name]):
             result = run_glue.main()
             for value in result.values():
diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py
index 7fefbefeae..840016098a 100644
--- a/pytorch_transformers/modeling_gpt2.py
+++ b/pytorch_transformers/modeling_gpt2.py
@@ -119,9 +119,12 @@ class GPT2Config(PretrainedConfig):
         layer_norm_epsilon=1e-5,
         initializer_range=0.02,
         predict_special_tokens=True,
+
+        num_labels=1,
         summary_type='token_ids',
         summary_use_proj=True,
         summary_activation=None,
+        summary_proj_to_labels=True,
         summary_first_dropout=0.1,
         **kwargs
     ):
@@ -168,10 +171,13 @@ class GPT2Config(PretrainedConfig):
             self.layer_norm_epsilon = layer_norm_epsilon
             self.initializer_range = initializer_range
             self.predict_special_tokens = predict_special_tokens
+
+            self.num_labels = num_labels
             self.summary_type = summary_type
             self.summary_use_proj = summary_use_proj
             self.summary_activation = summary_activation
             self.summary_first_dropout = summary_first_dropout
+            self.summary_proj_to_labels = summary_proj_to_labels
         else:
             raise ValueError(
                 "First argument must be either a vocabulary size (int)"
diff --git a/pytorch_transformers/modeling_openai.py b/pytorch_transformers/modeling_openai.py
index c99df42035..024ff8eb41 100644
--- a/pytorch_transformers/modeling_openai.py
+++ b/pytorch_transformers/modeling_openai.py
@@ -147,9 +147,12 @@ class OpenAIGPTConfig(PretrainedConfig):
         layer_norm_epsilon=1e-5,
         initializer_range=0.02,
         predict_special_tokens=True,
+
+        num_labels=1,
         summary_type='token_ids',
         summary_use_proj=True,
         summary_activation=None,
+        summary_proj_to_labels=True,
         summary_first_dropout=0.1,
         **kwargs
     ):
@@ -199,10 +202,13 @@ class OpenAIGPTConfig(PretrainedConfig):
             self.layer_norm_epsilon = layer_norm_epsilon
             self.initializer_range = initializer_range
             self.predict_special_tokens = predict_special_tokens
+
+            self.num_labels = num_labels
             self.summary_type = summary_type
             self.summary_use_proj = summary_use_proj
             self.summary_activation = summary_activation
             self.summary_first_dropout = summary_first_dropout
+            self.summary_proj_to_labels = summary_proj_to_labels
         else:
             raise ValueError(
                 "First argument must be either a vocabulary size (int)"
diff --git a/pytorch_transformers/tests/modeling_tests_commons.py b/pytorch_transformers/tests/modeling_tests_commons.py
index db79b017c1..5535177aaa 100644
--- a/pytorch_transformers/tests/modeling_tests_commons.py
+++ b/pytorch_transformers/tests/modeling_tests_commons.py
@@ -396,7 +396,7 @@ class GPTModelTester(object):
         model = self.double_head_model_class(config)
         model.eval()
         outputs = model(input_ids, mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels,
-                                                    token_type_ids=token_type_ids, position_ids=position_ids)
+                        token_type_ids=token_type_ids, position_ids=position_ids)
         lm_loss, mc_loss, lm_logits, mc_logits = outputs[:4]
         loss = [lm_loss, mc_loss]
 

From d743f2f34e11bdeefe144356fe89331c0f44fc36 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 9 Jul 2019 15:58:58 +0200
Subject: [PATCH 15/17] updating test

---
 .../tests/tokenization_tests_commons.py              | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/pytorch_transformers/tests/tokenization_tests_commons.py b/pytorch_transformers/tests/tokenization_tests_commons.py
index 4e5fe83706..44adbc6b53 100644
--- a/pytorch_transformers/tests/tokenization_tests_commons.py
+++ b/pytorch_transformers/tests/tokenization_tests_commons.py
@@ -14,6 +14,7 @@
 # limitations under the License.
 from __future__ import absolute_import, division, print_function, unicode_literals
 
+import os
 import sys
 from io import open
 import tempfile
@@ -49,15 +50,18 @@ def create_and_check_save_and_load_tokenizer(tester, tokenizer_class, *inputs, *
 
 def create_and_check_pickle_tokenizer(tester, tokenizer_class, *inputs, **kwargs):
     tokenizer = tokenizer_class.from_pretrained(*inputs, **kwargs)
+    tester.assertIsNotNone(tokenizer)
 
     text = u"Munich and Berlin are nice cities"
-    filename = u"/tmp/tokenizer.bin"
-
     subwords = tokenizer.tokenize(text)
 
-    pickle.dump(tokenizer, open(filename, "wb"))
+    with TemporaryDirectory() as tmpdirname:
+
+        filename = os.path.join(tmpdirname, u"tokenizer.bin")
+        pickle.dump(tokenizer, open(filename, "wb"))
+
+        tokenizer_new = pickle.load(open(filename, "rb"))
 
-    tokenizer_new = pickle.load(open(filename, "rb"))
     subwords_loaded = tokenizer_new.tokenize(text)
 
     tester.assertListEqual(subwords, subwords_loaded)

From 3b7cb7bf44f30b9fa0d1657a2d9f67a92b88abd4 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 9 Jul 2019 16:12:15 +0200
Subject: [PATCH 16/17] small update to run_glue

---
 examples/run_glue.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/run_glue.py b/examples/run_glue.py
index 62d655ecc9..6387ed448a 100644
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -382,7 +382,8 @@ def main():
 
         # Save a trained model, configuration and tokenizer using `save_pretrained()`.
         # They can then be reloaded using `from_pretrained()`
-        model.save_pretrained(args.output_dir)
+        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+        model_to_save.save_pretrained(args.output_dir)
         tokenizer.save_pretrained(args.output_dir)
 
         # Good practice: save your training arguments together with the trained model

From 4ce237c880e890b49092faac07bb757376f0e3f4 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 9 Jul 2019 17:00:32 +0200
Subject: [PATCH 17/17] update run_glue

---
 examples/run_glue.py | 41 ++++++++++++++++++++++++-----------------
 1 file changed, 24 insertions(+), 17 deletions(-)

diff --git a/examples/run_glue.py b/examples/run_glue.py
index 6387ed448a..ec76dadcfe 100644
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -69,7 +69,11 @@ def train(args, train_dataset, model):
     train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
     train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
 
-    num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
+    if args.max_steps > 0:
+        num_train_optimization_steps = args.max_steps
+        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
+    else:
+        num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
 
     # Prepare optimizer
     param_optimizer = list(model.named_parameters())
@@ -91,10 +95,8 @@ def train(args, train_dataset, model):
         warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion, t_total=num_train_optimization_steps)
 
     else:
-        optimizer = BertAdam(optimizer_grouped_parameters,
-                                lr=args.learning_rate,
-                                warmup=args.warmup_proportion,
-                                t_total=num_train_optimization_steps)
+        optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion,
+                             t_total=num_train_optimization_steps)
 
     # Train!
     logger.info("***** Running training *****")
@@ -113,7 +115,7 @@ def train(args, train_dataset, model):
             batch = tuple(t.to(args.device) for t in batch)
             inputs = {'input_ids':      batch[0],
                       'attention_mask': batch[1],
-                      'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,
+                      'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM don't use segment_ids
                       'labels':         batch[3]}
             ouputs = model(**inputs)
             loss = ouputs[0]
@@ -140,14 +142,16 @@ def train(args, train_dataset, model):
                     if not args.fp16:
                         tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)
                     tb_writer.add_scalar('loss', loss.item(), global_step)
+            if args.max_steps > 0 and global_step > args.max_steps:
+                break
+        if args.max_steps > 0 and global_step > args.max_steps:
+            break
 
     return global_step, tr_loss / global_step
 
 
 def evalutate(args, eval_task, eval_output_dir, dataset, model):
     """ Evaluate the model """
-    if os.path.exists(eval_output_dir) and os.listdir(eval_output_dir) and args.do_train and not args.overwrite_output_dir:
-        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(eval_output_dir))
     if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
         os.makedirs(eval_output_dir)
 
@@ -166,13 +170,13 @@ def evalutate(args, eval_task, eval_output_dir, dataset, model):
     out_label_ids = None
     for batch in tqdm(eval_dataloader, desc="Evaluating"):
         batch = tuple(t.to(args.device) for t in batch)
-        input_ids, input_mask, segment_ids, label_ids = batch
 
         with torch.no_grad():
-            outputs = model(input_ids,
-                            token_type_ids=segment_ids,
-                            attention_mask=input_mask,
-                            labels=label_ids)
+            inputs = {'input_ids':      batch[0],
+                      'attention_mask': batch[1],
+                      'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM don't use segment_ids
+                      'labels':         batch[3]}
+            outputs = model(**inputs)
             tmp_eval_loss, logits = outputs[:2]
 
         eval_loss += tmp_eval_loss.mean().item()
@@ -276,6 +280,8 @@ def main():
                         help="The initial learning rate for Adam.")
     parser.add_argument("--num_train_epochs", default=3.0, type=float,
                         help="Total number of training epochs to perform.")
+    parser.add_argument("--max_steps", default=-1, type=int,
+                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
     parser.add_argument("--warmup_proportion", default=0.1, type=float,
                         help="Proportion of training with linear learning rate warmup (0.1 = 10%% of training).")
     parser.add_argument("--no_cuda", action='store_true',
@@ -299,6 +305,9 @@ def main():
     parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
     args = parser.parse_args()
 
+    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
+        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
+
     # Setup distant debugging if needed
     if args.server_ip and args.server_port:
         # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
@@ -320,8 +329,8 @@ def main():
 
     # Setup logging
     logging.basicConfig(level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
-    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
-        device, args.n_gpu, bool(args.local_rank != -1), args.fp16))
+    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+                args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
 
     # Setup seeds
     random.seed(args.seed)
@@ -375,8 +384,6 @@ def main():
     # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
     if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
         # Create output directory if needed
-        if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
-            raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
         if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
             os.makedirs(args.output_dir)