diff --git a/pytorch_pretrained_bert/__init__.py b/pytorch_pretrained_bert/__init__.py
index 0ef8263748..c940549364 100644
--- a/pytorch_pretrained_bert/__init__.py
+++ b/pytorch_pretrained_bert/__init__.py
@@ -1,8 +1,11 @@
-__version__ = "0.4.0"
+__version__ = "0.5.0"
 from .tokenization import BertTokenizer, BasicTokenizer, WordpieceTokenizer
+from .tokenization_openai import OpenAIGPTTokenizer
 from .modeling import (BertConfig, BertModel, BertForPreTraining,
                        BertForMaskedLM, BertForNextSentencePrediction,
                        BertForSequenceClassification, BertForMultipleChoice,
                        BertForTokenClassification, BertForQuestionAnswering)
+from .modeling_openai import OpenAIGPTConfig, OpenAIGPTModel, OpenAIGPTDoubleHeadsModel
 from .optimization import BertAdam
+from .optimization_openai import OpenAIAdam
 from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE
diff --git a/pytorch_pretrained_bert/__main__.py b/pytorch_pretrained_bert/__main__.py
index 79ad842932..1557adc63f 100644
--- a/pytorch_pretrained_bert/__main__.py
+++ b/pytorch_pretrained_bert/__main__.py
@@ -1,22 +1,40 @@
 # coding: utf8
 def main():
     import sys
-    try:
-        from .convert_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch
-    except ModuleNotFoundError:
-        print("pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, "
-              "In that case, it requires TensorFlow to be installed. Please see "
-              "https://www.tensorflow.org/install/ for installation instructions.")
-        raise
-
-    if len(sys.argv) != 5:
-        # pylint: disable=line-too-long
-        print("Should be used as `pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`")
+    if (len(sys.argv) != 4 and len(sys.argv) != 5) or sys.argv[1] not in [
+        "convert_tf_checkpoint_to_pytorch",
+        "convert_openai_checkpoint"
+    ]:
+        print("Should be used as `pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT` \n or `pytorch_pretrained_bert convert_openai_checkpoint OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`")
     else:
-        PYTORCH_DUMP_OUTPUT = sys.argv.pop()
-        TF_CONFIG = sys.argv.pop()
-        TF_CHECKPOINT = sys.argv.pop()
-        convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT)
+        if sys.argv[1] == "convert_tf_checkpoint_to_pytorch":
+            try:
+                from .convert_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch
+            except ModuleNotFoundError:
+                print("pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, "
+                    "In that case, it requires TensorFlow to be installed. Please see "
+                    "https://www.tensorflow.org/install/ for installation instructions.")
+                raise
+
+            if len(sys.argv) != 5:
+                # pylint: disable=line-too-long
+                print("Should be used as `pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`")
+            else:
+                PYTORCH_DUMP_OUTPUT = sys.argv.pop()
+                TF_CONFIG = sys.argv.pop()
+                TF_CHECKPOINT = sys.argv.pop()
+                convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT)
+        else:
+            from .convert_openai_checkpoint_to_pytorch import convert_openai_checkpoint_to_pytorch
+            OPENAI_GPT_CHECKPOINT_FOLDER_PATH = sys.argv[2]
+            PYTORCH_DUMP_OUTPUT = sys.argv[3]
+            if len(sys.argv) == 5:
+                OPENAI_GPT_CONFIG = sys.argv[4]
+            else:
+                OPENAI_GPT_CONFIG = ""
+            convert_openai_checkpoint_to_pytorch(OPENAI_GPT_CHECKPOINT_FOLDER_PATH,
+                                                 OPENAI_GPT_CONFIG,
+                                                 PYTORCH_DUMP_OUTPUT)
 
 if __name__ == '__main__':
     main()
diff --git a/pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py b/pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py
index 59791450ee..2e25d16e61 100755
--- a/pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py
+++ b/pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Convert BERT checkpoint."""
+"""Convert OpenAI GPT checkpoint."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -20,45 +20,53 @@ from __future__ import print_function
 
 import os
 import re
+import json
 import argparse
 import tensorflow as tf
 import torch
 import numpy as np
 
-from .modeling import BertConfig, BertForPreTraining
+from .modeling_openai import OpenAIGPTConfig, OpenAIGPTModel, CONFIG_NAME, WEIGHTS_NAME
 
 
-def convert_openai_checkpoint_to_pytorch(open_checkpoint_folder_path, openai_config_file, pytorch_dump_path):
-def load_openai_pretrained_model(model, n_ctx=-1, n_special=-1, n_transfer=12, n_embd=768, path='./model/',
-                                 path_names='./'):
+def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_config_file, pytorch_dump_folder_path):
     # Load weights from TF model
     print("Loading weights...")
-    names = json.load(open(path_names + 'parameters_names.json'))
-    shapes = json.load(open(path + 'params_shapes.json'))
+    names = json.load(open(openai_checkpoint_folder_path + '/parameters_names.json', "r", encoding='utf-8'))
+    shapes = json.load(open(openai_checkpoint_folder_path + '/params_shapes.json', "r", encoding='utf-8'))
     offsets = np.cumsum([np.prod(shape) for shape in shapes])
-    init_params = [np.load(path + 'params_{}.npy'.format(n)) for n in range(10)]
+    init_params = [np.load(openai_checkpoint_folder_path + '/params_{}.npy'.format(n)) for n in range(10)]
     init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1]
     init_params = [param.reshape(shape) for param, shape in zip(init_params, shapes)]
-    if n_ctx > 0:
-        init_params[0] = init_params[0][:n_ctx]
-    if n_special > 0:
-        init_params[0] = np.concatenate(
-            [init_params[1],
-             (np.random.randn(n_special, n_embd) * 0.02).astype(np.float32),
-             init_params[0]
-             ], 0)
-    else:
-        init_params[0] = np.concatenate(
-            [init_params[1],
-             init_params[0]
-             ], 0)
+    # if n_ctx > 0:
+    #     init_params[0] = init_params[0][:n_ctx]
+    # if n_special > 0:
+    #     init_params[0] = np.concatenate(
+    #         [init_params[1],
+    #          (np.random.randn(n_special, n_embd) * 0.02).astype(np.float32),
+    #          init_params[0]
+    #          ], 0)
+    # else:
+    #     init_params[0] = np.concatenate(
+    #         [init_params[1],
+    #          init_params[0]
+    #          ], 0)
+    # del init_params[1]
+    # if n_transfer == -1:
+    #     n_transfer = 0
+    # else:
+    #     n_transfer = 1 + n_transfer * 12
+
+    init_params[0] = np.concatenate([init_params[1], init_params[0]], 0)
     del init_params[1]
-    if n_transfer == -1:
-        n_transfer = 0
-    else:
-        n_transfer = 1 + n_transfer * 12
     init_params = [arr.squeeze() for arr in init_params]
 
+    # Construct model
+    if openai_config_file == "":
+        config = OpenAIGPTConfig()
+    else:
+        config = OpenAIGPTConfig(openai_config_file)
+    model = OpenAIGPTModel(config)
     try:
         assert model.embed.weight.shape == init_params[0].shape
     except AssertionError as e:
@@ -66,8 +74,10 @@ def load_openai_pretrained_model(model, n_ctx=-1, n_special=-1, n_transfer=12, n
         raise
 
     model.embed.weight.data = torch.from_numpy(init_params[0])
+    names.pop(0)
+    init_params.pop(0)
 
-    for name, ip in zip(names[1:n_transfer], init_params[1:n_transfer]):
+    for name, array in zip(names, init_params): # names[1:n_transfer], init_params[1:n_transfer]):
         name = name[6:]  # skip "model/"
         assert name[-2:] == ":0"
         name = name[:-2]
@@ -78,64 +88,22 @@ def load_openai_pretrained_model(model, n_ctx=-1, n_special=-1, n_transfer=12, n
                 l = re.split(r'(\d+)', m_name)
             else:
                 l = [m_name]
-            pointer = getattr(pointer, l[0])
-            if len(l) >= 2:
-                num = int(l[1])
-                pointer = pointer[num]
-        try:
-            assert pointer.shape == ip.shape
-        except AssertionError as e:
-            e.args += (pointer.shape, ip.shape)
-            raise
-        pointer.data = torch.from_numpy(ip)
-
-def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):
-    config_path = os.path.abspath(bert_config_file)
-    tf_path = os.path.abspath(tf_checkpoint_path)
-    print("Converting TensorFlow checkpoint from {} with config at {}".format(tf_path, config_path))
-    # Load weights from TF model
-    init_vars = tf.train.list_variables(tf_path)
-    names = []
-    arrays = []
-    for name, shape in init_vars:
-        print("Loading TF weight {} with shape {}".format(name, shape))
-        array = tf.train.load_variable(tf_path, name)
-        names.append(name)
-        arrays.append(array)
-
-    # Initialise PyTorch model
-    config = BertConfig.from_json_file(bert_config_file)
-    print("Building PyTorch model from configuration: {}".format(str(config)))
-    model = BertForPreTraining(config)
-
-    for name, array in zip(names, arrays):
-        name = name.split('/')
-        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
-        # which are not required for using pretrained model
-        if any(n in ["adam_v", "adam_m"] for n in name):
-            print("Skipping {}".format("/".join(name)))
-            continue
-        pointer = model
-        for m_name in name:
-            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
-                l = re.split(r'_(\d+)', m_name)
-            else:
-                l = [m_name]
-            if l[0] == 'kernel' or l[0] == 'gamma':
+            if l[0] == 'g':
                 pointer = getattr(pointer, 'weight')
-            elif l[0] == 'output_bias' or l[0] == 'beta':
+            elif l[0] == 'b':
                 pointer = getattr(pointer, 'bias')
-            elif l[0] == 'output_weights':
+            elif l[0] == 'w':
                 pointer = getattr(pointer, 'weight')
             else:
                 pointer = getattr(pointer, l[0])
             if len(l) >= 2:
                 num = int(l[1])
                 pointer = pointer[num]
-        if m_name[-11:] == '_embeddings':
-            pointer = getattr(pointer, 'weight')
-        elif m_name == 'kernel':
-            array = np.transpose(array)
+        try:
+            assert pointer.shape == array.shape
+        except AssertionError as e:
+            e.args += (pointer.shape, array.shape)
+            raise
         try:
             assert pointer.shape == array.shape
         except AssertionError as e:
@@ -145,30 +113,33 @@ def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytor
         pointer.data = torch.from_numpy(array)
 
     # Save pytorch-model
-    print("Save PyTorch model to {}".format(pytorch_dump_path))
-    torch.save(model.state_dict(), pytorch_dump_path)
-
+    pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME
+    pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME
+    print("Save PyTorch model to {}".format(pytorch_weights_dump_path))
+    torch.save(model.state_dict(), pytorch_weights_dump_path)
+    print("Save configuration file to {}".format(pytorch_config_dump_path))
+    with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
+        f.write(config.to_json_string())
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     ## Required parameters
-    parser.add_argument("--tf_checkpoint_path",
+    parser.add_argument("--openai_checkpoint_folder_path",
                         default = None,
                         type = str,
                         required = True,
                         help = "Path the TensorFlow checkpoint path.")
-    parser.add_argument("--bert_config_file",
-                        default = None,
-                        type = str,
-                        required = True,
-                        help = "The config json file corresponding to the pre-trained BERT model. \n"
-                            "This specifies the model architecture.")
-    parser.add_argument("--pytorch_dump_path",
+    parser.add_argument("--pytorch_dump_folder_path",
                         default = None,
                         type = str,
                         required = True,
                         help = "Path to the output PyTorch model.")
+    parser.add_argument("--openai_config_file",
+                        default = "",
+                        type = str,
+                        help = "An optional config json file corresponding to the pre-trained OpenAI model. \n"
+                            "This specifies the model architecture.")
     args = parser.parse_args()
-    convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path,
-                                     args.bert_config_file,
-                                     args.pytorch_dump_path)
+    convert_openai_checkpoint_to_pytorch(args.openai_checkpoint_folder_path,
+                                         args.pytorch_dump_folder_path,
+                                         args.openai_config_file)
diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py
index 2d5967fb46..d2a0cf8dd2 100644
--- a/pytorch_pretrained_bert/modeling.py
+++ b/pytorch_pretrained_bert/modeling.py
@@ -416,12 +416,12 @@ class BertPreTrainingHeads(nn.Module):
         return prediction_scores, seq_relationship_score
 
 
-class PreTrainedModel(nn.Module):
+class BertPreTrainedModel(nn.Module):
     """ An abstract class to handle weights initialization and
         a simple interface for dowloading and loading pretrained models.
     """
     def __init__(self, config, *inputs, **kwargs):
-        super(PreTrainedModel, self).__init__()
+        super(BertPreTrainedModel, self).__init__()
         if not isinstance(config, BertConfig):
             raise ValueError(
                 "Parameter config in `{}(config)` should be an instance of class `BertConfig`. "
@@ -447,7 +447,7 @@ class PreTrainedModel(nn.Module):
     @classmethod
     def from_pretrained(cls, pretrained_model_name, state_dict=None, cache_dir=None, *inputs, **kwargs):
         """
-        Instantiate a PreTrainedModel from a pre-trained model file or a pytorch state dict.
+        Instantiate a BertPreTrainedModel from a pre-trained model file or a pytorch state dict.
         Download and cache the pre-trained model file if needed.
 
         Params:
@@ -547,13 +547,16 @@ class PreTrainedModel(nn.Module):
         if len(unexpected_keys) > 0:
             logger.info("Weights from pretrained model not used in {}: {}".format(
                 model.__class__.__name__, unexpected_keys))
+        if len(error_msgs) > 0:
+            raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
+                               self.__class__.__name__, "\n\t".join(error_msgs)))
         if tempdir:
             # Clean up temp dir
             shutil.rmtree(tempdir)
         return model
 
 
-class BertModel(PreTrainedModel):
+class BertModel(BertPreTrainedModel):
     """BERT model ("Bidirectional Embedding Representations from a Transformer").
 
     Params:
@@ -636,7 +639,7 @@ class BertModel(PreTrainedModel):
         return encoded_layers, pooled_output
 
 
-class BertForPreTraining(PreTrainedModel):
+class BertForPreTraining(BertPreTrainedModel):
     """BERT model with pre-training heads.
     This module comprises the BERT model followed by the two pre-training heads:
         - the masked language modeling head, and
@@ -707,7 +710,7 @@ class BertForPreTraining(PreTrainedModel):
             return prediction_scores, seq_relationship_score
 
 
-class BertForMaskedLM(PreTrainedModel):
+class BertForMaskedLM(BertPreTrainedModel):
     """BERT model with the masked language modeling head.
     This module comprises the BERT model followed by the masked language modeling head.
 
@@ -768,7 +771,7 @@ class BertForMaskedLM(PreTrainedModel):
             return prediction_scores
 
 
-class BertForNextSentencePrediction(PreTrainedModel):
+class BertForNextSentencePrediction(BertPreTrainedModel):
     """BERT model with next sentence prediction head.
     This module comprises the BERT model followed by the next sentence classification head.
 
@@ -830,7 +833,7 @@ class BertForNextSentencePrediction(PreTrainedModel):
             return seq_relationship_score
 
 
-class BertForSequenceClassification(PreTrainedModel):
+class BertForSequenceClassification(BertPreTrainedModel):
     """BERT model for classification.
     This module is composed of the BERT model with a linear layer on top of
     the pooled output.
@@ -875,7 +878,7 @@ class BertForSequenceClassification(PreTrainedModel):
     logits = model(input_ids, token_type_ids, input_mask)
     ```
     """
-    def __init__(self, config, num_labels=2):
+    def __init__(self, config, num_labels):
         super(BertForSequenceClassification, self).__init__(config)
         self.num_labels = num_labels
         self.bert = BertModel(config)
@@ -896,7 +899,7 @@ class BertForSequenceClassification(PreTrainedModel):
             return logits
 
 
-class BertForMultipleChoice(PreTrainedModel):
+class BertForMultipleChoice(BertPreTrainedModel):
     """BERT model for multiple choice tasks.
     This module is composed of the BERT model with a linear layer on top of
     the pooled output.
@@ -940,7 +943,7 @@ class BertForMultipleChoice(PreTrainedModel):
     logits = model(input_ids, token_type_ids, input_mask)
     ```
     """
-    def __init__(self, config, num_choices=2):
+    def __init__(self, config, num_choices):
         super(BertForMultipleChoice, self).__init__(config)
         self.num_choices = num_choices
         self.bert = BertModel(config)
@@ -965,7 +968,7 @@ class BertForMultipleChoice(PreTrainedModel):
             return reshaped_logits
 
 
-class BertForTokenClassification(PreTrainedModel):
+class BertForTokenClassification(BertPreTrainedModel):
     """BERT model for token-level classification.
     This module is composed of the BERT model with a linear layer on top of
     the full hidden state of the last layer.
@@ -1010,7 +1013,7 @@ class BertForTokenClassification(PreTrainedModel):
     logits = model(input_ids, token_type_ids, input_mask)
     ```
     """
-    def __init__(self, config, num_labels=2):
+    def __init__(self, config, num_labels):
         super(BertForTokenClassification, self).__init__(config)
         self.num_labels = num_labels
         self.bert = BertModel(config)
@@ -1031,7 +1034,7 @@ class BertForTokenClassification(PreTrainedModel):
             return logits
 
 
-class BertForQuestionAnswering(PreTrainedModel):
+class BertForQuestionAnswering(BertPreTrainedModel):
     """BERT model for Question Answering (span extraction).
     This module is composed of the BERT model with a linear layer on top of
     the sequence output that computes start_logits and end_logits
diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py
index 349baee79b..8e8ca0db00 100644
--- a/pytorch_pretrained_bert/modeling_openai.py
+++ b/pytorch_pretrained_bert/modeling_openai.py
@@ -1,16 +1,28 @@
+import os
 import copy
 import json
 import math
-import re
+import logging
+import tarfile
+import tempfile
+import shutil
 import collections
 
-import numpy as np
 import torch
 import torch.nn as nn
+from torch.nn import CrossEntropyLoss
 from torch.nn.parameter import Parameter
 
 from .modeling import BertLayerNorm as LayerNorm
+from .file_utils import cached_path
 
+logger = logging.getLogger(__name__)
+
+PRETRAINED_MODEL_ARCHIVE_MAP = {
+    'openai-gpt': "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt.tar.gz",
+}
+CONFIG_NAME = 'openai_gpt_config.json'
+WEIGHTS_NAME = 'pytorch_model.bin'
 
 def gelu(x):
     return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
@@ -26,6 +38,237 @@ ACT_FNS = {
     'gelu': gelu
 }
 
+class OpenAIGPTConfig(object):
+    """Configuration class to store the configuration of a `OpenAIGPTModel`.
+    """
+    def __init__(self,
+                 vocab_size_or_config_json_file=40478,
+                 n_special=0,
+                 n_ctx=512,
+                 n_embd=768,
+                 n_layer=12,
+                 n_head=12,
+                 intermediate_size=3072,
+                 afn="gelu",
+                 resid_pdrop=0.1,
+                 embd_pdrop=0.1,
+                 attn_pdrop=0.1,
+                 type_vocab_size=2,
+                 initializer_range=0.02):
+        """Constructs OpenAIGPTConfig.
+
+        Args:
+            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file.
+            n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLF]', ...)
+            n_ctx: Number of positional embeddings.
+            n_embd: Dimensionality of the embeddings and hidden states.
+            n_layer: Number of hidden layers in the Transformer encoder.
+            n_head: Number of attention heads for each attention layer in
+                the Transformer encoder.
+            intermediate_size: The size of the "intermediate" (i.e., feed-forward)
+                layer in the Transformer encoder.
+            afn: The non-linear activation function (function or string) in the
+                encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
+            resid_pdrop: The dropout probabilitiy for all fully connected
+                layers in the embeddings, encoder, and pooler.
+            attn_pdrop: The dropout ratio for the attention
+                probabilities.
+            embd_pdrop: The dropout ratio for the embeddings.
+            type_vocab_size: The vocabulary size of the `token_type_ids` passed into
+                `OpenAIGPTModel`.
+            initializer_range: The sttdev of the truncated_normal_initializer for
+                initializing all weight matrices.
+        """
+        if isinstance(vocab_size_or_config_json_file, str):
+            with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
+                json_config = json.loads(reader.read())
+            for key, value in json_config.items():
+                self.__dict__[key] = value
+        elif isinstance(vocab_size_or_config_json_file, int):
+            self.vocab_size = vocab_size_or_config_json_file
+            self.n_special = n_special
+            self.n_ctx = n_ctx
+            self.n_embd = n_embd
+            self.n_layer = n_layer
+            self.n_head = n_head
+            self.afn = afn
+            self.intermediate_size = intermediate_size
+            self.resid_pdrop = resid_pdrop
+            self.embd_pdrop = embd_pdrop
+            self.attn_pdrop = attn_pdrop
+            self.type_vocab_size = type_vocab_size
+            self.initializer_range = initializer_range
+        else:
+            raise ValueError("First argument must be either a vocabulary size (int)"
+                             "or the path to a pretrained model config file (str)")
+
+    @property
+    def total_num_embeddings(self):
+        return self.vocab_size + self.n_special + self.n_ctx
+
+    @classmethod
+    def from_dict(cls, json_object):
+        """Constructs a `OpenAIGPTConfig` from a Python dictionary of parameters."""
+        config = OpenAIGPTConfig(vocab_size_or_config_json_file=-1)
+        for key, value in json_object.items():
+            config.__dict__[key] = value
+        return config
+
+    @classmethod
+    def from_json_file(cls, json_file):
+        """Constructs a `OpenAIGPTConfig` from a json file of parameters."""
+        with open(json_file, "r", encoding='utf-8') as reader:
+            text = reader.read()
+        return cls.from_dict(json.loads(text))
+
+    def __repr__(self):
+        return str(self.to_json_string())
+
+    def to_dict(self):
+        """Serializes this instance to a Python dictionary."""
+        output = copy.deepcopy(self.__dict__)
+        return output
+
+    def to_json_string(self):
+        """Serializes this instance to a JSON string."""
+        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
+
+class OpenAIGPTPreTrainedModel(nn.Module):
+    """ An abstract class to handle weights initialization and
+        a simple interface for dowloading and loading pretrained models.
+    """
+    def __init__(self, config, *inputs, **kwargs):
+        super(OpenAIGPTPreTrainedModel, self).__init__()
+        if not isinstance(config, OpenAIGPTConfig):
+            raise ValueError(
+                "Parameter config in `{}(config)` should be an instance of class `OpenAIGPTConfig`. "
+                "To create a model from a Google pretrained model use "
+                "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
+                    self.__class__.__name__, self.__class__.__name__
+                ))
+        self.config = config
+
+    def init_weights(self, module):
+        """ Initialize the weights.
+        """
+        if isinstance(module, (nn.Linear, nn.Embedding)):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+        elif isinstance(module, LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        if isinstance(module, nn.Linear) and module.bias is not None:
+            module.bias.data.zero_()
+
+    def post_loading(self):
+        pass
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name, state_dict=None, cache_dir=None, *inputs, **kwargs):
+        """
+        Instantiate a OpenAIGPTPreTrainedModel from a pre-trained model file or a pytorch state dict.
+        Download and cache the pre-trained model file if needed.
+
+        Params:
+            pretrained_model_name: either:
+                - a str with the name of a pre-trained model to load selected in the list of:
+                    . `openai-gpt`
+                - a path or url to a pretrained model archive containing:
+                    . `openai_gpt_config.json` a configuration file for the model
+                    . `pytorch_model.bin` a PyTorch dump of a OpenAIGPTModel instance
+            cache_dir: an optional path to a folder in which the pre-trained models will be cached.
+            state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models
+            *inputs, **kwargs: additional input for the specific Bert class
+                (ex: num_labels for BertForSequenceClassification)
+        """
+        if pretrained_model_name in PRETRAINED_MODEL_ARCHIVE_MAP:
+            archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name]
+        else:
+            archive_file = pretrained_model_name
+        # redirect to the cache, if necessary
+        try:
+            resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
+        except FileNotFoundError:
+            logger.error(
+                "Model name '{}' was not found in model name list ({}). "
+                "We assumed '{}' was a path or url but couldn't find any file "
+                "associated to this path or url.".format(
+                    pretrained_model_name,
+                    ', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()),
+                    archive_file))
+            return None
+        if resolved_archive_file == archive_file:
+            logger.info("loading archive file {}".format(archive_file))
+        else:
+            logger.info("loading archive file {} from cache at {}".format(
+                archive_file, resolved_archive_file))
+        tempdir = None
+        if os.path.isdir(resolved_archive_file):
+            serialization_dir = resolved_archive_file
+        else:
+            # Extract archive to temp dir
+            tempdir = tempfile.mkdtemp()
+            logger.info("extracting archive file {} to temp dir {}".format(
+                resolved_archive_file, tempdir))
+            with tarfile.open(resolved_archive_file, 'r:gz') as archive:
+                archive.extractall(tempdir)
+            serialization_dir = tempdir
+        # Load config
+        config_file = os.path.join(serialization_dir, CONFIG_NAME)
+        config = OpenAIGPTConfig.from_json_file(config_file)
+        logger.info("Model config {}".format(config))
+        # Instantiate model.
+        model = cls(config, *inputs, **kwargs)
+        if state_dict is None:
+            weights_path = os.path.join(serialization_dir, WEIGHTS_NAME)
+            state_dict = torch.load(weights_path)
+
+        old_keys = []
+        new_keys = []
+        for key in state_dict.keys():
+            new_key = None
+            if 'gamma' in key:
+                new_key = key.replace('gamma', 'weight')
+            if 'beta' in key:
+                new_key = key.replace('beta', 'bias')
+            if new_key:
+                old_keys.append(key)
+                new_keys.append(new_key)
+        for old_key, new_key in zip(old_keys, new_keys):
+            state_dict[new_key] = state_dict.pop(old_key)
+
+        missing_keys = []
+        unexpected_keys = []
+        error_msgs = []
+        # copy state_dict so _load_from_state_dict can modify it
+        metadata = getattr(state_dict, '_metadata', None)
+        state_dict = state_dict.copy()
+        if metadata is not None:
+            state_dict._metadata = metadata
+
+        def load(module, prefix=''):
+            local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
+            module._load_from_state_dict(
+                state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)
+            for name, child in module._modules.items():
+                if child is not None:
+                    load(child, prefix + name + '.')
+        load(model.transformer if hasattr(model, 'transformer') else model, prefix='')
+        if len(missing_keys) > 0:
+            logger.info("Weights of {} not initialized from pretrained model: {}".format(
+                model.__class__.__name__, missing_keys))
+        if len(unexpected_keys) > 0:
+            logger.info("Weights from pretrained model not used in {}: {}".format(
+                model.__class__.__name__, unexpected_keys))
+        if len(error_msgs) > 0:
+            raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
+                               self.__class__.__name__, "\n\t".join(error_msgs)))
+        model.post_loading()
+        if tempdir:
+            # Clean up temp dir
+            shutil.rmtree(tempdir)
+        return model
 
 class Conv1D(nn.Module):
     def __init__(self, nf, rf, nx):
@@ -35,15 +278,15 @@ class Conv1D(nn.Module):
         if rf == 1:  # faster 1x1 conv
             w = torch.empty(nx, nf)
             nn.init.normal_(w, std=0.02)
-            self.w = Parameter(w)
-            self.b = Parameter(torch.zeros(nf))
+            self.weight = Parameter(w)
+            self.bias = Parameter(torch.zeros(nf))
         else:  # was used to train LM
             raise NotImplementedError
 
     def forward(self, x):
         if self.rf == 1:
             size_out = x.size()[:-1] + (self.nf,)
-            x = torch.addmm(self.b, x.view(-1, x.size(-1)), self.w)
+            x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight)
             x = x.view(*size_out)
         else:
             raise NotImplementedError
@@ -132,38 +375,18 @@ class Block(nn.Module):
         return h
 
 
-class TransformerModel(nn.Module):
-    """ Transformer model """
-
-    def __init__(self, cfg, vocab=40990, n_ctx=512):
-        super(TransformerModel, self).__init__()
-        self.vocab = vocab
-        self.embed = nn.Embedding(vocab, cfg.n_embd)
-        self.drop = nn.Dropout(cfg.embd_pdrop)
-        block = Block(n_ctx, cfg, scale=True)
-        self.h = nn.ModuleList([copy.deepcopy(block) for _ in range(cfg.n_layer)])
-
-        nn.init.normal_(self.embed.weight, std=0.02)
-
-    def forward(self, x):
-        x = x.view(-1, x.size(-2), x.size(-1))
-        e = self.embed(x)
-        # Add the position information to the input embeddings
-        h = e.sum(dim=2)
-        for block in self.h:
-            h = block(h)
-        return h
-
-
-class LMHead(nn.Module):
+class OpenAIGPTLMHead(nn.Module):
     """ Language Model Head for the transformer """
 
-    def __init__(self, model, cfg):
-        super(LMHead, self).__init__()
+    def __init__(self, model_embeddings_weights, cfg):
+        super(OpenAIGPTLMHead, self).__init__()
         self.n_embd = cfg.n_embd
-        embed_shape = model.embed.weight.shape
+        self.set_embeddings_weights(model_embeddings_weights)
+
+    def set_embeddings_weights(self, model_embeddings_weights):
+        embed_shape = model_embeddings_weights.shape
         self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False)
-        self.decoder.weight = model.embed.weight # Tied weights
+        self.decoder.weight = model_embeddings_weights # Tied weights
 
     def forward(self, h):
         # Truncated Language modeling logits (we remove the last token)
@@ -172,14 +395,14 @@ class LMHead(nn.Module):
         return lm_logits
 
 
-class MultipleChoiceHead(nn.Module):
+class OpenAIGPTClfHead(nn.Module):
     """ Classifier Head for the transformer """
 
     def __init__(self, clf_token, cfg):
-        super(MultipleChoiceHead, self).__init__()
+        super(OpenAIGPTClfHead, self).__init__()
         self.n_embd = cfg.n_embd
         self.clf_token = clf_token
-        self.dropout = nn.Dropout2d(cfg.clf_pdrop)  # To reproduce the noise_shape parameter of TF implementation
+        self.dropout = nn.Dropout2d(cfg.resid_pdrop)  # To reproduce the noise_shape parameter of TF implementation
         self.linear = nn.Linear(cfg.n_embd, 1)
 
         nn.init.normal_(self.linear.weight, std = 0.02)
@@ -202,101 +425,71 @@ class MultipleChoiceHead(nn.Module):
         return clf_logits.view(-1, x.size(1))
 
 
-class ClfHead(nn.Module):
-    """Classification Head for the transformer
+class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
+    """ OpenAI GPT model """
 
-    TODO: test this class."""
-    def __init__(self, clf_token, cfg, n_class):
-        super(ClfHead, self).__init__()
-        self.n_embd = cfg.n_embd
-        self.clf_token = clf_token
-        self.dropout = nn.Dropout(cfg.clf_pdrop)
-        self.linear = nn.Linear(cfg.n_embd, n_class)
+    def __init__(self, cfg):
+        super(OpenAIGPTModel, self).__init__(cfg)
+        total_embeddings_size = cfg.vocab_size + cfg.n_special + cfg.n_ctx
+        self.embed = nn.Embedding(total_embeddings_size, cfg.n_embd)
+        self.drop = nn.Dropout(cfg.embd_pdrop)
+        block = Block(cfg.n_ctx, cfg, scale=True)
+        self.h = nn.ModuleList([copy.deepcopy(block) for _ in range(cfg.n_layer)])
 
-        nn.init.normal_(self.linear.weight, std = 0.02)
-        nn.init.normal_(self.linear.bias, 0)
+        self.apply(self.init_weights)
+        # nn.init.normal_(self.embed.weight, std=0.02)
 
-    def forward(self, h, x):
-        clf_h = h.view(-1, self.n_embd)
-        flat = x[..., 0].contiguous().view(-1)
-        clf_h = clf_h[flat == self.clf_token, :]
-        clf_h = self.dropout(clf_h)
-        clf_logits = self.linear(clf_h)
-
-        return clf_logits
-
-class SimilarityHead(nn.Module):
-    """ Similarity Head for the transformer
-
-        TODO: test this class."""
-    def __init__(self, clf_token, cfg):
-        super(SimilarityHead, self).__init__()
-        self.n_embd = cfg.n_embd
-        self.clf_token = clf_token
-        self.dropout = nn.Dropout(cfg.clf_pdrop)
-        self.linear = nn.Linear(cfg.n_embd, 1)
-
-        nn.init.normal_(self.linear.weight, std = 0.02)
-        nn.init.normal_(self.linear.bias, 0)
-
-    def forward(self, h, x):
-        sim_h = h.view(-1, self.n_embd)
-        flat = x[..., 0].contiguous().view(-1)
-        sim_h = sim_h[flat == self.clf_token, :]
-        sim_h = self.dropout(sim_h)
-        sim_h = sim_h.sum(dim = 1)
-        sim_logits = self.linear(sim_h)
-
-        return sim_logits
-
-class DoubleHeadModel(nn.Module):
-    """ Transformer with language model and task specific heads """
-    def __init__(self, cfg, clf_token, task_head_type, vocab=40990, n_ctx=512):
-        super(DoubleHeadModel, self).__init__()
-        self.transformer = TransformerModel(cfg, vocab=vocab, n_ctx=n_ctx)
-        self.lm_head = LMHead(self.transformer, cfg)
-        if isinstance(task_head_type, str):
-            if task_head_type == 'multiple_choice':
-                self.task_head = MultipleChoiceHead(clf_token, cfg)
-            elif task_head_type == 'similarity':
-                self.task_head = SimilarityHead(clf_token, cfg)
-            elif task_head_type == 'inference':
-                # the three classes correspond to entailment, contradiction and neutral.
-                self.task_head = ClfHead(clf_token, cfg, 3)
-            else:
-                raise ValueError("task_head_type is expected to be 'multiple_choice' "
-                                 "'similarity', 'inference' or ('classification', n_class) "
-                                 f"got {task_head_type}.")
-        elif isinstance(task_head_type, collections.abc.Sequence) and len(task_head_type) == 2 and \
-             task_head_type[0] == 'classification':
-            n_class = task_head_type[1]
-            self.task_head = ClfHead(clf_token, cfg, n_class)
-        else:
-            raise ValueError("task_head_type is expected to be 'multiple_choice' "
-                             "'similarity', 'inference' or ('classification', n_class) "
-                             f"got {task_head_type}.")
+    def set_num_special_tokens(self, num_special_tokens):
+        # Update config
+        self.config.n_special = num_special_tokens
+        # # Build new embeddings and initialize
+        old_embed = self.embed
+        self.embed = nn.Embedding(self.config.total_num_embeddings, self.config.n_embd)
+        # Initialize all new embeddings (in particular the special tokens)
+        self.init_weights(self.embed)
+        # Copy word and positional embeddings from the previous weights
+        self.embed.weight.data[:self.config.vocab_size, :] = old_embed.weight.data[:self.config.vocab_size, :]
+        self.embed.weight.data[-self.config.n_ctx:, :] = old_embed.weight.data[-self.config.n_ctx:, :]
 
     def forward(self, x):
+        x = x.view(-1, x.size(-2), x.size(-1))
+        e = self.embed(x)
+        # Add the position information to the input embeddings
+        h = e.sum(dim=2)
+        for block in self.h:
+            h = block(h)
+        return h
+
+
+class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
+    """ OpenAI GPT model with language model and classification heads """
+    def __init__(self, cfg, clf_token='[CLS]'):
+        super(OpenAIGPTDoubleHeadsModel, self).__init__(cfg)
+        self.transformer = OpenAIGPTModel(cfg)
+        self.lm_head = OpenAIGPTLMHead(self.transformer.embed.weight, cfg)
+        self.clf_head = OpenAIGPTClfHead(clf_token, cfg)
+        self.apply(self.init_weights)
+
+    def post_loading(self):
+        " Set the number of special tokens to 1 (for the [CLS] token) "
+        self.set_num_special_tokens(1)
+
+    def set_num_special_tokens(self, num_special_tokens):
+        " Update input and output embeddings with new embedding matrice "
+        self.transformer.set_num_special_tokens(num_special_tokens)
+        self.lm_head.set_embeddings_weights(self.transformer.embed.weight)
+
+    def forward(self, x, lm_labels=None, clf_labels=None):
         h = self.transformer(x)
         lm_logits = self.lm_head(h)
-        task_logits = self.task_head(h, x)
-
-        return lm_logits, task_logits
-
-
-class dotdict(dict):
-    """dot.notation access to dictionary attributes"""
-    __getattr__ = dict.get
-    __setattr__ = dict.__setitem__
-    __delattr__ = dict.__delitem__
-
-
-DEFAULT_CONFIG = dotdict({
-    'n_embd': 768,
-    'n_head': 12,
-    'n_layer': 12,
-    'embd_pdrop': 0.1,
-    'attn_pdrop': 0.1,
-    'resid_pdrop': 0.1,
-    'afn': 'gelu',
-    'clf_pdrop': 0.1})
+        clf_logits = self.clf_head(h, x)
+        losses = []
+        if lm_labels is not None:
+            loss_fct = CrossEntropyLoss()
+            losses.append(loss_fct(lm_logits, lm_labels))
+        if clf_labels is not None:
+            loss_fct = CrossEntropyLoss()
+            losses.append(loss_fct(clf_logits, clf_labels))
+        if losses:
+            return losses
+        return lm_logits, clf_logits
diff --git a/pytorch_pretrained_bert/optimization_openai.py b/pytorch_pretrained_bert/optimization_openai.py
index 991d2699b3..4cc815c9ea 100644
--- a/pytorch_pretrained_bert/optimization_openai.py
+++ b/pytorch_pretrained_bert/optimization_openai.py
@@ -1,6 +1,23 @@
+# coding=utf-8
+# Copyright 2018 The Open AI Team Authors and The HugginFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch optimization for OpenAI GPT model."""
+
 import math
 import torch
 from torch.optim import Optimizer
+from torch.optim.optimizer import required
 from torch.nn.utils import clip_grad_norm_
 
 def warmup_cosine(x, warmup=0.002):
@@ -25,26 +42,41 @@ SCHEDULES = {
 class OpenAIAdam(Optimizer):
     """Implements Open AI version of Adam algorithm with weight decay fix.
     """
-    def __init__(self, params, lr, schedule, warmup, t_total,
-                 b1=0.9, b2=0.999, e=1e-8, l2=0,
+    def __init__(self, params, lr=required, schedule='warmup_linear', warmup=-1, t_total=-1,
+                 b1=0.9, b2=0.999, e=1e-8, weight_decay=0,
                  vector_l2=False, max_grad_norm=-1, **kwargs):
-        if not 0.0 <= lr:
-            raise ValueError("Invalid learning rate: {}".format(lr))
+        if lr is not required and lr < 0.0:
+            raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
         if schedule not in SCHEDULES:
             raise ValueError("Invalid schedule parameter: {}".format(schedule))
-        if not 0 <= warmup:
-            raise ValueError("Invalid warmup: {}".format(warmup))
+        if not 0.0 <= warmup < 1.0 and not warmup == -1:
+            raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup))
         if not 0.0 <= b1 < 1.0:
             raise ValueError("Invalid b1 parameter: {}".format(b1))
         if not 0.0 <= b2 < 1.0:
             raise ValueError("Invalid b2 parameter: {}".format(b2))
-        if not 0.0 <= e:
+        if not e >= 0.0:
             raise ValueError("Invalid epsilon value: {}".format(e))
         defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total,
-                        b1=b1, b2=b2, e=e, l2=l2, vector_l2=vector_l2,
+                        b1=b1, b2=b2, e=e, weight_decay=weight_decay, vector_l2=vector_l2,
                         max_grad_norm=max_grad_norm)
         super(OpenAIAdam, self).__init__(params, defaults)
 
+    def get_lr(self):
+        lr = []
+        for group in self.param_groups:
+            for p in group['params']:
+                state = self.state[p]
+                if len(state) == 0:
+                    return [0]
+                if group['t_total'] != -1:
+                    schedule_fct = SCHEDULES[group['schedule']]
+                    lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup'])
+                else:
+                    lr_scheduled = group['lr']
+                lr.append(lr_scheduled)
+        return lr
+
     def step(self, closure=None):
         """Performs a single optimization step.
 
@@ -91,14 +123,18 @@ class OpenAIAdam(Optimizer):
                 bias_correction1 = 1 - beta1 ** state['step']
                 bias_correction2 = 1 - beta2 ** state['step']
 
-                schedule_fct = SCHEDULES[group['schedule']]
-                lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup'])
+                if group['t_total'] != -1:
+                    schedule_fct = SCHEDULES[group['schedule']]
+                    lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup'])
+                else:
+                    lr_scheduled = group['lr']
+
                 step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1
 
                 p.data.addcdiv_(-step_size, exp_avg, denom)
 
                 # Add weight decay at the end (fixed version)
-                if (len(p.size()) > 1 or group['vector_l2']) and group['l2'] > 0:
-                    p.data.add_(-lr_scheduled * group['l2'], p.data)
+                if (len(p.size()) > 1 or group['vector_l2']) and group['weight_decay'] > 0:
+                    p.data.add_(-lr_scheduled * group['weight_decay'], p.data)
 
         return loss
diff --git a/pytorch_pretrained_bert/tokenization_openai.py b/pytorch_pretrained_bert/tokenization_openai.py
index 59d78f2f1e..dd0df83e93 100644
--- a/pytorch_pretrained_bert/tokenization_openai.py
+++ b/pytorch_pretrained_bert/tokenization_openai.py
@@ -1,9 +1,39 @@
+# coding=utf-8
+# Copyright 2018 The Open AI Team Authors and The HugginFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for OpenAI GPT."""
+import os
 import re
-import ftfy
 import json
-import spacy
-
 from tqdm import tqdm
+import logging
+
+from .file_utils import cached_path
+
+logger = logging.getLogger(__name__)
+
+PRETRAINED_VOCAB_ARCHIVE_MAP = {
+    'openai-gpt': "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json",
+}
+PRETRAINED_MERGES_ARCHIVE_MAP = {
+    'openai-gpt': "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt",
+}
+PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
+    'openai-gpt': 512,
+}
+VOCAB_NAME = 'vocab.json'
+MERGES_NAME = 'merges.txt'
 
 def get_pairs(word):
     """
@@ -32,16 +62,65 @@ def text_standardize(text):
     text = re.sub(r'[^\S\n]+', ' ', text)
     return text.strip()
 
-class TextEncoder(object):
+class OpenAIGPTTokenizer(object):
     """
     mostly a wrapper for a public python bpe tokenizer
     """
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name, cache_dir=None, *inputs, **kwargs):
+        """
+        Instantiate a PreTrainedBertModel from a pre-trained model file.
+        Download and cache the pre-trained model file if needed.
+        """
+        if pretrained_model_name in PRETRAINED_VOCAB_ARCHIVE_MAP:
+            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name]
+            merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name]
+        else:
+            vocab_file = pretrained_model_name
+        if os.path.isdir(vocab_file):
+            vocab_file = os.path.join(vocab_file, VOCAB_NAME)
+            merges_file = os.path.join(vocab_file, MERGES_NAME)
+        # redirect to the cache, if necessary
+        try:
+            resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
+            resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir)
+        except FileNotFoundError:
+            logger.error(
+                "Model name '{}' was not found in model name list ({}). "
+                "We assumed '{}' was a path or url but couldn't find any file "
+                "associated to this path or url.".format(
+                    pretrained_model_name,
+                    ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
+                    vocab_file))
+            return None
+        if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file:
+            logger.info("loading vocabulary file {}".format(vocab_file))
+            logger.info("loading merges file {}".format(merges_file))
+        else:
+            logger.info("loading vocabulary file {} from cache at {}".format(
+                vocab_file, resolved_vocab_file))
+            logger.info("loading merges file {} from cache at {}".format(
+                merges_file, resolved_merges_file))
+        if pretrained_model_name in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
+            # if we're using a pretrained model, ensure the tokenizer wont index sequences longer
+            # than the number of positional embeddings
+            max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name]
+            kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
+        # Instantiate tokenizer.
+        tokenizer = cls(resolved_vocab_file, resolved_merges_file, *inputs, **kwargs)
+        return tokenizer
+
+    def __init__(self, vocab_file, merges_file):
+        try:
+            import ftfy
+            import spacy
+        except ImportError:
+            raise ImportError("Please install ftfy and spacy to use OpenAI GPT tokenizer.")
 
-    def __init__(self, encoder_path, bpe_path):
         self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat'])
-        self.encoder = json.load(open(encoder_path))
+        self.encoder = json.load(open(vocab_file))
         self.decoder = {v:k for k,v in self.encoder.items()}
-        merges = open(bpe_path, encoding='utf-8').read().split('\n')[1:-1]
+        merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
         merges = [tuple(merge.split()) for merge in merges]
         self.bpe_ranks = dict(zip(merges, range(len(merges))))
         self.cache = {}
@@ -89,7 +168,7 @@ class TextEncoder(object):
         self.cache[token] = word
         return word
 
-    def encode(self, texts, verbose=True):
+    def tokenize(self, texts, verbose=True):
         texts_tokens = []
         if verbose:
             for text in tqdm(texts, ncols=80, leave=False):
diff --git a/setup.py b/setup.py
index e9b5c077c4..cf2f906100 100644
--- a/setup.py
+++ b/setup.py
@@ -37,8 +37,8 @@ from setuptools import find_packages, setup
 
 setup(
     name="pytorch_pretrained_bert",
-    version="0.4.0",
-    author="Thomas Wolf, Victor Sanh, Tim Rault, Google AI Language Team Authors",
+    version="0.5.0",
+    author="Thomas Wolf, Victor Sanh, Tim Rault, Google AI Language Team Authors, Open AI team Authors",
     author_email="thomas@huggingface.co",
     description="PyTorch version of Google AI BERT model with script to load Google pre-trained models",
     long_description=open("README.md", "r", encoding='utf-8').read(),
@@ -55,7 +55,7 @@ setup(
                       'tqdm'],
     entry_points={
       'console_scripts': [
-        "pytorch_pretrained_bert=pytorch_pretrained_bert.__main__:main"
+        "pytorch_pretrained_bert=pytorch_pretrained_bert.__main__:main",
       ]
     },
     python_requires='>=3.5.0',