From 93f563b8a87d6928979206260dbc129aa10bae83 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 7 Jan 2019 12:55:36 +0100
Subject: [PATCH 01/82] adding OpenAI GPT

---
 .../convert_openai_checkpoint_to_pytorch.py   | 174 ++++++++++
 pytorch_pretrained_bert/modeling.py           |  22 +-
 pytorch_pretrained_bert/modeling_openai.py    | 302 ++++++++++++++++++
 .../optimization_openai.py                    | 104 ++++++
 .../tokenization_openai.py                    | 108 +++++++
 5 files changed, 699 insertions(+), 11 deletions(-)
 create mode 100755 pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py
 create mode 100644 pytorch_pretrained_bert/modeling_openai.py
 create mode 100644 pytorch_pretrained_bert/optimization_openai.py
 create mode 100644 pytorch_pretrained_bert/tokenization_openai.py

diff --git a/pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py b/pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py
new file mode 100755
index 0000000000..59791450ee
--- /dev/null
+++ b/pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py
@@ -0,0 +1,174 @@
+# coding=utf-8
+# Copyright 2018 The HugginFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Convert BERT checkpoint."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import re
+import argparse
+import tensorflow as tf
+import torch
+import numpy as np
+
+from .modeling import BertConfig, BertForPreTraining
+
+
+def convert_openai_checkpoint_to_pytorch(open_checkpoint_folder_path, openai_config_file, pytorch_dump_path):
+def load_openai_pretrained_model(model, n_ctx=-1, n_special=-1, n_transfer=12, n_embd=768, path='./model/',
+                                 path_names='./'):
+    # Load weights from TF model
+    print("Loading weights...")
+    names = json.load(open(path_names + 'parameters_names.json'))
+    shapes = json.load(open(path + 'params_shapes.json'))
+    offsets = np.cumsum([np.prod(shape) for shape in shapes])
+    init_params = [np.load(path + 'params_{}.npy'.format(n)) for n in range(10)]
+    init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1]
+    init_params = [param.reshape(shape) for param, shape in zip(init_params, shapes)]
+    if n_ctx > 0:
+        init_params[0] = init_params[0][:n_ctx]
+    if n_special > 0:
+        init_params[0] = np.concatenate(
+            [init_params[1],
+             (np.random.randn(n_special, n_embd) * 0.02).astype(np.float32),
+             init_params[0]
+             ], 0)
+    else:
+        init_params[0] = np.concatenate(
+            [init_params[1],
+             init_params[0]
+             ], 0)
+    del init_params[1]
+    if n_transfer == -1:
+        n_transfer = 0
+    else:
+        n_transfer = 1 + n_transfer * 12
+    init_params = [arr.squeeze() for arr in init_params]
+
+    try:
+        assert model.embed.weight.shape == init_params[0].shape
+    except AssertionError as e:
+        e.args += (model.embed.weight.shape, init_params[0].shape)
+        raise
+
+    model.embed.weight.data = torch.from_numpy(init_params[0])
+
+    for name, ip in zip(names[1:n_transfer], init_params[1:n_transfer]):
+        name = name[6:]  # skip "model/"
+        assert name[-2:] == ":0"
+        name = name[:-2]
+        name = name.split('/')
+        pointer = model
+        for m_name in name:
+            if re.fullmatch(r'[A-Za-z]+\d+', m_name):
+                l = re.split(r'(\d+)', m_name)
+            else:
+                l = [m_name]
+            pointer = getattr(pointer, l[0])
+            if len(l) >= 2:
+                num = int(l[1])
+                pointer = pointer[num]
+        try:
+            assert pointer.shape == ip.shape
+        except AssertionError as e:
+            e.args += (pointer.shape, ip.shape)
+            raise
+        pointer.data = torch.from_numpy(ip)
+
+def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):
+    config_path = os.path.abspath(bert_config_file)
+    tf_path = os.path.abspath(tf_checkpoint_path)
+    print("Converting TensorFlow checkpoint from {} with config at {}".format(tf_path, config_path))
+    # Load weights from TF model
+    init_vars = tf.train.list_variables(tf_path)
+    names = []
+    arrays = []
+    for name, shape in init_vars:
+        print("Loading TF weight {} with shape {}".format(name, shape))
+        array = tf.train.load_variable(tf_path, name)
+        names.append(name)
+        arrays.append(array)
+
+    # Initialise PyTorch model
+    config = BertConfig.from_json_file(bert_config_file)
+    print("Building PyTorch model from configuration: {}".format(str(config)))
+    model = BertForPreTraining(config)
+
+    for name, array in zip(names, arrays):
+        name = name.split('/')
+        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
+        # which are not required for using pretrained model
+        if any(n in ["adam_v", "adam_m"] for n in name):
+            print("Skipping {}".format("/".join(name)))
+            continue
+        pointer = model
+        for m_name in name:
+            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
+                l = re.split(r'_(\d+)', m_name)
+            else:
+                l = [m_name]
+            if l[0] == 'kernel' or l[0] == 'gamma':
+                pointer = getattr(pointer, 'weight')
+            elif l[0] == 'output_bias' or l[0] == 'beta':
+                pointer = getattr(pointer, 'bias')
+            elif l[0] == 'output_weights':
+                pointer = getattr(pointer, 'weight')
+            else:
+                pointer = getattr(pointer, l[0])
+            if len(l) >= 2:
+                num = int(l[1])
+                pointer = pointer[num]
+        if m_name[-11:] == '_embeddings':
+            pointer = getattr(pointer, 'weight')
+        elif m_name == 'kernel':
+            array = np.transpose(array)
+        try:
+            assert pointer.shape == array.shape
+        except AssertionError as e:
+            e.args += (pointer.shape, array.shape)
+            raise
+        print("Initialize PyTorch weight {}".format(name))
+        pointer.data = torch.from_numpy(array)
+
+    # Save pytorch-model
+    print("Save PyTorch model to {}".format(pytorch_dump_path))
+    torch.save(model.state_dict(), pytorch_dump_path)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    ## Required parameters
+    parser.add_argument("--tf_checkpoint_path",
+                        default = None,
+                        type = str,
+                        required = True,
+                        help = "Path the TensorFlow checkpoint path.")
+    parser.add_argument("--bert_config_file",
+                        default = None,
+                        type = str,
+                        required = True,
+                        help = "The config json file corresponding to the pre-trained BERT model. \n"
+                            "This specifies the model architecture.")
+    parser.add_argument("--pytorch_dump_path",
+                        default = None,
+                        type = str,
+                        required = True,
+                        help = "Path to the output PyTorch model.")
+    args = parser.parse_args()
+    convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path,
+                                     args.bert_config_file,
+                                     args.pytorch_dump_path)
diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py
index acdc741f6d..650918af7f 100644
--- a/pytorch_pretrained_bert/modeling.py
+++ b/pytorch_pretrained_bert/modeling.py
@@ -416,12 +416,12 @@ class BertPreTrainingHeads(nn.Module):
         return prediction_scores, seq_relationship_score
 
 
-class PreTrainedBertModel(nn.Module):
+class PreTrainedModel(nn.Module):
     """ An abstract class to handle weights initialization and
         a simple interface for dowloading and loading pretrained models.
     """
     def __init__(self, config, *inputs, **kwargs):
-        super(PreTrainedBertModel, self).__init__()
+        super(PreTrainedModel, self).__init__()
         if not isinstance(config, BertConfig):
             raise ValueError(
                 "Parameter config in `{}(config)` should be an instance of class `BertConfig`. "
@@ -447,7 +447,7 @@ class PreTrainedBertModel(nn.Module):
     @classmethod
     def from_pretrained(cls, pretrained_model_name, state_dict=None, cache_dir=None, *inputs, **kwargs):
         """
-        Instantiate a PreTrainedBertModel from a pre-trained model file or a pytorch state dict.
+        Instantiate a PreTrainedModel from a pre-trained model file or a pytorch state dict.
         Download and cache the pre-trained model file if needed.
 
         Params:
@@ -551,7 +551,7 @@ class PreTrainedBertModel(nn.Module):
         return model
 
 
-class BertModel(PreTrainedBertModel):
+class BertModel(PreTrainedModel):
     """BERT model ("Bidirectional Embedding Representations from a Transformer").
 
     Params:
@@ -634,7 +634,7 @@ class BertModel(PreTrainedBertModel):
         return encoded_layers, pooled_output
 
 
-class BertForPreTraining(PreTrainedBertModel):
+class BertForPreTraining(PreTrainedModel):
     """BERT model with pre-training heads.
     This module comprises the BERT model followed by the two pre-training heads:
         - the masked language modeling head, and
@@ -705,7 +705,7 @@ class BertForPreTraining(PreTrainedBertModel):
             return prediction_scores, seq_relationship_score
 
 
-class BertForMaskedLM(PreTrainedBertModel):
+class BertForMaskedLM(PreTrainedModel):
     """BERT model with the masked language modeling head.
     This module comprises the BERT model followed by the masked language modeling head.
 
@@ -766,7 +766,7 @@ class BertForMaskedLM(PreTrainedBertModel):
             return prediction_scores
 
 
-class BertForNextSentencePrediction(PreTrainedBertModel):
+class BertForNextSentencePrediction(PreTrainedModel):
     """BERT model with next sentence prediction head.
     This module comprises the BERT model followed by the next sentence classification head.
 
@@ -828,7 +828,7 @@ class BertForNextSentencePrediction(PreTrainedBertModel):
             return seq_relationship_score
 
 
-class BertForSequenceClassification(PreTrainedBertModel):
+class BertForSequenceClassification(PreTrainedModel):
     """BERT model for classification.
     This module is composed of the BERT model with a linear layer on top of
     the pooled output.
@@ -894,7 +894,7 @@ class BertForSequenceClassification(PreTrainedBertModel):
             return logits
 
 
-class BertForMultipleChoice(PreTrainedBertModel):
+class BertForMultipleChoice(PreTrainedModel):
     """BERT model for multiple choice tasks.
     This module is composed of the BERT model with a linear layer on top of
     the pooled output.
@@ -963,7 +963,7 @@ class BertForMultipleChoice(PreTrainedBertModel):
             return reshaped_logits
 
 
-class BertForTokenClassification(PreTrainedBertModel):
+class BertForTokenClassification(PreTrainedModel):
     """BERT model for token-level classification.
     This module is composed of the BERT model with a linear layer on top of
     the full hidden state of the last layer.
@@ -1029,7 +1029,7 @@ class BertForTokenClassification(PreTrainedBertModel):
             return logits
 
 
-class BertForQuestionAnswering(PreTrainedBertModel):
+class BertForQuestionAnswering(PreTrainedModel):
     """BERT model for Question Answering (span extraction).
     This module is composed of the BERT model with a linear layer on top of
     the sequence output that computes start_logits and end_logits
diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py
new file mode 100644
index 0000000000..349baee79b
--- /dev/null
+++ b/pytorch_pretrained_bert/modeling_openai.py
@@ -0,0 +1,302 @@
+import copy
+import json
+import math
+import re
+import collections
+
+import numpy as np
+import torch
+import torch.nn as nn
+from torch.nn.parameter import Parameter
+
+from .modeling import BertLayerNorm as LayerNorm
+
+
+def gelu(x):
+    return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
+
+
+def swish(x):
+    return x * torch.sigmoid(x)
+
+
+ACT_FNS = {
+    'relu': nn.ReLU,
+    'swish': swish,
+    'gelu': gelu
+}
+
+
+class Conv1D(nn.Module):
+    def __init__(self, nf, rf, nx):
+        super(Conv1D, self).__init__()
+        self.rf = rf
+        self.nf = nf
+        if rf == 1:  # faster 1x1 conv
+            w = torch.empty(nx, nf)
+            nn.init.normal_(w, std=0.02)
+            self.w = Parameter(w)
+            self.b = Parameter(torch.zeros(nf))
+        else:  # was used to train LM
+            raise NotImplementedError
+
+    def forward(self, x):
+        if self.rf == 1:
+            size_out = x.size()[:-1] + (self.nf,)
+            x = torch.addmm(self.b, x.view(-1, x.size(-1)), self.w)
+            x = x.view(*size_out)
+        else:
+            raise NotImplementedError
+        return x
+
+
+class Attention(nn.Module):
+    def __init__(self, nx, n_ctx, cfg, scale=False):
+        super(Attention, self).__init__()
+        n_state = nx  # in Attention: n_state=768 (nx=n_embd)
+        # [switch nx => n_state from Block to Attention to keep identical to TF implem]
+        assert n_state % cfg.n_head == 0
+        self.register_buffer('b', torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx))
+        self.n_head = cfg.n_head
+        self.split_size = n_state
+        self.scale = scale
+        self.c_attn = Conv1D(n_state * 3, 1, nx)
+        self.c_proj = Conv1D(n_state, 1, nx)
+        self.attn_dropout = nn.Dropout(cfg.attn_pdrop)
+        self.resid_dropout = nn.Dropout(cfg.resid_pdrop)
+
+    def _attn(self, q, k, v):
+        w = torch.matmul(q, k)
+        if self.scale:
+            w = w / math.sqrt(v.size(-1))
+        w = w * self.b + -1e9 * (1 - self.b)  # TF implem method: mask_attn_weights
+        w = nn.Softmax(dim=-1)(w)
+        w = self.attn_dropout(w)
+        return torch.matmul(w, v)
+
+    def merge_heads(self, x):
+        x = x.permute(0, 2, 1, 3).contiguous()
+        new_x_shape = x.size()[:-2] + (x.size(-2) * x.size(-1),)
+        return x.view(*new_x_shape)  # in Tensorflow implem: fct merge_states
+
+    def split_heads(self, x, k=False):
+        new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head)
+        x = x.view(*new_x_shape)  # in Tensorflow implem: fct split_states
+        if k:
+            return x.permute(0, 2, 3, 1)
+        else:
+            return x.permute(0, 2, 1, 3)
+
+    def forward(self, x):
+        x = self.c_attn(x)
+        query, key, value = x.split(self.split_size, dim=2)
+        query = self.split_heads(query)
+        key = self.split_heads(key, k=True)
+        value = self.split_heads(value)
+        a = self._attn(query, key, value)
+        a = self.merge_heads(a)
+        a = self.c_proj(a)
+        a = self.resid_dropout(a)
+        return a
+
+
+class MLP(nn.Module):
+    def __init__(self, n_state, cfg):  # in MLP: n_state=3072 (4 * n_embd)
+        super(MLP, self).__init__()
+        nx = cfg.n_embd
+        self.c_fc = Conv1D(n_state, 1, nx)
+        self.c_proj = Conv1D(nx, 1, n_state)
+        self.act = ACT_FNS[cfg.afn]
+        self.dropout = nn.Dropout(cfg.resid_pdrop)
+
+    def forward(self, x):
+        h = self.act(self.c_fc(x))
+        h2 = self.c_proj(h)
+        return self.dropout(h2)
+
+
+class Block(nn.Module):
+    def __init__(self, n_ctx, cfg, scale=False):
+        super(Block, self).__init__()
+        nx = cfg.n_embd
+        self.attn = Attention(nx, n_ctx, cfg, scale)
+        self.ln_1 = LayerNorm(nx)
+        self.mlp = MLP(4 * nx, cfg)
+        self.ln_2 = LayerNorm(nx)
+
+    def forward(self, x):
+        a = self.attn(x)
+        n = self.ln_1(x + a)
+        m = self.mlp(n)
+        h = self.ln_2(n + m)
+        return h
+
+
+class TransformerModel(nn.Module):
+    """ Transformer model """
+
+    def __init__(self, cfg, vocab=40990, n_ctx=512):
+        super(TransformerModel, self).__init__()
+        self.vocab = vocab
+        self.embed = nn.Embedding(vocab, cfg.n_embd)
+        self.drop = nn.Dropout(cfg.embd_pdrop)
+        block = Block(n_ctx, cfg, scale=True)
+        self.h = nn.ModuleList([copy.deepcopy(block) for _ in range(cfg.n_layer)])
+
+        nn.init.normal_(self.embed.weight, std=0.02)
+
+    def forward(self, x):
+        x = x.view(-1, x.size(-2), x.size(-1))
+        e = self.embed(x)
+        # Add the position information to the input embeddings
+        h = e.sum(dim=2)
+        for block in self.h:
+            h = block(h)
+        return h
+
+
+class LMHead(nn.Module):
+    """ Language Model Head for the transformer """
+
+    def __init__(self, model, cfg):
+        super(LMHead, self).__init__()
+        self.n_embd = cfg.n_embd
+        embed_shape = model.embed.weight.shape
+        self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False)
+        self.decoder.weight = model.embed.weight # Tied weights
+
+    def forward(self, h):
+        # Truncated Language modeling logits (we remove the last token)
+        h_trunc = h[:, :-1].contiguous().view(-1, self.n_embd)
+        lm_logits = self.decoder(h_trunc)
+        return lm_logits
+
+
+class MultipleChoiceHead(nn.Module):
+    """ Classifier Head for the transformer """
+
+    def __init__(self, clf_token, cfg):
+        super(MultipleChoiceHead, self).__init__()
+        self.n_embd = cfg.n_embd
+        self.clf_token = clf_token
+        self.dropout = nn.Dropout2d(cfg.clf_pdrop)  # To reproduce the noise_shape parameter of TF implementation
+        self.linear = nn.Linear(cfg.n_embd, 1)
+
+        nn.init.normal_(self.linear.weight, std = 0.02)
+        nn.init.normal_(self.linear.bias, 0)
+
+    def forward(self, h, x):
+        # Classification logits
+        clf_h = h.view(-1, self.n_embd)
+        flat = x[..., 0].contiguous().view(-1)
+        clf_h = clf_h[flat == self.clf_token, :]
+        clf_h = clf_h.view(-1, x.size(1), self.n_embd, 1)
+        # This double transposition is there to replicate the behavior
+        # of the noise_shape argument in the tensorflow
+        # implementation.  For more details, see
+        # https://github.com/huggingface/pytorch-openai-transformer-lm/issues/11
+        clf_h = self.dropout(clf_h.transpose(1, 2)).transpose(1, 2)
+        clf_h = clf_h.contiguous().view(-1, self.n_embd)
+        clf_logits = self.linear(clf_h)
+
+        return clf_logits.view(-1, x.size(1))
+
+
+class ClfHead(nn.Module):
+    """Classification Head for the transformer
+
+    TODO: test this class."""
+    def __init__(self, clf_token, cfg, n_class):
+        super(ClfHead, self).__init__()
+        self.n_embd = cfg.n_embd
+        self.clf_token = clf_token
+        self.dropout = nn.Dropout(cfg.clf_pdrop)
+        self.linear = nn.Linear(cfg.n_embd, n_class)
+
+        nn.init.normal_(self.linear.weight, std = 0.02)
+        nn.init.normal_(self.linear.bias, 0)
+
+    def forward(self, h, x):
+        clf_h = h.view(-1, self.n_embd)
+        flat = x[..., 0].contiguous().view(-1)
+        clf_h = clf_h[flat == self.clf_token, :]
+        clf_h = self.dropout(clf_h)
+        clf_logits = self.linear(clf_h)
+
+        return clf_logits
+
+class SimilarityHead(nn.Module):
+    """ Similarity Head for the transformer
+
+        TODO: test this class."""
+    def __init__(self, clf_token, cfg):
+        super(SimilarityHead, self).__init__()
+        self.n_embd = cfg.n_embd
+        self.clf_token = clf_token
+        self.dropout = nn.Dropout(cfg.clf_pdrop)
+        self.linear = nn.Linear(cfg.n_embd, 1)
+
+        nn.init.normal_(self.linear.weight, std = 0.02)
+        nn.init.normal_(self.linear.bias, 0)
+
+    def forward(self, h, x):
+        sim_h = h.view(-1, self.n_embd)
+        flat = x[..., 0].contiguous().view(-1)
+        sim_h = sim_h[flat == self.clf_token, :]
+        sim_h = self.dropout(sim_h)
+        sim_h = sim_h.sum(dim = 1)
+        sim_logits = self.linear(sim_h)
+
+        return sim_logits
+
+class DoubleHeadModel(nn.Module):
+    """ Transformer with language model and task specific heads """
+    def __init__(self, cfg, clf_token, task_head_type, vocab=40990, n_ctx=512):
+        super(DoubleHeadModel, self).__init__()
+        self.transformer = TransformerModel(cfg, vocab=vocab, n_ctx=n_ctx)
+        self.lm_head = LMHead(self.transformer, cfg)
+        if isinstance(task_head_type, str):
+            if task_head_type == 'multiple_choice':
+                self.task_head = MultipleChoiceHead(clf_token, cfg)
+            elif task_head_type == 'similarity':
+                self.task_head = SimilarityHead(clf_token, cfg)
+            elif task_head_type == 'inference':
+                # the three classes correspond to entailment, contradiction and neutral.
+                self.task_head = ClfHead(clf_token, cfg, 3)
+            else:
+                raise ValueError("task_head_type is expected to be 'multiple_choice' "
+                                 "'similarity', 'inference' or ('classification', n_class) "
+                                 f"got {task_head_type}.")
+        elif isinstance(task_head_type, collections.abc.Sequence) and len(task_head_type) == 2 and \
+             task_head_type[0] == 'classification':
+            n_class = task_head_type[1]
+            self.task_head = ClfHead(clf_token, cfg, n_class)
+        else:
+            raise ValueError("task_head_type is expected to be 'multiple_choice' "
+                             "'similarity', 'inference' or ('classification', n_class) "
+                             f"got {task_head_type}.")
+
+    def forward(self, x):
+        h = self.transformer(x)
+        lm_logits = self.lm_head(h)
+        task_logits = self.task_head(h, x)
+
+        return lm_logits, task_logits
+
+
+class dotdict(dict):
+    """dot.notation access to dictionary attributes"""
+    __getattr__ = dict.get
+    __setattr__ = dict.__setitem__
+    __delattr__ = dict.__delitem__
+
+
+DEFAULT_CONFIG = dotdict({
+    'n_embd': 768,
+    'n_head': 12,
+    'n_layer': 12,
+    'embd_pdrop': 0.1,
+    'attn_pdrop': 0.1,
+    'resid_pdrop': 0.1,
+    'afn': 'gelu',
+    'clf_pdrop': 0.1})
diff --git a/pytorch_pretrained_bert/optimization_openai.py b/pytorch_pretrained_bert/optimization_openai.py
new file mode 100644
index 0000000000..991d2699b3
--- /dev/null
+++ b/pytorch_pretrained_bert/optimization_openai.py
@@ -0,0 +1,104 @@
+import math
+import torch
+from torch.optim import Optimizer
+from torch.nn.utils import clip_grad_norm_
+
+def warmup_cosine(x, warmup=0.002):
+    s = 1 if x <= warmup else 0
+    return s*(x/warmup) + (1-s)*(0.5 * (1 + torch.cos(math.pi * x)))
+
+def warmup_constant(x, warmup=0.002):
+    s = 1 if x <= warmup else 0
+    return s*(x/warmup) + (1-s)*1
+
+def warmup_linear(x, warmup=0.002):
+    s = 1 if x <= warmup else 0
+    return (s*(x/warmup) + (1-s))*(1-x)
+
+SCHEDULES = {
+    'warmup_cosine':warmup_cosine,
+    'warmup_constant':warmup_constant,
+    'warmup_linear':warmup_linear,
+}
+
+
+class OpenAIAdam(Optimizer):
+    """Implements Open AI version of Adam algorithm with weight decay fix.
+    """
+    def __init__(self, params, lr, schedule, warmup, t_total,
+                 b1=0.9, b2=0.999, e=1e-8, l2=0,
+                 vector_l2=False, max_grad_norm=-1, **kwargs):
+        if not 0.0 <= lr:
+            raise ValueError("Invalid learning rate: {}".format(lr))
+        if schedule not in SCHEDULES:
+            raise ValueError("Invalid schedule parameter: {}".format(schedule))
+        if not 0 <= warmup:
+            raise ValueError("Invalid warmup: {}".format(warmup))
+        if not 0.0 <= b1 < 1.0:
+            raise ValueError("Invalid b1 parameter: {}".format(b1))
+        if not 0.0 <= b2 < 1.0:
+            raise ValueError("Invalid b2 parameter: {}".format(b2))
+        if not 0.0 <= e:
+            raise ValueError("Invalid epsilon value: {}".format(e))
+        defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total,
+                        b1=b1, b2=b2, e=e, l2=l2, vector_l2=vector_l2,
+                        max_grad_norm=max_grad_norm)
+        super(OpenAIAdam, self).__init__(params, defaults)
+
+    def step(self, closure=None):
+        """Performs a single optimization step.
+
+        Arguments:
+            closure (callable, optional): A closure that reevaluates the model
+                and returns the loss.
+        """
+        loss = None
+        if closure is not None:
+            loss = closure()
+
+        for group in self.param_groups:
+            for p in group['params']:
+                if p.grad is None:
+                    continue
+                grad = p.grad.data
+                if grad.is_sparse:
+                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
+
+                state = self.state[p]
+
+                # State initialization
+                if len(state) == 0:
+                    state['step'] = 0
+                    # Exponential moving average of gradient values
+                    state['exp_avg'] = torch.zeros_like(p.data)
+                    # Exponential moving average of squared gradient values
+                    state['exp_avg_sq'] = torch.zeros_like(p.data)
+
+                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
+                beta1, beta2 = group['b1'], group['b2']
+
+                state['step'] += 1
+
+                # Add grad clipping
+                if group['max_grad_norm'] > 0:
+                    clip_grad_norm_(p, group['max_grad_norm'])
+
+                # Decay the first and second moment running average coefficient
+                exp_avg.mul_(beta1).add_(1 - beta1, grad)
+                exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
+                denom = exp_avg_sq.sqrt().add_(group['e'])
+
+                bias_correction1 = 1 - beta1 ** state['step']
+                bias_correction2 = 1 - beta2 ** state['step']
+
+                schedule_fct = SCHEDULES[group['schedule']]
+                lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup'])
+                step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1
+
+                p.data.addcdiv_(-step_size, exp_avg, denom)
+
+                # Add weight decay at the end (fixed version)
+                if (len(p.size()) > 1 or group['vector_l2']) and group['l2'] > 0:
+                    p.data.add_(-lr_scheduled * group['l2'], p.data)
+
+        return loss
diff --git a/pytorch_pretrained_bert/tokenization_openai.py b/pytorch_pretrained_bert/tokenization_openai.py
new file mode 100644
index 0000000000..59d78f2f1e
--- /dev/null
+++ b/pytorch_pretrained_bert/tokenization_openai.py
@@ -0,0 +1,108 @@
+import re
+import ftfy
+import json
+import spacy
+
+from tqdm import tqdm
+
+def get_pairs(word):
+    """
+    Return set of symbol pairs in a word.
+    word is represented as tuple of symbols (symbols being variable-length strings)
+    """
+    pairs = set()
+    prev_char = word[0]
+    for char in word[1:]:
+        pairs.add((prev_char, char))
+        prev_char = char
+    return pairs
+
+def text_standardize(text):
+    """
+    fixes some issues the spacy tokenizer had on books corpus
+    also does some whitespace standardization
+    """
+    text = text.replace('—', '-')
+    text = text.replace('–', '-')
+    text = text.replace('―', '-')
+    text = text.replace('…', '...')
+    text = text.replace('´', "'")
+    text = re.sub(r'''(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)''', r' \1 ', text)
+    text = re.sub(r'\s*\n\s*', ' \n ', text)
+    text = re.sub(r'[^\S\n]+', ' ', text)
+    return text.strip()
+
+class TextEncoder(object):
+    """
+    mostly a wrapper for a public python bpe tokenizer
+    """
+
+    def __init__(self, encoder_path, bpe_path):
+        self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat'])
+        self.encoder = json.load(open(encoder_path))
+        self.decoder = {v:k for k,v in self.encoder.items()}
+        merges = open(bpe_path, encoding='utf-8').read().split('\n')[1:-1]
+        merges = [tuple(merge.split()) for merge in merges]
+        self.bpe_ranks = dict(zip(merges, range(len(merges))))
+        self.cache = {}
+
+    def bpe(self, token):
+        word = tuple(token[:-1]) + ( token[-1] + '</w>',)
+        if token in self.cache:
+            return self.cache[token]
+        pairs = get_pairs(word)
+
+        if not pairs:
+            return token+'</w>'
+
+        while True:
+            bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
+            if bigram not in self.bpe_ranks:
+                break
+            first, second = bigram
+            new_word = []
+            i = 0
+            while i < len(word):
+                try:
+                    j = word.index(first, i)
+                    new_word.extend(word[i:j])
+                    i = j
+                except:
+                    new_word.extend(word[i:])
+                    break
+
+                if word[i] == first and i < len(word)-1 and word[i+1] == second:
+                    new_word.append(first+second)
+                    i += 2
+                else:
+                    new_word.append(word[i])
+                    i += 1
+            new_word = tuple(new_word)
+            word = new_word
+            if len(word) == 1:
+                break
+            else:
+                pairs = get_pairs(word)
+        word = ' '.join(word)
+        if word == '\n  </w>':
+            word = '\n</w>'
+        self.cache[token] = word
+        return word
+
+    def encode(self, texts, verbose=True):
+        texts_tokens = []
+        if verbose:
+            for text in tqdm(texts, ncols=80, leave=False):
+                text = self.nlp(text_standardize(ftfy.fix_text(text)))
+                text_tokens = []
+                for token in text:
+                    text_tokens.extend([self.encoder.get(t, 0) for t in self.bpe(token.text.lower()).split(' ')])
+                texts_tokens.append(text_tokens)
+        else:
+            for text in texts:
+                text = self.nlp(text_standardize(ftfy.fix_text(text)))
+                text_tokens = []
+                for token in text:
+                    text_tokens.extend([self.encoder.get(t, 0) for t in self.bpe(token.text.lower()).split(' ')])
+                texts_tokens.append(text_tokens)
+        return texts_tokens

From eed51c5bdf0d26159127f82f0fe95265b076e1af Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 8 Jan 2019 12:26:58 +0100
Subject: [PATCH 02/82] add OpenAI GPT

---
 pytorch_pretrained_bert/__init__.py           |   5 +-
 pytorch_pretrained_bert/__main__.py           |  48 +-
 .../convert_openai_checkpoint_to_pytorch.py   | 151 +++---
 pytorch_pretrained_bert/modeling.py           |  31 +-
 pytorch_pretrained_bert/modeling_openai.py    | 447 +++++++++++++-----
 .../optimization_openai.py                    |  60 ++-
 .../tokenization_openai.py                    |  95 +++-
 setup.py                                      |   6 +-
 8 files changed, 573 insertions(+), 270 deletions(-)

diff --git a/pytorch_pretrained_bert/__init__.py b/pytorch_pretrained_bert/__init__.py
index 0ef8263748..c940549364 100644
--- a/pytorch_pretrained_bert/__init__.py
+++ b/pytorch_pretrained_bert/__init__.py
@@ -1,8 +1,11 @@
-__version__ = "0.4.0"
+__version__ = "0.5.0"
 from .tokenization import BertTokenizer, BasicTokenizer, WordpieceTokenizer
+from .tokenization_openai import OpenAIGPTTokenizer
 from .modeling import (BertConfig, BertModel, BertForPreTraining,
                        BertForMaskedLM, BertForNextSentencePrediction,
                        BertForSequenceClassification, BertForMultipleChoice,
                        BertForTokenClassification, BertForQuestionAnswering)
+from .modeling_openai import OpenAIGPTConfig, OpenAIGPTModel, OpenAIGPTDoubleHeadsModel
 from .optimization import BertAdam
+from .optimization_openai import OpenAIAdam
 from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE
diff --git a/pytorch_pretrained_bert/__main__.py b/pytorch_pretrained_bert/__main__.py
index 79ad842932..1557adc63f 100644
--- a/pytorch_pretrained_bert/__main__.py
+++ b/pytorch_pretrained_bert/__main__.py
@@ -1,22 +1,40 @@
 # coding: utf8
 def main():
     import sys
-    try:
-        from .convert_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch
-    except ModuleNotFoundError:
-        print("pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, "
-              "In that case, it requires TensorFlow to be installed. Please see "
-              "https://www.tensorflow.org/install/ for installation instructions.")
-        raise
-
-    if len(sys.argv) != 5:
-        # pylint: disable=line-too-long
-        print("Should be used as `pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`")
+    if (len(sys.argv) != 4 and len(sys.argv) != 5) or sys.argv[1] not in [
+        "convert_tf_checkpoint_to_pytorch",
+        "convert_openai_checkpoint"
+    ]:
+        print("Should be used as `pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT` \n or `pytorch_pretrained_bert convert_openai_checkpoint OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`")
     else:
-        PYTORCH_DUMP_OUTPUT = sys.argv.pop()
-        TF_CONFIG = sys.argv.pop()
-        TF_CHECKPOINT = sys.argv.pop()
-        convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT)
+        if sys.argv[1] == "convert_tf_checkpoint_to_pytorch":
+            try:
+                from .convert_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch
+            except ModuleNotFoundError:
+                print("pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, "
+                    "In that case, it requires TensorFlow to be installed. Please see "
+                    "https://www.tensorflow.org/install/ for installation instructions.")
+                raise
+
+            if len(sys.argv) != 5:
+                # pylint: disable=line-too-long
+                print("Should be used as `pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`")
+            else:
+                PYTORCH_DUMP_OUTPUT = sys.argv.pop()
+                TF_CONFIG = sys.argv.pop()
+                TF_CHECKPOINT = sys.argv.pop()
+                convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT)
+        else:
+            from .convert_openai_checkpoint_to_pytorch import convert_openai_checkpoint_to_pytorch
+            OPENAI_GPT_CHECKPOINT_FOLDER_PATH = sys.argv[2]
+            PYTORCH_DUMP_OUTPUT = sys.argv[3]
+            if len(sys.argv) == 5:
+                OPENAI_GPT_CONFIG = sys.argv[4]
+            else:
+                OPENAI_GPT_CONFIG = ""
+            convert_openai_checkpoint_to_pytorch(OPENAI_GPT_CHECKPOINT_FOLDER_PATH,
+                                                 OPENAI_GPT_CONFIG,
+                                                 PYTORCH_DUMP_OUTPUT)
 
 if __name__ == '__main__':
     main()
diff --git a/pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py b/pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py
index 59791450ee..2e25d16e61 100755
--- a/pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py
+++ b/pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Convert BERT checkpoint."""
+"""Convert OpenAI GPT checkpoint."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -20,45 +20,53 @@ from __future__ import print_function
 
 import os
 import re
+import json
 import argparse
 import tensorflow as tf
 import torch
 import numpy as np
 
-from .modeling import BertConfig, BertForPreTraining
+from .modeling_openai import OpenAIGPTConfig, OpenAIGPTModel, CONFIG_NAME, WEIGHTS_NAME
 
 
-def convert_openai_checkpoint_to_pytorch(open_checkpoint_folder_path, openai_config_file, pytorch_dump_path):
-def load_openai_pretrained_model(model, n_ctx=-1, n_special=-1, n_transfer=12, n_embd=768, path='./model/',
-                                 path_names='./'):
+def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_config_file, pytorch_dump_folder_path):
     # Load weights from TF model
     print("Loading weights...")
-    names = json.load(open(path_names + 'parameters_names.json'))
-    shapes = json.load(open(path + 'params_shapes.json'))
+    names = json.load(open(openai_checkpoint_folder_path + '/parameters_names.json', "r", encoding='utf-8'))
+    shapes = json.load(open(openai_checkpoint_folder_path + '/params_shapes.json', "r", encoding='utf-8'))
     offsets = np.cumsum([np.prod(shape) for shape in shapes])
-    init_params = [np.load(path + 'params_{}.npy'.format(n)) for n in range(10)]
+    init_params = [np.load(openai_checkpoint_folder_path + '/params_{}.npy'.format(n)) for n in range(10)]
     init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1]
     init_params = [param.reshape(shape) for param, shape in zip(init_params, shapes)]
-    if n_ctx > 0:
-        init_params[0] = init_params[0][:n_ctx]
-    if n_special > 0:
-        init_params[0] = np.concatenate(
-            [init_params[1],
-             (np.random.randn(n_special, n_embd) * 0.02).astype(np.float32),
-             init_params[0]
-             ], 0)
-    else:
-        init_params[0] = np.concatenate(
-            [init_params[1],
-             init_params[0]
-             ], 0)
+    # if n_ctx > 0:
+    #     init_params[0] = init_params[0][:n_ctx]
+    # if n_special > 0:
+    #     init_params[0] = np.concatenate(
+    #         [init_params[1],
+    #          (np.random.randn(n_special, n_embd) * 0.02).astype(np.float32),
+    #          init_params[0]
+    #          ], 0)
+    # else:
+    #     init_params[0] = np.concatenate(
+    #         [init_params[1],
+    #          init_params[0]
+    #          ], 0)
+    # del init_params[1]
+    # if n_transfer == -1:
+    #     n_transfer = 0
+    # else:
+    #     n_transfer = 1 + n_transfer * 12
+
+    init_params[0] = np.concatenate([init_params[1], init_params[0]], 0)
     del init_params[1]
-    if n_transfer == -1:
-        n_transfer = 0
-    else:
-        n_transfer = 1 + n_transfer * 12
     init_params = [arr.squeeze() for arr in init_params]
 
+    # Construct model
+    if openai_config_file == "":
+        config = OpenAIGPTConfig()
+    else:
+        config = OpenAIGPTConfig(openai_config_file)
+    model = OpenAIGPTModel(config)
     try:
         assert model.embed.weight.shape == init_params[0].shape
     except AssertionError as e:
@@ -66,8 +74,10 @@ def load_openai_pretrained_model(model, n_ctx=-1, n_special=-1, n_transfer=12, n
         raise
 
     model.embed.weight.data = torch.from_numpy(init_params[0])
+    names.pop(0)
+    init_params.pop(0)
 
-    for name, ip in zip(names[1:n_transfer], init_params[1:n_transfer]):
+    for name, array in zip(names, init_params): # names[1:n_transfer], init_params[1:n_transfer]):
         name = name[6:]  # skip "model/"
         assert name[-2:] == ":0"
         name = name[:-2]
@@ -78,64 +88,22 @@ def load_openai_pretrained_model(model, n_ctx=-1, n_special=-1, n_transfer=12, n
                 l = re.split(r'(\d+)', m_name)
             else:
                 l = [m_name]
-            pointer = getattr(pointer, l[0])
-            if len(l) >= 2:
-                num = int(l[1])
-                pointer = pointer[num]
-        try:
-            assert pointer.shape == ip.shape
-        except AssertionError as e:
-            e.args += (pointer.shape, ip.shape)
-            raise
-        pointer.data = torch.from_numpy(ip)
-
-def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):
-    config_path = os.path.abspath(bert_config_file)
-    tf_path = os.path.abspath(tf_checkpoint_path)
-    print("Converting TensorFlow checkpoint from {} with config at {}".format(tf_path, config_path))
-    # Load weights from TF model
-    init_vars = tf.train.list_variables(tf_path)
-    names = []
-    arrays = []
-    for name, shape in init_vars:
-        print("Loading TF weight {} with shape {}".format(name, shape))
-        array = tf.train.load_variable(tf_path, name)
-        names.append(name)
-        arrays.append(array)
-
-    # Initialise PyTorch model
-    config = BertConfig.from_json_file(bert_config_file)
-    print("Building PyTorch model from configuration: {}".format(str(config)))
-    model = BertForPreTraining(config)
-
-    for name, array in zip(names, arrays):
-        name = name.split('/')
-        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
-        # which are not required for using pretrained model
-        if any(n in ["adam_v", "adam_m"] for n in name):
-            print("Skipping {}".format("/".join(name)))
-            continue
-        pointer = model
-        for m_name in name:
-            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
-                l = re.split(r'_(\d+)', m_name)
-            else:
-                l = [m_name]
-            if l[0] == 'kernel' or l[0] == 'gamma':
+            if l[0] == 'g':
                 pointer = getattr(pointer, 'weight')
-            elif l[0] == 'output_bias' or l[0] == 'beta':
+            elif l[0] == 'b':
                 pointer = getattr(pointer, 'bias')
-            elif l[0] == 'output_weights':
+            elif l[0] == 'w':
                 pointer = getattr(pointer, 'weight')
             else:
                 pointer = getattr(pointer, l[0])
             if len(l) >= 2:
                 num = int(l[1])
                 pointer = pointer[num]
-        if m_name[-11:] == '_embeddings':
-            pointer = getattr(pointer, 'weight')
-        elif m_name == 'kernel':
-            array = np.transpose(array)
+        try:
+            assert pointer.shape == array.shape
+        except AssertionError as e:
+            e.args += (pointer.shape, array.shape)
+            raise
         try:
             assert pointer.shape == array.shape
         except AssertionError as e:
@@ -145,30 +113,33 @@ def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytor
         pointer.data = torch.from_numpy(array)
 
     # Save pytorch-model
-    print("Save PyTorch model to {}".format(pytorch_dump_path))
-    torch.save(model.state_dict(), pytorch_dump_path)
-
+    pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME
+    pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME
+    print("Save PyTorch model to {}".format(pytorch_weights_dump_path))
+    torch.save(model.state_dict(), pytorch_weights_dump_path)
+    print("Save configuration file to {}".format(pytorch_config_dump_path))
+    with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
+        f.write(config.to_json_string())
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     ## Required parameters
-    parser.add_argument("--tf_checkpoint_path",
+    parser.add_argument("--openai_checkpoint_folder_path",
                         default = None,
                         type = str,
                         required = True,
                         help = "Path the TensorFlow checkpoint path.")
-    parser.add_argument("--bert_config_file",
-                        default = None,
-                        type = str,
-                        required = True,
-                        help = "The config json file corresponding to the pre-trained BERT model. \n"
-                            "This specifies the model architecture.")
-    parser.add_argument("--pytorch_dump_path",
+    parser.add_argument("--pytorch_dump_folder_path",
                         default = None,
                         type = str,
                         required = True,
                         help = "Path to the output PyTorch model.")
+    parser.add_argument("--openai_config_file",
+                        default = "",
+                        type = str,
+                        help = "An optional config json file corresponding to the pre-trained OpenAI model. \n"
+                            "This specifies the model architecture.")
     args = parser.parse_args()
-    convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path,
-                                     args.bert_config_file,
-                                     args.pytorch_dump_path)
+    convert_openai_checkpoint_to_pytorch(args.openai_checkpoint_folder_path,
+                                         args.openai_config_file,
+                                         args.pytorch_dump_folder_path)
diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py
index 2d5967fb46..d2a0cf8dd2 100644
--- a/pytorch_pretrained_bert/modeling.py
+++ b/pytorch_pretrained_bert/modeling.py
@@ -416,12 +416,12 @@ class BertPreTrainingHeads(nn.Module):
         return prediction_scores, seq_relationship_score
 
 
-class PreTrainedModel(nn.Module):
+class BertPreTrainedModel(nn.Module):
     """ An abstract class to handle weights initialization and
         a simple interface for dowloading and loading pretrained models.
     """
     def __init__(self, config, *inputs, **kwargs):
-        super(PreTrainedModel, self).__init__()
+        super(BertPreTrainedModel, self).__init__()
         if not isinstance(config, BertConfig):
             raise ValueError(
                 "Parameter config in `{}(config)` should be an instance of class `BertConfig`. "
@@ -447,7 +447,7 @@ class PreTrainedModel(nn.Module):
     @classmethod
     def from_pretrained(cls, pretrained_model_name, state_dict=None, cache_dir=None, *inputs, **kwargs):
         """
-        Instantiate a PreTrainedModel from a pre-trained model file or a pytorch state dict.
+        Instantiate a BertPreTrainedModel from a pre-trained model file or a pytorch state dict.
         Download and cache the pre-trained model file if needed.
 
         Params:
@@ -547,13 +547,16 @@ class PreTrainedModel(nn.Module):
         if len(unexpected_keys) > 0:
             logger.info("Weights from pretrained model not used in {}: {}".format(
                 model.__class__.__name__, unexpected_keys))
+        if len(error_msgs) > 0:
+            raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
+                               model.__class__.__name__, "\n\t".join(error_msgs)))
         if tempdir:
             # Clean up temp dir
             shutil.rmtree(tempdir)
         return model
 
 
-class BertModel(PreTrainedModel):
+class BertModel(BertPreTrainedModel):
     """BERT model ("Bidirectional Embedding Representations from a Transformer").
 
     Params:
@@ -636,7 +639,7 @@ class BertModel(PreTrainedModel):
         return encoded_layers, pooled_output
 
 
-class BertForPreTraining(PreTrainedModel):
+class BertForPreTraining(BertPreTrainedModel):
     """BERT model with pre-training heads.
     This module comprises the BERT model followed by the two pre-training heads:
         - the masked language modeling head, and
@@ -707,7 +710,7 @@ class BertForPreTraining(PreTrainedModel):
             return prediction_scores, seq_relationship_score
 
 
-class BertForMaskedLM(PreTrainedModel):
+class BertForMaskedLM(BertPreTrainedModel):
     """BERT model with the masked language modeling head.
     This module comprises the BERT model followed by the masked language modeling head.
 
@@ -768,7 +771,7 @@ class BertForMaskedLM(PreTrainedModel):
             return prediction_scores
 
 
-class BertForNextSentencePrediction(PreTrainedModel):
+class BertForNextSentencePrediction(BertPreTrainedModel):
     """BERT model with next sentence prediction head.
     This module comprises the BERT model followed by the next sentence classification head.
 
@@ -830,7 +833,7 @@ class BertForNextSentencePrediction(PreTrainedModel):
             return seq_relationship_score
 
 
-class BertForSequenceClassification(PreTrainedModel):
+class BertForSequenceClassification(BertPreTrainedModel):
     """BERT model for classification.
     This module is composed of the BERT model with a linear layer on top of
     the pooled output.
@@ -875,7 +878,7 @@ class BertForSequenceClassification(PreTrainedModel):
     logits = model(input_ids, token_type_ids, input_mask)
     ```
     """
-    def __init__(self, config, num_labels=2):
+    def __init__(self, config, num_labels):
         super(BertForSequenceClassification, self).__init__(config)
         self.num_labels = num_labels
         self.bert = BertModel(config)
@@ -896,7 +899,7 @@ class BertForSequenceClassification(PreTrainedModel):
             return logits
 
 
-class BertForMultipleChoice(PreTrainedModel):
+class BertForMultipleChoice(BertPreTrainedModel):
     """BERT model for multiple choice tasks.
     This module is composed of the BERT model with a linear layer on top of
     the pooled output.
@@ -940,7 +943,7 @@ class BertForMultipleChoice(PreTrainedModel):
     logits = model(input_ids, token_type_ids, input_mask)
     ```
     """
-    def __init__(self, config, num_choices=2):
+    def __init__(self, config, num_choices):
         super(BertForMultipleChoice, self).__init__(config)
         self.num_choices = num_choices
         self.bert = BertModel(config)
@@ -965,7 +968,7 @@ class BertForMultipleChoice(PreTrainedModel):
             return reshaped_logits
 
 
-class BertForTokenClassification(PreTrainedModel):
+class BertForTokenClassification(BertPreTrainedModel):
     """BERT model for token-level classification.
     This module is composed of the BERT model with a linear layer on top of
     the full hidden state of the last layer.
@@ -1010,7 +1013,7 @@ class BertForTokenClassification(PreTrainedModel):
     logits = model(input_ids, token_type_ids, input_mask)
     ```
     """
-    def __init__(self, config, num_labels=2):
+    def __init__(self, config, num_labels):
         super(BertForTokenClassification, self).__init__(config)
         self.num_labels = num_labels
         self.bert = BertModel(config)
@@ -1031,7 +1034,7 @@ class BertForTokenClassification(PreTrainedModel):
             return logits
 
 
-class BertForQuestionAnswering(PreTrainedModel):
+class BertForQuestionAnswering(BertPreTrainedModel):
     """BERT model for Question Answering (span extraction).
     This module is composed of the BERT model with a linear layer on top of
     the sequence output that computes start_logits and end_logits
diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py
index 349baee79b..8e8ca0db00 100644
--- a/pytorch_pretrained_bert/modeling_openai.py
+++ b/pytorch_pretrained_bert/modeling_openai.py
@@ -1,16 +1,28 @@
+import os
 import copy
 import json
 import math
-import re
+import logging
+import tarfile
+import tempfile
+import shutil
 import collections
 
-import numpy as np
 import torch
 import torch.nn as nn
+from torch.nn import CrossEntropyLoss
 from torch.nn.parameter import Parameter
 
 from .modeling import BertLayerNorm as LayerNorm
+from .file_utils import cached_path
 
+logger = logging.getLogger(__name__)
+
+PRETRAINED_MODEL_ARCHIVE_MAP = {
+    'openai-gpt': "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt.tar.gz",
+}
+CONFIG_NAME = 'openai_gpt_config.json'
+WEIGHTS_NAME = 'pytorch_model.bin'
 
 def gelu(x):
     return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
@@ -26,6 +38,237 @@ ACT_FNS = {
     'gelu': gelu
 }
 
+class OpenAIGPTConfig(object):
+    """Configuration class to store the configuration of a `OpenAIGPTModel`.
+    """
+    def __init__(self,
+                 vocab_size_or_config_json_file=40478,
+                 n_special=0,
+                 n_ctx=512,
+                 n_embd=768,
+                 n_layer=12,
+                 n_head=12,
+                 intermediate_size=3072,
+                 afn="gelu",
+                 resid_pdrop=0.1,
+                 embd_pdrop=0.1,
+                 attn_pdrop=0.1,
+                 type_vocab_size=2,
+                 initializer_range=0.02):
+        """Constructs OpenAIGPTConfig.
+
+        Args:
+            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file.
+            n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLF]', ...)
+            n_ctx: Number of positional embeddings.
+            n_embd: Dimensionality of the embeddings and hidden states.
+            n_layer: Number of hidden layers in the Transformer encoder.
+            n_head: Number of attention heads for each attention layer in
+                the Transformer encoder.
+            intermediate_size: The size of the "intermediate" (i.e., feed-forward)
+                layer in the Transformer encoder.
+            afn: The non-linear activation function (function or string) in the
+                encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
+            resid_pdrop: The dropout probabilitiy for all fully connected
+                layers in the embeddings, encoder, and pooler.
+            attn_pdrop: The dropout ratio for the attention
+                probabilities.
+            embd_pdrop: The dropout ratio for the embeddings.
+            type_vocab_size: The vocabulary size of the `token_type_ids` passed into
+                `OpenAIGPTModel`.
+            initializer_range: The sttdev of the truncated_normal_initializer for
+                initializing all weight matrices.
+        """
+        if isinstance(vocab_size_or_config_json_file, str):
+            with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
+                json_config = json.loads(reader.read())
+            for key, value in json_config.items():
+                self.__dict__[key] = value
+        elif isinstance(vocab_size_or_config_json_file, int):
+            self.vocab_size = vocab_size_or_config_json_file
+            self.n_special = n_special
+            self.n_ctx = n_ctx
+            self.n_embd = n_embd
+            self.n_layer = n_layer
+            self.n_head = n_head
+            self.afn = afn
+            self.intermediate_size = intermediate_size
+            self.resid_pdrop = resid_pdrop
+            self.embd_pdrop = embd_pdrop
+            self.attn_pdrop = attn_pdrop
+            self.type_vocab_size = type_vocab_size
+            self.initializer_range = initializer_range
+        else:
+            raise ValueError("First argument must be either a vocabulary size (int)"
+                             "or the path to a pretrained model config file (str)")
+
+    @property
+    def total_num_embeddings(self):
+        return self.vocab_size + self.n_special + self.n_ctx
+
+    @classmethod
+    def from_dict(cls, json_object):
+        """Constructs a `OpenAIGPTConfig` from a Python dictionary of parameters."""
+        config = OpenAIGPTConfig(vocab_size_or_config_json_file=-1)
+        for key, value in json_object.items():
+            config.__dict__[key] = value
+        return config
+
+    @classmethod
+    def from_json_file(cls, json_file):
+        """Constructs a `OpenAIGPTConfig` from a json file of parameters."""
+        with open(json_file, "r", encoding='utf-8') as reader:
+            text = reader.read()
+        return cls.from_dict(json.loads(text))
+
+    def __repr__(self):
+        return str(self.to_json_string())
+
+    def to_dict(self):
+        """Serializes this instance to a Python dictionary."""
+        output = copy.deepcopy(self.__dict__)
+        return output
+
+    def to_json_string(self):
+        """Serializes this instance to a JSON string."""
+        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
+
+class OpenAIGPTPreTrainedModel(nn.Module):
+    """ An abstract class to handle weights initialization and
+        a simple interface for dowloading and loading pretrained models.
+    """
+    def __init__(self, config, *inputs, **kwargs):
+        super(OpenAIGPTPreTrainedModel, self).__init__()
+        if not isinstance(config, OpenAIGPTConfig):
+            raise ValueError(
+                "Parameter config in `{}(config)` should be an instance of class `OpenAIGPTConfig`. "
+                "To create a model from a Google pretrained model use "
+                "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
+                    self.__class__.__name__, self.__class__.__name__
+                ))
+        self.config = config
+
+    def init_weights(self, module):
+        """ Initialize the weights.
+        """
+        if isinstance(module, (nn.Linear, nn.Embedding)):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+        elif isinstance(module, LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        if isinstance(module, nn.Linear) and module.bias is not None:
+            module.bias.data.zero_()
+
+    def post_loading(self):
+        pass
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name, state_dict=None, cache_dir=None, *inputs, **kwargs):
+        """
+        Instantiate a OpenAIGPTPreTrainedModel from a pre-trained model file or a pytorch state dict.
+        Download and cache the pre-trained model file if needed.
+
+        Params:
+            pretrained_model_name: either:
+                - a str with the name of a pre-trained model to load selected in the list of:
+                    . `openai-gpt`
+                - a path or url to a pretrained model archive containing:
+                    . `openai_gpt_config.json` a configuration file for the model
+                    . `pytorch_model.bin` a PyTorch dump of a OpenAIGPTModel instance
+            cache_dir: an optional path to a folder in which the pre-trained models will be cached.
+            state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models
+            *inputs, **kwargs: additional input for the specific Bert class
+                (ex: num_labels for BertForSequenceClassification)
+        """
+        if pretrained_model_name in PRETRAINED_MODEL_ARCHIVE_MAP:
+            archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name]
+        else:
+            archive_file = pretrained_model_name
+        # redirect to the cache, if necessary
+        try:
+            resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
+        except FileNotFoundError:
+            logger.error(
+                "Model name '{}' was not found in model name list ({}). "
+                "We assumed '{}' was a path or url but couldn't find any file "
+                "associated to this path or url.".format(
+                    pretrained_model_name,
+                    ', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()),
+                    archive_file))
+            return None
+        if resolved_archive_file == archive_file:
+            logger.info("loading archive file {}".format(archive_file))
+        else:
+            logger.info("loading archive file {} from cache at {}".format(
+                archive_file, resolved_archive_file))
+        tempdir = None
+        if os.path.isdir(resolved_archive_file):
+            serialization_dir = resolved_archive_file
+        else:
+            # Extract archive to temp dir
+            tempdir = tempfile.mkdtemp()
+            logger.info("extracting archive file {} to temp dir {}".format(
+                resolved_archive_file, tempdir))
+            with tarfile.open(resolved_archive_file, 'r:gz') as archive:
+                archive.extractall(tempdir)
+            serialization_dir = tempdir
+        # Load config
+        config_file = os.path.join(serialization_dir, CONFIG_NAME)
+        config = OpenAIGPTConfig.from_json_file(config_file)
+        logger.info("Model config {}".format(config))
+        # Instantiate model.
+        model = cls(config, *inputs, **kwargs)
+        if state_dict is None:
+            weights_path = os.path.join(serialization_dir, WEIGHTS_NAME)
+            state_dict = torch.load(weights_path)
+
+        old_keys = []
+        new_keys = []
+        for key in state_dict.keys():
+            new_key = None
+            if 'gamma' in key:
+                new_key = key.replace('gamma', 'weight')
+            if 'beta' in key:
+                new_key = key.replace('beta', 'bias')
+            if new_key:
+                old_keys.append(key)
+                new_keys.append(new_key)
+        for old_key, new_key in zip(old_keys, new_keys):
+            state_dict[new_key] = state_dict.pop(old_key)
+
+        missing_keys = []
+        unexpected_keys = []
+        error_msgs = []
+        # copy state_dict so _load_from_state_dict can modify it
+        metadata = getattr(state_dict, '_metadata', None)
+        state_dict = state_dict.copy()
+        if metadata is not None:
+            state_dict._metadata = metadata
+
+        def load(module, prefix=''):
+            local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
+            module._load_from_state_dict(
+                state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)
+            for name, child in module._modules.items():
+                if child is not None:
+                    load(child, prefix + name + '.')
+        load(model.transformer if hasattr(model, 'transformer') else model, prefix='')
+        if len(missing_keys) > 0:
+            logger.info("Weights of {} not initialized from pretrained model: {}".format(
+                model.__class__.__name__, missing_keys))
+        if len(unexpected_keys) > 0:
+            logger.info("Weights from pretrained model not used in {}: {}".format(
+                model.__class__.__name__, unexpected_keys))
+        if len(error_msgs) > 0:
+            raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
+                               model.__class__.__name__, "\n\t".join(error_msgs)))
+        model.post_loading()
+        if tempdir:
+            # Clean up temp dir
+            shutil.rmtree(tempdir)
+        return model
 
 class Conv1D(nn.Module):
     def __init__(self, nf, rf, nx):
@@ -35,15 +278,15 @@ class Conv1D(nn.Module):
         if rf == 1:  # faster 1x1 conv
             w = torch.empty(nx, nf)
             nn.init.normal_(w, std=0.02)
-            self.w = Parameter(w)
-            self.b = Parameter(torch.zeros(nf))
+            self.weight = Parameter(w)
+            self.bias = Parameter(torch.zeros(nf))
         else:  # was used to train LM
             raise NotImplementedError
 
     def forward(self, x):
         if self.rf == 1:
             size_out = x.size()[:-1] + (self.nf,)
-            x = torch.addmm(self.b, x.view(-1, x.size(-1)), self.w)
+            x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight)
             x = x.view(*size_out)
         else:
             raise NotImplementedError
@@ -132,38 +375,18 @@ class Block(nn.Module):
         return h
 
 
-class TransformerModel(nn.Module):
-    """ Transformer model """
-
-    def __init__(self, cfg, vocab=40990, n_ctx=512):
-        super(TransformerModel, self).__init__()
-        self.vocab = vocab
-        self.embed = nn.Embedding(vocab, cfg.n_embd)
-        self.drop = nn.Dropout(cfg.embd_pdrop)
-        block = Block(n_ctx, cfg, scale=True)
-        self.h = nn.ModuleList([copy.deepcopy(block) for _ in range(cfg.n_layer)])
-
-        nn.init.normal_(self.embed.weight, std=0.02)
-
-    def forward(self, x):
-        x = x.view(-1, x.size(-2), x.size(-1))
-        e = self.embed(x)
-        # Add the position information to the input embeddings
-        h = e.sum(dim=2)
-        for block in self.h:
-            h = block(h)
-        return h
-
-
-class LMHead(nn.Module):
+class OpenAIGPTLMHead(nn.Module):
     """ Language Model Head for the transformer """
 
-    def __init__(self, model, cfg):
-        super(LMHead, self).__init__()
+    def __init__(self, model_embeddings_weights, cfg):
+        super(OpenAIGPTLMHead, self).__init__()
         self.n_embd = cfg.n_embd
-        embed_shape = model.embed.weight.shape
+        self.set_embeddings_weights(model_embeddings_weights)
+
+    def set_embeddings_weights(self, model_embeddings_weights):
+        embed_shape = model_embeddings_weights.shape
         self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False)
-        self.decoder.weight = model.embed.weight # Tied weights
+        self.decoder.weight = model_embeddings_weights # Tied weights
 
     def forward(self, h):
         # Truncated Language modeling logits (we remove the last token)
@@ -172,14 +395,14 @@ class LMHead(nn.Module):
         return lm_logits
 
 
-class MultipleChoiceHead(nn.Module):
+class OpenAIGPTClfHead(nn.Module):
     """ Classifier Head for the transformer """
 
     def __init__(self, clf_token, cfg):
-        super(MultipleChoiceHead, self).__init__()
+        super(OpenAIGPTClfHead, self).__init__()
         self.n_embd = cfg.n_embd
         self.clf_token = clf_token
-        self.dropout = nn.Dropout2d(cfg.clf_pdrop)  # To reproduce the noise_shape parameter of TF implementation
+        self.dropout = nn.Dropout2d(cfg.resid_pdrop)  # To reproduce the noise_shape parameter of TF implementation
         self.linear = nn.Linear(cfg.n_embd, 1)
 
         nn.init.normal_(self.linear.weight, std = 0.02)
@@ -202,101 +425,71 @@ class MultipleChoiceHead(nn.Module):
         return clf_logits.view(-1, x.size(1))
 
 
-class ClfHead(nn.Module):
-    """Classification Head for the transformer
+class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
+    """ OpenAI GPT model """
 
-    TODO: test this class."""
-    def __init__(self, clf_token, cfg, n_class):
-        super(ClfHead, self).__init__()
-        self.n_embd = cfg.n_embd
-        self.clf_token = clf_token
-        self.dropout = nn.Dropout(cfg.clf_pdrop)
-        self.linear = nn.Linear(cfg.n_embd, n_class)
+    def __init__(self, cfg):
+        super(OpenAIGPTModel, self).__init__(cfg)
+        total_embeddings_size = cfg.vocab_size + cfg.n_special + cfg.n_ctx
+        self.embed = nn.Embedding(total_embeddings_size, cfg.n_embd)
+        self.drop = nn.Dropout(cfg.embd_pdrop)
+        block = Block(cfg.n_ctx, cfg, scale=True)
+        self.h = nn.ModuleList([copy.deepcopy(block) for _ in range(cfg.n_layer)])
 
-        nn.init.normal_(self.linear.weight, std = 0.02)
-        nn.init.normal_(self.linear.bias, 0)
+        self.apply(self.init_weights)
+        # nn.init.normal_(self.embed.weight, std=0.02)
 
-    def forward(self, h, x):
-        clf_h = h.view(-1, self.n_embd)
-        flat = x[..., 0].contiguous().view(-1)
-        clf_h = clf_h[flat == self.clf_token, :]
-        clf_h = self.dropout(clf_h)
-        clf_logits = self.linear(clf_h)
-
-        return clf_logits
-
-class SimilarityHead(nn.Module):
-    """ Similarity Head for the transformer
-
-        TODO: test this class."""
-    def __init__(self, clf_token, cfg):
-        super(SimilarityHead, self).__init__()
-        self.n_embd = cfg.n_embd
-        self.clf_token = clf_token
-        self.dropout = nn.Dropout(cfg.clf_pdrop)
-        self.linear = nn.Linear(cfg.n_embd, 1)
-
-        nn.init.normal_(self.linear.weight, std = 0.02)
-        nn.init.normal_(self.linear.bias, 0)
-
-    def forward(self, h, x):
-        sim_h = h.view(-1, self.n_embd)
-        flat = x[..., 0].contiguous().view(-1)
-        sim_h = sim_h[flat == self.clf_token, :]
-        sim_h = self.dropout(sim_h)
-        sim_h = sim_h.sum(dim = 1)
-        sim_logits = self.linear(sim_h)
-
-        return sim_logits
-
-class DoubleHeadModel(nn.Module):
-    """ Transformer with language model and task specific heads """
-    def __init__(self, cfg, clf_token, task_head_type, vocab=40990, n_ctx=512):
-        super(DoubleHeadModel, self).__init__()
-        self.transformer = TransformerModel(cfg, vocab=vocab, n_ctx=n_ctx)
-        self.lm_head = LMHead(self.transformer, cfg)
-        if isinstance(task_head_type, str):
-            if task_head_type == 'multiple_choice':
-                self.task_head = MultipleChoiceHead(clf_token, cfg)
-            elif task_head_type == 'similarity':
-                self.task_head = SimilarityHead(clf_token, cfg)
-            elif task_head_type == 'inference':
-                # the three classes correspond to entailment, contradiction and neutral.
-                self.task_head = ClfHead(clf_token, cfg, 3)
-            else:
-                raise ValueError("task_head_type is expected to be 'multiple_choice' "
-                                 "'similarity', 'inference' or ('classification', n_class) "
-                                 f"got {task_head_type}.")
-        elif isinstance(task_head_type, collections.abc.Sequence) and len(task_head_type) == 2 and \
-             task_head_type[0] == 'classification':
-            n_class = task_head_type[1]
-            self.task_head = ClfHead(clf_token, cfg, n_class)
-        else:
-            raise ValueError("task_head_type is expected to be 'multiple_choice' "
-                             "'similarity', 'inference' or ('classification', n_class) "
-                             f"got {task_head_type}.")
+    def set_num_special_tokens(self, num_special_tokens):
+        # Update config
+        self.config.n_special = num_special_tokens
+        # # Build new embeddings and initialize
+        old_embed = self.embed
+        self.embed = nn.Embedding(self.config.total_num_embeddings, self.config.n_embd)
+        # Initialize all new embeddings (in particular the special tokens)
+        self.init_weights(self.embed)
+        # Copy word and positional embeddings from the previous weights
+        self.embed.weight.data[:self.config.vocab_size, :] = old_embed.weight.data[:self.config.vocab_size, :]
+        self.embed.weight.data[-self.config.n_ctx:, :] = old_embed.weight.data[-self.config.n_ctx:, :]
 
     def forward(self, x):
+        x = x.view(-1, x.size(-2), x.size(-1))
+        e = self.embed(x)
+        # Add the position information to the input embeddings
+        h = e.sum(dim=2)
+        for block in self.h:
+            h = block(h)
+        return h
+
+
+class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
+    """ OpenAI GPT model with language model and classification heads """
+    def __init__(self, cfg, clf_token='[CLS]'):
+        super(OpenAIGPTDoubleHeadsModel, self).__init__(cfg)
+        self.transformer = OpenAIGPTModel(cfg)
+        self.lm_head = OpenAIGPTLMHead(self.transformer.embed.weight, cfg)
+        self.clf_head = OpenAIGPTClfHead(clf_token, cfg)
+        self.apply(self.init_weights)
+
+    def post_loading(self):
+        " Set the number of special tokens to 1 (for the [CLS] token) "
+        self.set_num_special_tokens(1)
+
+    def set_num_special_tokens(self, num_special_tokens):
+        " Update input and output embeddings with new embedding matrice "
+        self.transformer.set_num_special_tokens(num_special_tokens)
+        self.lm_head.set_embeddings_weights(self.transformer.embed.weight)
+
+    def forward(self, x, lm_labels=None, clf_labels=None):
         h = self.transformer(x)
         lm_logits = self.lm_head(h)
-        task_logits = self.task_head(h, x)
-
-        return lm_logits, task_logits
-
-
-class dotdict(dict):
-    """dot.notation access to dictionary attributes"""
-    __getattr__ = dict.get
-    __setattr__ = dict.__setitem__
-    __delattr__ = dict.__delitem__
-
-
-DEFAULT_CONFIG = dotdict({
-    'n_embd': 768,
-    'n_head': 12,
-    'n_layer': 12,
-    'embd_pdrop': 0.1,
-    'attn_pdrop': 0.1,
-    'resid_pdrop': 0.1,
-    'afn': 'gelu',
-    'clf_pdrop': 0.1})
+        clf_logits = self.clf_head(h, x)
+        losses = []
+        if lm_labels is not None:
+            loss_fct = CrossEntropyLoss()
+            losses.append(loss_fct(lm_logits, lm_labels))
+        if clf_labels is not None:
+            loss_fct = CrossEntropyLoss()
+            losses.append(loss_fct(clf_logits, clf_labels))
+        if losses:
+            return losses
+        return lm_logits, clf_logits
diff --git a/pytorch_pretrained_bert/optimization_openai.py b/pytorch_pretrained_bert/optimization_openai.py
index 991d2699b3..4cc815c9ea 100644
--- a/pytorch_pretrained_bert/optimization_openai.py
+++ b/pytorch_pretrained_bert/optimization_openai.py
@@ -1,6 +1,23 @@
+# coding=utf-8
+# Copyright 2018 The Open AI Team Authors and The HugginFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch optimization for OpenAI GPT model."""
+
 import math
 import torch
 from torch.optim import Optimizer
+from torch.optim.optimizer import required
 from torch.nn.utils import clip_grad_norm_
 
 def warmup_cosine(x, warmup=0.002):
@@ -25,26 +42,41 @@ SCHEDULES = {
 class OpenAIAdam(Optimizer):
     """Implements Open AI version of Adam algorithm with weight decay fix.
     """
-    def __init__(self, params, lr, schedule, warmup, t_total,
-                 b1=0.9, b2=0.999, e=1e-8, l2=0,
+    def __init__(self, params, lr=required, schedule='warmup_linear', warmup=-1, t_total=-1,
+                 b1=0.9, b2=0.999, e=1e-8, weight_decay=0,
                  vector_l2=False, max_grad_norm=-1, **kwargs):
-        if not 0.0 <= lr:
-            raise ValueError("Invalid learning rate: {}".format(lr))
+        if lr is not required and lr < 0.0:
+            raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
         if schedule not in SCHEDULES:
             raise ValueError("Invalid schedule parameter: {}".format(schedule))
-        if not 0 <= warmup:
-            raise ValueError("Invalid warmup: {}".format(warmup))
+        if not 0.0 <= warmup < 1.0 and not warmup == -1:
+            raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup))
         if not 0.0 <= b1 < 1.0:
             raise ValueError("Invalid b1 parameter: {}".format(b1))
         if not 0.0 <= b2 < 1.0:
             raise ValueError("Invalid b2 parameter: {}".format(b2))
-        if not 0.0 <= e:
+        if not e >= 0.0:
             raise ValueError("Invalid epsilon value: {}".format(e))
         defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total,
-                        b1=b1, b2=b2, e=e, l2=l2, vector_l2=vector_l2,
+                        b1=b1, b2=b2, e=e, weight_decay=weight_decay, vector_l2=vector_l2,
                         max_grad_norm=max_grad_norm)
         super(OpenAIAdam, self).__init__(params, defaults)
 
+    def get_lr(self):
+        """Return each parameter's scheduled learning rate, or [0] if any has no state yet."""
+        rates = []
+        for group in self.param_groups:
+            scheduled = group['t_total'] != -1  # -1 means: use the raw 'lr' of the group
+            for param in group['params']:
+                param_state = self.state[param]
+                if not param_state:
+                    return [0]
+                rate = group['lr']
+                if scheduled:
+                    rate = rate * SCHEDULES[group['schedule']](param_state['step']/group['t_total'], group['warmup'])
+                rates.append(rate)
+        return rates
+
     def step(self, closure=None):
         """Performs a single optimization step.
 
@@ -91,14 +123,18 @@ class OpenAIAdam(Optimizer):
                 bias_correction1 = 1 - beta1 ** state['step']
                 bias_correction2 = 1 - beta2 ** state['step']
 
-                schedule_fct = SCHEDULES[group['schedule']]
-                lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup'])
+                if group['t_total'] != -1:
+                    schedule_fct = SCHEDULES[group['schedule']]
+                    lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup'])
+                else:
+                    lr_scheduled = group['lr']
+
                 step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1
 
                 p.data.addcdiv_(-step_size, exp_avg, denom)
 
                 # Add weight decay at the end (fixed version)
-                if (len(p.size()) > 1 or group['vector_l2']) and group['l2'] > 0:
-                    p.data.add_(-lr_scheduled * group['l2'], p.data)
+                if (len(p.size()) > 1 or group['vector_l2']) and group['weight_decay'] > 0:
+                    p.data.add_(-lr_scheduled * group['weight_decay'], p.data)
 
         return loss
diff --git a/pytorch_pretrained_bert/tokenization_openai.py b/pytorch_pretrained_bert/tokenization_openai.py
index 59d78f2f1e..dd0df83e93 100644
--- a/pytorch_pretrained_bert/tokenization_openai.py
+++ b/pytorch_pretrained_bert/tokenization_openai.py
@@ -1,9 +1,39 @@
+# coding=utf-8
+# Copyright 2018 The Open AI Team Authors and The HugginFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for OpenAI GPT."""
+import os
 import re
-import ftfy
 import json
-import spacy
-
 from tqdm import tqdm
+import logging
+
+from .file_utils import cached_path
+
+logger = logging.getLogger(__name__)
+
+PRETRAINED_VOCAB_ARCHIVE_MAP = {
+    'openai-gpt': "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json",
+}
+PRETRAINED_MERGES_ARCHIVE_MAP = {
+    'openai-gpt': "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt",
+}
+PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
+    'openai-gpt': 512,
+}
+VOCAB_NAME = 'vocab.json'
+MERGES_NAME = 'merges.txt'
 
 def get_pairs(word):
     """
@@ -32,16 +62,65 @@ def text_standardize(text):
     text = re.sub(r'[^\S\n]+', ' ', text)
     return text.strip()
 
-class TextEncoder(object):
+class OpenAIGPTTokenizer(object):
     """
     mostly a wrapper for a public python bpe tokenizer
     """
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name, cache_dir=None, *inputs, **kwargs):
+        """
+        Instantiate a tokenizer from a pre-trained vocabulary file and merges file.
+        `pretrained_model_name` is either a key of PRETRAINED_VOCAB_ARCHIVE_MAP or a directory
+        holding VOCAB_NAME and MERGES_NAME; both files are resolved through `cached_path`.
+        Logs an error and returns None when a file cannot be found.
+        """
+        if pretrained_model_name in PRETRAINED_VOCAB_ARCHIVE_MAP:
+            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name]
+            merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name]
+        else:
+            # Join both names onto the directory itself, so merges_file is always bound.
+            vocab_file = os.path.join(pretrained_model_name, VOCAB_NAME)
+            merges_file = os.path.join(pretrained_model_name, MERGES_NAME)
+        # redirect to the cache, if necessary
+        try:
+            resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
+            resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir)
+        except FileNotFoundError:
+            logger.error(
+                "Model name '{}' was not found in model name list ({}). "
+                "We assumed '{}' was a path or url but couldn't find any file "
+                "associated to this path or url.".format(
+                    pretrained_model_name,
+                    ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
+                    vocab_file))
+            return None
+        if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file:
+            logger.info("loading vocabulary file {}".format(vocab_file))
+            logger.info("loading merges file {}".format(merges_file))
+        else:
+            logger.info("loading vocabulary file {} from cache at {}".format(
+                vocab_file, resolved_vocab_file))
+            logger.info("loading merges file {} from cache at {}".format(
+                merges_file, resolved_merges_file))
+        if pretrained_model_name in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
+            # if we're using a pretrained model, ensure the tokenizer wont index sequences longer
+            # than the number of positional embeddings
+            max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name]
+            kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
+        # NOTE(review): max_len is forwarded as a keyword - confirm __init__ accepts it.
+        return cls(resolved_vocab_file, resolved_merges_file, *inputs, **kwargs)
+
+    def __init__(self, vocab_file, merges_file):
+        try:
+            import ftfy
+            import spacy
+        except ImportError:
+            raise ImportError("Please install ftfy and spacy to use OpenAI GPT tokenizer.")
 
-    def __init__(self, encoder_path, bpe_path):
         self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat'])
-        self.encoder = json.load(open(encoder_path))
+        self.encoder = json.load(open(vocab_file))
         self.decoder = {v:k for k,v in self.encoder.items()}
-        merges = open(bpe_path, encoding='utf-8').read().split('\n')[1:-1]
+        merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
         merges = [tuple(merge.split()) for merge in merges]
         self.bpe_ranks = dict(zip(merges, range(len(merges))))
         self.cache = {}
@@ -89,7 +168,7 @@ class TextEncoder(object):
         self.cache[token] = word
         return word
 
-    def encode(self, texts, verbose=True):
+    def tokenize(self, texts, verbose=True):
         texts_tokens = []
         if verbose:
             for text in tqdm(texts, ncols=80, leave=False):
diff --git a/setup.py b/setup.py
index e9b5c077c4..cf2f906100 100644
--- a/setup.py
+++ b/setup.py
@@ -37,8 +37,8 @@ from setuptools import find_packages, setup
 
 setup(
     name="pytorch_pretrained_bert",
-    version="0.4.0",
-    author="Thomas Wolf, Victor Sanh, Tim Rault, Google AI Language Team Authors",
+    version="0.5.0",
+    author="Thomas Wolf, Victor Sanh, Tim Rault, Google AI Language Team Authors, Open AI team Authors",
     author_email="thomas@huggingface.co",
     description="PyTorch version of Google AI BERT model with script to load Google pre-trained models",
     long_description=open("README.md", "r", encoding='utf-8').read(),
@@ -55,7 +55,7 @@ setup(
                       'tqdm'],
     entry_points={
       'console_scripts': [
-        "pytorch_pretrained_bert=pytorch_pretrained_bert.__main__:main"
+        "pytorch_pretrained_bert=pytorch_pretrained_bert.__main__:main",
       ]
     },
     python_requires='>=3.5.0',

From 3cf12b235a032b57ea72d261d16f36b5684d754c Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 8 Jan 2019 16:24:23 +0100
Subject: [PATCH 03/82] added tests + fixed losses

---
 pytorch_pretrained_bert/modeling.py           |   2 +-
 pytorch_pretrained_bert/modeling_openai.py    | 425 ++++++++++--------
 .../tokenization_openai.py                    |  90 ++--
 tests/modeling_openai_test.py                 | 192 ++++++++
 4 files changed, 484 insertions(+), 225 deletions(-)
 create mode 100644 tests/modeling_openai_test.py

diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py
index d2a0cf8dd2..021d2334ca 100644
--- a/pytorch_pretrained_bert/modeling.py
+++ b/pytorch_pretrained_bert/modeling.py
@@ -549,7 +549,7 @@ class BertPreTrainedModel(nn.Module):
                 model.__class__.__name__, unexpected_keys))
         if len(error_msgs) > 0:
             raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
-                               self.__class__.__name__, "\n\t".join(error_msgs)))
+                               model.__class__.__name__, "\n\t".join(error_msgs)))
         if tempdir:
             # Clean up temp dir
             shutil.rmtree(tempdir)
diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py
index 8e8ca0db00..9442b1ed69 100644
--- a/pytorch_pretrained_bert/modeling_openai.py
+++ b/pytorch_pretrained_bert/modeling_openai.py
@@ -48,12 +48,10 @@ class OpenAIGPTConfig(object):
                  n_embd=768,
                  n_layer=12,
                  n_head=12,
-                 intermediate_size=3072,
                  afn="gelu",
                  resid_pdrop=0.1,
                  embd_pdrop=0.1,
                  attn_pdrop=0.1,
-                 type_vocab_size=2,
                  initializer_range=0.02):
         """Constructs OpenAIGPTConfig.
 
@@ -65,8 +63,6 @@ class OpenAIGPTConfig(object):
             n_layer: Number of hidden layers in the Transformer encoder.
             n_head: Number of attention heads for each attention layer in
                 the Transformer encoder.
-            intermediate_size: The size of the "intermediate" (i.e., feed-forward)
-                layer in the Transformer encoder.
             afn: The non-linear activation function (function or string) in the
                 encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
             resid_pdrop: The dropout probabilitiy for all fully connected
@@ -74,8 +70,6 @@ class OpenAIGPTConfig(object):
             attn_pdrop: The dropout ratio for the attention
                 probabilities.
             embd_pdrop: The dropout ratio for the embeddings.
-            type_vocab_size: The vocabulary size of the `token_type_ids` passed into
-                `OpenAIGPTModel`.
             initializer_range: The sttdev of the truncated_normal_initializer for
                 initializing all weight matrices.
         """
@@ -92,11 +86,9 @@ class OpenAIGPTConfig(object):
             self.n_layer = n_layer
             self.n_head = n_head
             self.afn = afn
-            self.intermediate_size = intermediate_size
             self.resid_pdrop = resid_pdrop
             self.embd_pdrop = embd_pdrop
             self.attn_pdrop = attn_pdrop
-            self.type_vocab_size = type_vocab_size
             self.initializer_range = initializer_range
         else:
             raise ValueError("First argument must be either a vocabulary size (int)"
@@ -133,6 +125,167 @@ class OpenAIGPTConfig(object):
         """Serializes this instance to a JSON string."""
         return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
 
+class Conv1D(nn.Module):
+    """Affine map `x @ weight + bias` over the last axis (only the rf == 1 case exists)."""
+
+    def __init__(self, nf, rf, nx):
+        super(Conv1D, self).__init__()
+        self.rf = rf
+        self.nf = nf
+        if rf != 1:  # rf != 1 was used to train LM
+            raise NotImplementedError
+        # faster 1x1 conv: an (nx, nf) weight with normal init (std 0.02) and a zero bias
+        w = torch.empty(nx, nf)
+        nn.init.normal_(w, std=0.02)
+        self.weight = Parameter(w)
+        self.bias = Parameter(torch.zeros(nf))
+
+    def forward(self, x):
+        if self.rf != 1:
+            raise NotImplementedError
+        # Flatten every leading axis, apply the affine map, then restore those axes.
+        flat = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight)
+        return flat.view(*(x.size()[:-1] + (self.nf,)))
+
+
+class Attention(nn.Module):
+    """Masked multi-head self-attention; the lower-triangular mask lives in buffer `b`."""
+
+    def __init__(self, nx, n_ctx, cfg, scale=False):
+        super(Attention, self).__init__()
+        # [switch nx => n_state from Block to Attention to keep identical to TF implem]
+        n_state = nx  # in Attention: n_state=768 (nx=n_embd)
+        assert n_state % cfg.n_head == 0
+        lower_triangle = torch.tril(torch.ones(n_ctx, n_ctx))
+        self.register_buffer('b', lower_triangle.view(1, 1, n_ctx, n_ctx))
+        self.n_head = cfg.n_head
+        self.split_size = n_state
+        self.scale = scale
+        self.c_attn = Conv1D(n_state * 3, 1, nx)  # one projection, split into q/k/v in forward
+        self.c_proj = Conv1D(n_state, 1, nx)
+        self.attn_dropout = nn.Dropout(cfg.attn_pdrop)
+        self.resid_dropout = nn.Dropout(cfg.resid_pdrop)
+
+    def _attn(self, q, k, v):
+        """Return dropout(softmax(masked q @ k)) @ v, scaling the scores when `self.scale`."""
+        scores = torch.matmul(q, k)
+        if self.scale:
+            scores = scores / math.sqrt(v.size(-1))
+        # TF implem method: mask_attn_weights
+        # XD: self.b may be larger than the scores, so we need to crop it
+        mask = self.b[:, :, :scores.size(-2), :scores.size(-1)]
+        scores = scores * mask + -1e9 * (1 - mask)
+        weights = self.attn_dropout(nn.Softmax(dim=-1)(scores))
+        return torch.matmul(weights, v)
+
+    def merge_heads(self, x):
+        """Swap axes 1 and 2, then fuse the last two axes (TF implem: merge_states)."""
+        swapped = x.permute(0, 2, 1, 3).contiguous()
+        return swapped.view(*(swapped.size()[:-2] + (swapped.size(-2) * swapped.size(-1),)))
+
+    def split_heads(self, x, k=False):
+        """Split the last axis into `n_head` parts (TF implem: split_states), heads on axis 1."""
+        heads = x.view(*(x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head)))
+        # With k=True the last two axes are swapped so matmul(q, k) needs no transpose.
+        return heads.permute(0, 2, 3, 1) if k else heads.permute(0, 2, 1, 3)
+
+    def forward(self, x):
+        """Project `x` to query/key/value, attend per head, merge heads and project back."""
+        qkv = self.c_attn(x)
+        query, key, value = qkv.split(self.split_size, dim=2)
+        attended = self._attn(
+            self.split_heads(query),
+            self.split_heads(key, k=True),
+            self.split_heads(value),
+        )
+        merged = self.merge_heads(attended)
+        return self.resid_dropout(self.c_proj(merged))
+
+
+class MLP(nn.Module):
+    """Feed-forward sub-layer: project to `n_state`, activate, project back, then dropout."""
+
+    def __init__(self, n_state, cfg):  # in MLP: n_state=3072 (4 * n_embd)
+        super(MLP, self).__init__()
+        self.c_fc = Conv1D(n_state, 1, cfg.n_embd)
+        self.c_proj = Conv1D(cfg.n_embd, 1, n_state)
+        self.act = ACT_FNS[cfg.afn]
+        self.dropout = nn.Dropout(cfg.resid_pdrop)
+
+    def forward(self, x):
+        activated = self.act(self.c_fc(x))
+        return self.dropout(self.c_proj(activated))
+
+
+class Block(nn.Module):
+    """One transformer layer: attention, then MLP, each followed by residual add + LayerNorm."""
+
+    def __init__(self, n_ctx, cfg, scale=False):
+        super(Block, self).__init__()
+        width = cfg.n_embd
+        self.attn = Attention(width, n_ctx, cfg, scale)
+        self.ln_1 = LayerNorm(width)
+        self.mlp = MLP(4 * width, cfg)
+        self.ln_2 = LayerNorm(width)
+
+    def forward(self, x):
+        # Each normalisation is applied to the sum of a sub-layer's input and output.
+        attended = self.ln_1(x + self.attn(x))
+        return self.ln_2(attended + self.mlp(attended))
+
+
+class OpenAIGPTLMHead(nn.Module):
+    """ Language Model Head for the transformer (decoder tied to the embedding matrix) """
+
+    def __init__(self, model_embeddings_weights, cfg):
+        super(OpenAIGPTLMHead, self).__init__()
+        self.n_embd = cfg.n_embd
+        self.set_embeddings_weights(model_embeddings_weights)
+
+    def set_embeddings_weights(self, model_embeddings_weights):
+        """ Build a bias-free decoder whose weight is `model_embeddings_weights` itself """
+        shape = model_embeddings_weights.shape
+        # nn.Linear keeps its weight as (out_features, in_features), i.e. the embedding layout
+        self.decoder = nn.Linear(in_features=shape[1], out_features=shape[0], bias=False)
+        self.decoder.weight = model_embeddings_weights  # Tied weights
+
+    def forward(self, hidden_state):
+        """ Return the decoder logits for every position of `hidden_state` (no truncation) """
+        return self.decoder(hidden_state)
+
+
+class OpenAIGPTMultipleChoiceHead(nn.Module):
+    """ Classifier Head for the transformer """
+
+    def __init__(self, cfg):
+        super(OpenAIGPTMultipleChoiceHead, self).__init__()
+        self.n_embd = cfg.n_embd
+        # To reproduce the noise_shape parameter of TF implementation.
+        # NOTE(review): registered here but never applied in forward() below - confirm intent.
+        self.dropout = nn.Dropout2d(cfg.resid_pdrop)
+        self.linear = nn.Linear(cfg.n_embd, 1)
+
+        nn.init.normal_(self.linear.weight, std=0.02)
+        # NOTE(review): the positional 0 is the mean; the bias therefore uses the default std.
+        nn.init.normal_(self.linear.bias, 0)
+
+    def forward(self, hidden_states, classification_token_mask):
+        """
+        Return one logit per masked group of hidden states.
+
+        The mask gains a trailing axis, multiplies `hidden_states`, and the product is
+        summed over the second-to-last axis before the linear layer is applied.
+        """
+        # assumes hidden_states is (..., seq_len, n_embd) and the mask (..., seq_len) - TODO confirm
+        mask = classification_token_mask.unsqueeze(-1)
+        pooled = (hidden_states * mask).sum(dim=-2)
+        # Background on the noise_shape dropout replication (not applied here):
+        # https://github.com/huggingface/pytorch-openai-transformer-lm/issues/11
+        logits = self.linear(pooled)
+        # The linear layer emits a size-1 last axis; drop it.
+        return logits.squeeze(-1)
+
+
 class OpenAIGPTPreTrainedModel(nn.Module):
     """ An abstract class to handle weights initialization and
         a simple interface for dowloading and loading pretrained models.
@@ -142,7 +295,7 @@ class OpenAIGPTPreTrainedModel(nn.Module):
         if not isinstance(config, OpenAIGPTConfig):
             raise ValueError(
                 "Parameter config in `{}(config)` should be an instance of class `OpenAIGPTConfig`. "
-                "To create a model from a Google pretrained model use "
+                "To create a model from a pretrained model use "
                 "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
                     self.__class__.__name__, self.__class__.__name__
                 ))
@@ -161,11 +314,12 @@ class OpenAIGPTPreTrainedModel(nn.Module):
         if isinstance(module, nn.Linear) and module.bias is not None:
             module.bias.data.zero_()
 
-    def post_loading(self):
+    def set_num_special_tokens(self, num_special_tokens):
         pass
 
     @classmethod
-    def from_pretrained(cls, pretrained_model_name, state_dict=None, cache_dir=None, *inputs, **kwargs):
+    def from_pretrained(cls, pretrained_model_name, num_special_tokens=0, state_dict=None, cache_dir=None,
+                        *inputs, **kwargs):
         """
         Instantiate a OpenAIGPTPreTrainedModel from a pre-trained model file or a pytorch state dict.
         Download and cache the pre-trained model file if needed.
@@ -178,7 +332,7 @@ class OpenAIGPTPreTrainedModel(nn.Module):
                     . `openai_gpt_config.json` a configuration file for the model
                     . `pytorch_model.bin` a PyTorch dump of a OpenAIGPTModel instance
             cache_dir: an optional path to a folder in which the pre-trained models will be cached.
-            state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models
+            state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of pre-trained models
             *inputs, **kwargs: additional input for the specific Bert class
                 (ex: num_labels for BertForSequenceClassification)
         """
@@ -263,167 +417,15 @@ class OpenAIGPTPreTrainedModel(nn.Module):
                 model.__class__.__name__, unexpected_keys))
         if len(error_msgs) > 0:
             raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
-                               self.__class__.__name__, "\n\t".join(error_msgs)))
-        model.post_loading()
+                               model.__class__.__name__, "\n\t".join(error_msgs)))
+        # Add additional embeddings for special tokens if needed
+        if num_special_tokens != config.n_special:
+            model.set_num_special_tokens(num_special_tokens)
         if tempdir:
             # Clean up temp dir
             shutil.rmtree(tempdir)
         return model
 
-class Conv1D(nn.Module):
-    def __init__(self, nf, rf, nx):
-        super(Conv1D, self).__init__()
-        self.rf = rf
-        self.nf = nf
-        if rf == 1:  # faster 1x1 conv
-            w = torch.empty(nx, nf)
-            nn.init.normal_(w, std=0.02)
-            self.weight = Parameter(w)
-            self.bias = Parameter(torch.zeros(nf))
-        else:  # was used to train LM
-            raise NotImplementedError
-
-    def forward(self, x):
-        if self.rf == 1:
-            size_out = x.size()[:-1] + (self.nf,)
-            x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight)
-            x = x.view(*size_out)
-        else:
-            raise NotImplementedError
-        return x
-
-
-class Attention(nn.Module):
-    def __init__(self, nx, n_ctx, cfg, scale=False):
-        super(Attention, self).__init__()
-        n_state = nx  # in Attention: n_state=768 (nx=n_embd)
-        # [switch nx => n_state from Block to Attention to keep identical to TF implem]
-        assert n_state % cfg.n_head == 0
-        self.register_buffer('b', torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx))
-        self.n_head = cfg.n_head
-        self.split_size = n_state
-        self.scale = scale
-        self.c_attn = Conv1D(n_state * 3, 1, nx)
-        self.c_proj = Conv1D(n_state, 1, nx)
-        self.attn_dropout = nn.Dropout(cfg.attn_pdrop)
-        self.resid_dropout = nn.Dropout(cfg.resid_pdrop)
-
-    def _attn(self, q, k, v):
-        w = torch.matmul(q, k)
-        if self.scale:
-            w = w / math.sqrt(v.size(-1))
-        w = w * self.b + -1e9 * (1 - self.b)  # TF implem method: mask_attn_weights
-        w = nn.Softmax(dim=-1)(w)
-        w = self.attn_dropout(w)
-        return torch.matmul(w, v)
-
-    def merge_heads(self, x):
-        x = x.permute(0, 2, 1, 3).contiguous()
-        new_x_shape = x.size()[:-2] + (x.size(-2) * x.size(-1),)
-        return x.view(*new_x_shape)  # in Tensorflow implem: fct merge_states
-
-    def split_heads(self, x, k=False):
-        new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head)
-        x = x.view(*new_x_shape)  # in Tensorflow implem: fct split_states
-        if k:
-            return x.permute(0, 2, 3, 1)
-        else:
-            return x.permute(0, 2, 1, 3)
-
-    def forward(self, x):
-        x = self.c_attn(x)
-        query, key, value = x.split(self.split_size, dim=2)
-        query = self.split_heads(query)
-        key = self.split_heads(key, k=True)
-        value = self.split_heads(value)
-        a = self._attn(query, key, value)
-        a = self.merge_heads(a)
-        a = self.c_proj(a)
-        a = self.resid_dropout(a)
-        return a
-
-
-class MLP(nn.Module):
-    def __init__(self, n_state, cfg):  # in MLP: n_state=3072 (4 * n_embd)
-        super(MLP, self).__init__()
-        nx = cfg.n_embd
-        self.c_fc = Conv1D(n_state, 1, nx)
-        self.c_proj = Conv1D(nx, 1, n_state)
-        self.act = ACT_FNS[cfg.afn]
-        self.dropout = nn.Dropout(cfg.resid_pdrop)
-
-    def forward(self, x):
-        h = self.act(self.c_fc(x))
-        h2 = self.c_proj(h)
-        return self.dropout(h2)
-
-
-class Block(nn.Module):
-    def __init__(self, n_ctx, cfg, scale=False):
-        super(Block, self).__init__()
-        nx = cfg.n_embd
-        self.attn = Attention(nx, n_ctx, cfg, scale)
-        self.ln_1 = LayerNorm(nx)
-        self.mlp = MLP(4 * nx, cfg)
-        self.ln_2 = LayerNorm(nx)
-
-    def forward(self, x):
-        a = self.attn(x)
-        n = self.ln_1(x + a)
-        m = self.mlp(n)
-        h = self.ln_2(n + m)
-        return h
-
-
-class OpenAIGPTLMHead(nn.Module):
-    """ Language Model Head for the transformer """
-
-    def __init__(self, model_embeddings_weights, cfg):
-        super(OpenAIGPTLMHead, self).__init__()
-        self.n_embd = cfg.n_embd
-        self.set_embeddings_weights(model_embeddings_weights)
-
-    def set_embeddings_weights(self, model_embeddings_weights):
-        embed_shape = model_embeddings_weights.shape
-        self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False)
-        self.decoder.weight = model_embeddings_weights # Tied weights
-
-    def forward(self, h):
-        # Truncated Language modeling logits (we remove the last token)
-        h_trunc = h[:, :-1].contiguous().view(-1, self.n_embd)
-        lm_logits = self.decoder(h_trunc)
-        return lm_logits
-
-
-class OpenAIGPTClfHead(nn.Module):
-    """ Classifier Head for the transformer """
-
-    def __init__(self, clf_token, cfg):
-        super(OpenAIGPTClfHead, self).__init__()
-        self.n_embd = cfg.n_embd
-        self.clf_token = clf_token
-        self.dropout = nn.Dropout2d(cfg.resid_pdrop)  # To reproduce the noise_shape parameter of TF implementation
-        self.linear = nn.Linear(cfg.n_embd, 1)
-
-        nn.init.normal_(self.linear.weight, std = 0.02)
-        nn.init.normal_(self.linear.bias, 0)
-
-    def forward(self, h, x):
-        # Classification logits
-        clf_h = h.view(-1, self.n_embd)
-        flat = x[..., 0].contiguous().view(-1)
-        clf_h = clf_h[flat == self.clf_token, :]
-        clf_h = clf_h.view(-1, x.size(1), self.n_embd, 1)
-        # This double transposition is there to replicate the behavior
-        # of the noise_shape argument in the tensorflow
-        # implementation.  For more details, see
-        # https://github.com/huggingface/pytorch-openai-transformer-lm/issues/11
-        clf_h = self.dropout(clf_h.transpose(1, 2)).transpose(1, 2)
-        clf_h = clf_h.contiguous().view(-1, self.n_embd)
-        clf_logits = self.linear(clf_h)
-
-        return clf_logits.view(-1, x.size(1))
-
 
 class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
     """ OpenAI GPT model """
@@ -440,6 +442,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
         # nn.init.normal_(self.embed.weight, std=0.02)
 
     def set_num_special_tokens(self, num_special_tokens):
+        " Update input embeddings with new embedding matrice "
         # Update config
         self.config.n_special = num_special_tokens
         # # Build new embeddings and initialize
@@ -451,45 +454,83 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
         self.embed.weight.data[:self.config.vocab_size, :] = old_embed.weight.data[:self.config.vocab_size, :]
         self.embed.weight.data[-self.config.n_ctx:, :] = old_embed.weight.data[-self.config.n_ctx:, :]
 
-    def forward(self, x):
-        x = x.view(-1, x.size(-2), x.size(-1))
-        e = self.embed(x)
+    def forward(self, input_ids, position_ids=None, token_type_ids=None):
+        if position_ids is None:
+            start = self.config.vocab_size + self.config.n_special
+            end = start + input_ids.size(-1)
+            position_ids = torch.arange(start, end, dtype=torch.long, device=input_ids.device)
+            position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
+
+        input_shape = input_ids.size()
+        input_ids = input_ids.view(-1, input_ids.size(-1))
+        position_ids = position_ids.view(-1, position_ids.size(-1))
+
+        inputs_embeds = self.embed(input_ids)
+        position_embeds = self.embed(position_ids)
+        if token_type_ids is not None:
+            token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1))
+            token_type_embeds = self.embed(token_type_ids)
+        else:
+            token_type_embeds = 0
         # Add the position information to the input embeddings
-        h = e.sum(dim=2)
+        # h = e.sum(dim=2)
+        hidden_states = inputs_embeds + position_embeds + token_type_embeds
         for block in self.h:
-            h = block(h)
-        return h
+            hidden_states = block(hidden_states)
+        return hidden_states.view(*input_shape, hidden_states.size(-1))
 
-
-class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
+class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
     """ OpenAI GPT model with language model and classification heads """
-    def __init__(self, cfg, clf_token='[CLS]'):
-        super(OpenAIGPTDoubleHeadsModel, self).__init__(cfg)
+    def __init__(self, cfg):
+        super(OpenAIGPTLMHeadModel, self).__init__(cfg)
         self.transformer = OpenAIGPTModel(cfg)
         self.lm_head = OpenAIGPTLMHead(self.transformer.embed.weight, cfg)
-        self.clf_head = OpenAIGPTClfHead(clf_token, cfg)
         self.apply(self.init_weights)
 
-    def post_loading(self):
-        " Set the number of special tokens to 1 (for the [CLS] token) "
-        self.set_num_special_tokens(1)
-
     def set_num_special_tokens(self, num_special_tokens):
         " Update input and output embeddings with new embedding matrice "
         self.transformer.set_num_special_tokens(num_special_tokens)
         self.lm_head.set_embeddings_weights(self.transformer.embed.weight)
 
-    def forward(self, x, lm_labels=None, clf_labels=None):
-        h = self.transformer(x)
-        lm_logits = self.lm_head(h)
-        clf_logits = self.clf_head(h, x)
-        losses = []
+    def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None):
+        hidden_states = self.transformer(input_ids, position_ids, token_type_ids)
+        lm_logits = self.lm_head(hidden_states)
         if lm_labels is not None:
             loss_fct = CrossEntropyLoss()
-            losses.append(loss_fct(lm_logits, lm_labels))
-        if clf_labels is not None:
+            loss = loss_fct(lm_logits, lm_labels)
+            return loss
+        return lm_logits
+
+class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
+    """ OpenAI GPT model with language model and classification heads """
+    def __init__(self, cfg):
+        super(OpenAIGPTDoubleHeadsModel, self).__init__(cfg)
+        self.transformer = OpenAIGPTModel(cfg)
+        self.lm_head = OpenAIGPTLMHead(self.transformer.embed.weight, cfg)
+        self.multiple_choice_head = OpenAIGPTMultipleChoiceHead(cfg)
+        self.apply(self.init_weights)
+
+    def set_num_special_tokens(self, num_special_tokens):
+        " Update input and output embeddings with new embedding matrice "
+        self.transformer.set_num_special_tokens(num_special_tokens)
+        self.lm_head.set_embeddings_weights(self.transformer.embed.weight)
+
+    def forward(self, input_ids, classification_token_mask, position_ids=None, token_type_ids=None,
+                lm_labels=None, multiple_choice_labels=None):
+        """ 
+            input_ids as to be of shape B x C x S
+            lm_labels can be masked using the -1 value
+        """
+        hidden_states = self.transformer(input_ids, position_ids, token_type_ids)
+        lm_logits = self.lm_head(hidden_states)
+        multiple_choice_logits = self.multiple_choice_head(hidden_states, classification_token_mask)
+        losses = []
+        if lm_labels is not None:
+            loss_fct = CrossEntropyLoss(ignore_index=-1)
+            losses.append(loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1)))
+        if multiple_choice_labels is not None:
             loss_fct = CrossEntropyLoss()
-            losses.append(loss_fct(clf_logits, clf_labels))
+            losses.append(loss_fct(multiple_choice_logits, multiple_choice_labels.view(-1)))
         if losses:
             return losses
-        return lm_logits, clf_logits
+        return lm_logits, multiple_choice_logits
diff --git a/pytorch_pretrained_bert/tokenization_openai.py b/pytorch_pretrained_bert/tokenization_openai.py
index dd0df83e93..1492075817 100644
--- a/pytorch_pretrained_bert/tokenization_openai.py
+++ b/pytorch_pretrained_bert/tokenization_openai.py
@@ -67,19 +67,17 @@ class OpenAIGPTTokenizer(object):
     mostly a wrapper for a public python bpe tokenizer
     """
     @classmethod
-    def from_pretrained(cls, pretrained_model_name, cache_dir=None, *inputs, **kwargs):
+    def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
         """
         Instantiate a PreTrainedBertModel from a pre-trained model file.
         Download and cache the pre-trained model file if needed.
         """
-        if pretrained_model_name in PRETRAINED_VOCAB_ARCHIVE_MAP:
-            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name]
-            merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name]
+        if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
+            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
+            merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name_or_path]
         else:
-            vocab_file = pretrained_model_name
-        if os.path.isdir(vocab_file):
-            vocab_file = os.path.join(vocab_file, VOCAB_NAME)
-            merges_file = os.path.join(vocab_file, MERGES_NAME)
+            vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)
+            merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME)
         # redirect to the cache, if necessary
         try:
             resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
@@ -87,11 +85,12 @@ class OpenAIGPTTokenizer(object):
         except FileNotFoundError:
             logger.error(
                 "Model name '{}' was not found in model name list ({}). "
-                "We assumed '{}' was a path or url but couldn't find any file "
-                "associated to this path or url.".format(
-                    pretrained_model_name,
+                "We assumed '{}' was a path or url but couldn't find files {} and {} "
+                "at this path or url.".format(
+                    pretrained_model_name_or_path,
                     ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
-                    vocab_file))
+                    pretrained_model_name_or_path,
+                    vocab_file, merges_file))
             return None
         if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file:
             logger.info("loading vocabulary file {}".format(vocab_file))
@@ -101,29 +100,38 @@ class OpenAIGPTTokenizer(object):
                 vocab_file, resolved_vocab_file))
             logger.info("loading merges file {} from cache at {}".format(
                 merges_file, resolved_merges_file))
-        if pretrained_model_name in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
+        if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
             # if we're using a pretrained model, ensure the tokenizer wont index sequences longer
             # than the number of positional embeddings
-            max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name]
+            max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
             kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
         # Instantiate tokenizer.
         tokenizer = cls(resolved_vocab_file, resolved_merges_file, *inputs, **kwargs)
         return tokenizer
 
-    def __init__(self, vocab_file, merges_file):
+    def __init__(self, vocab_file, merges_file, special_tokens=None, max_len=None):
         try:
             import ftfy
             import spacy
         except ImportError:
             raise ImportError("Please install ftfy and spacy to use OpenAI GPT tokenizer.")
 
+        self.max_len = max_len if max_len is not None else int(1e12)
         self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat'])
+        self.fix_text = ftfy.fix_text
         self.encoder = json.load(open(vocab_file))
         self.decoder = {v:k for k,v in self.encoder.items()}
         merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
         merges = [tuple(merge.split()) for merge in merges]
         self.bpe_ranks = dict(zip(merges, range(len(merges))))
         self.cache = {}
+        if not special_tokens:
+            self.special_tokens = {}
+        else:
+            self.special_tokens = dict((tok, len(self.encoder) + i) for i, tok in enumerate(special_tokens))
+
+    def set_special_tokens(self, special_tokens):
+        self.special_tokens = dict((tok, len(self.encoder) + i) for i, tok in enumerate(special_tokens))
 
     def bpe(self, token):
         word = tuple(token[:-1]) + ( token[-1] + '</w>',)
@@ -168,20 +176,38 @@ class OpenAIGPTTokenizer(object):
         self.cache[token] = word
         return word
 
-    def tokenize(self, texts, verbose=True):
-        texts_tokens = []
-        if verbose:
-            for text in tqdm(texts, ncols=80, leave=False):
-                text = self.nlp(text_standardize(ftfy.fix_text(text)))
-                text_tokens = []
-                for token in text:
-                    text_tokens.extend([self.encoder.get(t, 0) for t in self.bpe(token.text.lower()).split(' ')])
-                texts_tokens.append(text_tokens)
-        else:
-            for text in texts:
-                text = self.nlp(text_standardize(ftfy.fix_text(text)))
-                text_tokens = []
-                for token in text:
-                    text_tokens.extend([self.encoder.get(t, 0) for t in self.bpe(token.text.lower()).split(' ')])
-                texts_tokens.append(text_tokens)
-        return texts_tokens
+    def tokenize(self, text):
+        split_tokens = []
+        text = self.nlp(text_standardize(self.fix_text(text)))
+        for token in text:
+            split_tokens.extend([t for t in self.bpe(token.text.lower()).split(' ')])
+        return split_tokens
+
+    def convert_tokens_to_ids(self, tokens):
+        """Converts a sequence of tokens into ids using the vocab."""
+        ids = []
+        for token in tokens:
+            if token in self.special_tokens:
+                ids.append(self.special_tokens[token])
+            else:
+                ids.append(self.encoder.get(token, 0))
+        if len(ids) > self.max_len:
+            raise ValueError(
+                "Token indices sequence length is longer than the specified maximum "
+                " sequence length for this BERT model ({} > {}). Running this"
+                " sequence through BERT will result in indexing errors".format(len(ids), self.max_len)
+            )
+        return ids
+
+    def convert_ids_to_tokens(self, ids):
+        """Converts a sequence of ids in BPE tokens using the vocab."""
+        tokens = []
+        for i in ids:
+            tokens.append(self.decoder[i])
+        return tokens
+
+    def decode(self, ids):
+        """Converts a sequence of ids in a string."""
+        tokens = self.convert_ids_to_tokens(ids)
+        out_string = ''.join(tokens).replace('</w>', ' ')
+        return out_string
diff --git a/tests/modeling_openai_test.py b/tests/modeling_openai_test.py
new file mode 100644
index 0000000000..539fbda9e4
--- /dev/null
+++ b/tests/modeling_openai_test.py
@@ -0,0 +1,192 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import unittest
+import json
+import random
+
+import torch
+
+from pytorch_pretrained_bert import (OpenAIGPTConfig, OpenAIGPTModel, OpenAIGPTDoubleHeadsModel)
+
+
+class OpenAIGPTModelTest(unittest.TestCase):
+    class OpenAIGPTModelTester(object):
+
+        def __init__(self,
+                     parent,
+                     batch_size=13,
+                     seq_length=7,
+                     is_training=True,
+                     use_position_ids=True,
+                     use_token_type_ids=True,
+                     use_labels=True,
+                     vocab_size=99,
+                     n_special=1,
+                     n_ctx=33,
+                     n_embd=32,
+                     n_layer=5,
+                     n_head=4,
+                     n_choices=3,
+                     afn="gelu",
+                     resid_pdrop=0.1,
+                     attn_pdrop=0.1,
+                     embd_pdrop=0.1,
+                     type_sequence_label_size=2,
+                     initializer_range=0.02,
+                     num_labels=3,
+                     scope=None):
+            self.parent = parent
+            self.batch_size = batch_size
+            self.seq_length = seq_length
+            self.is_training = is_training
+            self.use_position_ids = use_position_ids
+            self.use_token_type_ids = use_token_type_ids
+            self.use_labels = use_labels
+            self.vocab_size = vocab_size
+            self.n_special = n_special
+            self.n_ctx = n_ctx
+            self.n_embd = n_embd
+            self.n_layer = n_layer
+            self.n_head = n_head
+            self.afn = afn
+            self.n_choices = n_choices
+            self.resid_pdrop = resid_pdrop
+            self.attn_pdrop = attn_pdrop
+            self.embd_pdrop = embd_pdrop
+            self.type_sequence_label_size = type_sequence_label_size
+            self.initializer_range = initializer_range
+            self.num_labels = num_labels
+            self.scope = scope
+
+        def prepare_config_and_inputs(self):
+            input_ids = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.vocab_size)
+
+            position_ids = None
+            if self.use_position_ids:
+                position_ids = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.n_ctx)
+                position_ids = position_ids + self.n_special + self.vocab_size
+
+            token_type_ids = None
+            if self.use_token_type_ids:
+                total_voc = self.n_ctx + self.n_special + self.vocab_size
+                token_type_ids = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_voc)
+
+            multiple_choice_labels = None
+            lm_labels = None
+            classification_token_mask = None
+            if self.use_labels:
+                multiple_choice_labels = OpenAIGPTModelTest.ids_tensor([self.batch_size], self.type_sequence_label_size)
+                lm_labels = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.num_labels)
+                classification_token_mask = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], 2).float()
+
+            config = OpenAIGPTConfig(
+                vocab_size_or_config_json_file=self.vocab_size,
+                n_ctx=self.n_ctx,
+                n_special=self.n_special,
+                n_embd=self.n_embd,
+                n_layer=self.n_layer,
+                n_head=self.n_head,
+                afn=self.afn,
+                resid_pdrop=self.resid_pdrop,
+                attn_pdrop=self.attn_pdrop,
+                embd_pdrop=self.embd_pdrop,
+                initializer_range=self.initializer_range)
+
+            return (config, input_ids, token_type_ids, position_ids,
+                    multiple_choice_labels, lm_labels, classification_token_mask)
+
+        def create_openai_model(self, config, input_ids, token_type_ids, position_ids,
+                                multiple_choice_labels, lm_labels, classification_token_mask):
+            model = OpenAIGPTModel(config)
+            hidden_states = model(input_ids, position_ids, token_type_ids)
+            outputs = {
+                "hidden_states": hidden_states,
+            }
+            return outputs
+
+        def check_openai_model_output(self, result):
+            self.parent.assertListEqual(
+                list(result["hidden_states"].size()),
+                [self.batch_size, self.n_choices, self.seq_length, self.n_embd])
+
+
+        def create_openai_double_heads(self, config, input_ids, token_type_ids, position_ids,
+                                       multiple_choice_labels, lm_labels, classification_token_mask):
+            model = OpenAIGPTDoubleHeadsModel(config)
+            loss = model(input_ids, classification_token_mask, position_ids,
+                         token_type_ids, lm_labels, multiple_choice_labels)
+            lm_logits, multiple_choice_logits = model(input_ids, classification_token_mask, position_ids, token_type_ids)
+            outputs = {
+                "loss": loss,
+                "lm_logits": lm_logits,
+                "multiple_choice_logits": multiple_choice_logits,
+            }
+            return outputs
+
+        def check_openai_double_heads_output(self, result):
+            total_voc = self.n_ctx + self.n_special + self.vocab_size
+            self.parent.assertListEqual(
+                list(result["lm_logits"].size()),
+                [self.batch_size, self.n_choices, self.seq_length, total_voc])
+            self.parent.assertListEqual(
+                list(result["multiple_choice_logits"].size()),
+                [self.batch_size, self.n_choices])
+
+        def check_openai_double_heads_loss_output(self, result):
+            self.parent.assertListEqual(
+                [list(l.size()) for l in result["loss"]],
+                [[], []])
+
+    def test_default(self):
+        self.run_tester(OpenAIGPTModelTest.OpenAIGPTModelTester(self))
+
+    def test_config_to_json_string(self):
+        config = OpenAIGPTConfig(vocab_size_or_config_json_file=99, n_embd=37)
+        obj = json.loads(config.to_json_string())
+        self.assertEqual(obj["vocab_size"], 99)
+        self.assertEqual(obj["n_embd"], 37)
+
+    def run_tester(self, tester):
+        config_and_inputs = tester.prepare_config_and_inputs()
+        output_result = tester.create_openai_model(*config_and_inputs)
+        tester.check_openai_model_output(output_result)
+
+        output_result = tester.create_openai_double_heads(*config_and_inputs)
+        tester.check_openai_double_heads_output(output_result)
+        tester.check_openai_double_heads_loss_output(output_result)
+
+    @classmethod
+    def ids_tensor(cls, shape, vocab_size, rng=None, name=None):
+        """Creates a random int64 (torch.long) tensor of the given shape with values in [0, vocab_size)."""
+        if rng is None:
+            rng = random.Random()
+
+        total_dims = 1
+        for dim in shape:
+            total_dims *= dim
+
+        values = []
+        for _ in range(total_dims):
+            values.append(rng.randint(0, vocab_size - 1))
+
+        return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous()
+
+
+if __name__ == "__main__":
+    unittest.main()

From dc5df92fa89cae216cb8a186a544d6d852b35a15 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 8 Jan 2019 17:18:47 +0100
Subject: [PATCH 04/82] added LM head for OpenAI

---
 pytorch_pretrained_bert/__init__.py        |  3 +-
 pytorch_pretrained_bert/modeling_openai.py | 17 ++++-----
 tests/modeling_openai_test.py              | 43 ++++++++++++++++++----
 3 files changed, 45 insertions(+), 18 deletions(-)

diff --git a/pytorch_pretrained_bert/__init__.py b/pytorch_pretrained_bert/__init__.py
index c940549364..0a9e41266d 100644
--- a/pytorch_pretrained_bert/__init__.py
+++ b/pytorch_pretrained_bert/__init__.py
@@ -5,7 +5,8 @@ from .modeling import (BertConfig, BertModel, BertForPreTraining,
                        BertForMaskedLM, BertForNextSentencePrediction,
                        BertForSequenceClassification, BertForMultipleChoice,
                        BertForTokenClassification, BertForQuestionAnswering)
-from .modeling_openai import OpenAIGPTConfig, OpenAIGPTModel, OpenAIGPTDoubleHeadsModel
+from .modeling_openai import (OpenAIGPTConfig, OpenAIGPTModel,
+                              OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel)
 from .optimization import BertAdam
 from .optimization_openai import OpenAIAdam
 from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE
diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py
index 9442b1ed69..bde481c7b1 100644
--- a/pytorch_pretrained_bert/modeling_openai.py
+++ b/pytorch_pretrained_bert/modeling_openai.py
@@ -267,11 +267,11 @@ class OpenAIGPTMultipleChoiceHead(nn.Module):
         nn.init.normal_(self.linear.weight, std = 0.02)
         nn.init.normal_(self.linear.bias, 0)
 
-    def forward(self, hidden_states, classification_token_mask):
+    def forward(self, hidden_states, multiple_choice_token_mask):
         # Classification logits
         # hidden_states = hidden_states.view(-1, self.n_embd)
-        # classification_token_mask = classification_token_mask.view(-1, 1).expand_as(hidden_states)
-        multiple_choice_h = hidden_states * classification_token_mask.unsqueeze(-1)
+        # multiple_choice_token_mask = multiple_choice_token_mask.view(-1, 1).expand_as(hidden_states)
+        multiple_choice_h = hidden_states * multiple_choice_token_mask.unsqueeze(-1)
         multiple_choice_h = multiple_choice_h.sum(dim=-2)
         # flat = x[..., 0].contiguous().view(-1)
         # multiple_choice_h = multiple_choice_h[flat == self.multiple_choice_token, :]
@@ -496,8 +496,8 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
         hidden_states = self.transformer(input_ids, position_ids, token_type_ids)
         lm_logits = self.lm_head(hidden_states)
         if lm_labels is not None:
-            loss_fct = CrossEntropyLoss()
-            loss = loss_fct(lm_logits, lm_labels)
+            loss_fct = CrossEntropyLoss(ignore_index=-1)
+            loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1))
             return loss
         return lm_logits
 
@@ -515,15 +515,14 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
         self.transformer.set_num_special_tokens(num_special_tokens)
         self.lm_head.set_embeddings_weights(self.transformer.embed.weight)
 
-    def forward(self, input_ids, classification_token_mask, position_ids=None, token_type_ids=None,
+    def forward(self, input_ids, multiple_choice_token_mask, position_ids=None, token_type_ids=None,
                 lm_labels=None, multiple_choice_labels=None):
-        """ 
-            input_ids as to be of shape B x C x S
+        """ input_ids should be of shape B x C x S
             lm_labels can be masked using the -1 value
         """
         hidden_states = self.transformer(input_ids, position_ids, token_type_ids)
         lm_logits = self.lm_head(hidden_states)
-        multiple_choice_logits = self.multiple_choice_head(hidden_states, classification_token_mask)
+        multiple_choice_logits = self.multiple_choice_head(hidden_states, multiple_choice_token_mask)
         losses = []
         if lm_labels is not None:
             loss_fct = CrossEntropyLoss(ignore_index=-1)
diff --git a/tests/modeling_openai_test.py b/tests/modeling_openai_test.py
index 539fbda9e4..0a71166443 100644
--- a/tests/modeling_openai_test.py
+++ b/tests/modeling_openai_test.py
@@ -22,7 +22,8 @@ import random
 
 import torch
 
-from pytorch_pretrained_bert import (OpenAIGPTConfig, OpenAIGPTModel, OpenAIGPTDoubleHeadsModel)
+from pytorch_pretrained_bert import (OpenAIGPTConfig, OpenAIGPTModel,
+                                     OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel)
 
 
 class OpenAIGPTModelTest(unittest.TestCase):
@@ -89,11 +90,11 @@ class OpenAIGPTModelTest(unittest.TestCase):
 
             multiple_choice_labels = None
             lm_labels = None
-            classification_token_mask = None
+            multiple_choice_token_mask = None
             if self.use_labels:
                 multiple_choice_labels = OpenAIGPTModelTest.ids_tensor([self.batch_size], self.type_sequence_label_size)
                 lm_labels = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.num_labels)
-                classification_token_mask = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], 2).float()
+                multiple_choice_token_mask = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], 2).float()
 
             config = OpenAIGPTConfig(
                 vocab_size_or_config_json_file=self.vocab_size,
@@ -109,10 +110,10 @@ class OpenAIGPTModelTest(unittest.TestCase):
                 initializer_range=self.initializer_range)
 
             return (config, input_ids, token_type_ids, position_ids,
-                    multiple_choice_labels, lm_labels, classification_token_mask)
+                    multiple_choice_labels, lm_labels, multiple_choice_token_mask)
 
         def create_openai_model(self, config, input_ids, token_type_ids, position_ids,
-                                multiple_choice_labels, lm_labels, classification_token_mask):
+                                multiple_choice_labels, lm_labels, multiple_choice_token_mask):
             model = OpenAIGPTModel(config)
             hidden_states = model(input_ids, position_ids, token_type_ids)
             outputs = {
@@ -126,12 +127,34 @@ class OpenAIGPTModelTest(unittest.TestCase):
                 [self.batch_size, self.n_choices, self.seq_length, self.n_embd])
 
 
+        def create_openai_lm_head(self, config, input_ids, token_type_ids, position_ids,
+                                       multiple_choice_labels, lm_labels, multiple_choice_token_mask):
+            model = OpenAIGPTLMHeadModel(config)
+            loss = model(input_ids, position_ids, token_type_ids, lm_labels)
+            lm_logits = model(input_ids, position_ids, token_type_ids)
+            outputs = {
+                "loss": loss,
+                "lm_logits": lm_logits,
+            }
+            return outputs
+
+        def check_openai_lm_head_output(self, result):
+            total_voc = self.n_ctx + self.n_special + self.vocab_size
+            self.parent.assertListEqual(
+                list(result["lm_logits"].size()),
+                [self.batch_size, self.n_choices, self.seq_length, total_voc])
+
+        def check_openai_lm_head_loss_output(self, result):
+            self.parent.assertListEqual(
+                list(result["loss"].size()),
+                [])
+
         def create_openai_double_heads(self, config, input_ids, token_type_ids, position_ids,
-                                       multiple_choice_labels, lm_labels, classification_token_mask):
+                                       multiple_choice_labels, lm_labels, multiple_choice_token_mask):
             model = OpenAIGPTDoubleHeadsModel(config)
-            loss = model(input_ids, classification_token_mask, position_ids,
+            loss = model(input_ids, multiple_choice_token_mask, position_ids,
                          token_type_ids, lm_labels, multiple_choice_labels)
-            lm_logits, multiple_choice_logits = model(input_ids, classification_token_mask, position_ids, token_type_ids)
+            lm_logits, multiple_choice_logits = model(input_ids, multiple_choice_token_mask, position_ids, token_type_ids)
             outputs = {
                 "loss": loss,
                 "lm_logits": lm_logits,
@@ -167,6 +190,10 @@ class OpenAIGPTModelTest(unittest.TestCase):
         output_result = tester.create_openai_model(*config_and_inputs)
         tester.check_openai_model_output(output_result)
 
+        output_result = tester.create_openai_lm_head(*config_and_inputs)
+        tester.check_openai_lm_head_output(output_result)
+        tester.check_openai_lm_head_loss_output(output_result)
+
         output_result = tester.create_openai_double_heads(*config_and_inputs)
         tester.check_openai_double_heads_output(output_result)
         tester.check_openai_double_heads_loss_output(output_result)

From ab90d4cdddd05c2b66db5851bc6d01e484c67df7 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Wed, 9 Jan 2019 00:12:43 +0100
Subject: [PATCH 05/82] adding docs and example for OpenAI GPT

---
 examples/run_openai_gpt.py                 | 304 +++++++++++++++++++++
 pytorch_pretrained_bert/modeling.py        |   4 +-
 pytorch_pretrained_bert/modeling_openai.py | 245 ++++++++++++++---
 3 files changed, 510 insertions(+), 43 deletions(-)
 create mode 100644 examples/run_openai_gpt.py

diff --git a/examples/run_openai_gpt.py b/examples/run_openai_gpt.py
new file mode 100644
index 0000000000..4d00276d88
--- /dev/null
+++ b/examples/run_openai_gpt.py
@@ -0,0 +1,304 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+" Run OpenAI GPT on RocStories"
+import argparse
+import os
+import random
+import logging
+
+from sklearn.metrics import accuracy_score
+from sklearn.utils import shuffle
+
+# from analysis import rocstories as rocstories_analysis
+# from datasets import rocstories
+# from model_pytorch import DoubleHeadModel, load_openai_pretrained_model
+# from opt import OpenAIAdam
+# from text_utils import TextEncoder
+# from utils import (encode_dataset, iter_data,
+#                    ResultLogger, make_path)
+# from loss import MultipleChoiceLossCompute
+
+import numpy as np
+import torch
+from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
+from torch.utils.data.distributed import DistributedSampler
+
+from pytorch_pretrained_bert.tokenization_openai import OpenAIGPTTokenizer
+from pytorch_pretrained_bert.modeling_openai import OpenAIGPTDoubleHeadsModel
+from pytorch_pretrained_bert.optimization_openai import OpenAIAdam
+from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
+
+logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
+                    datefmt = '%m/%d/%Y %H:%M:%S',
+                    level = logging.INFO)
+logger = logging.getLogger(__name__)
+
+def transform_roc(X1, X2, X3):
+    n_batch = len(X1)
+    xmb = np.zeros((n_batch, 2, n_ctx, 2), dtype=np.int32)
+    mmb = np.zeros((n_batch, 2, n_ctx), dtype=np.float32)
+    start = encoder['_start_']
+    delimiter = encoder['_delimiter_']
+    for i, (x1, x2, x3), in enumerate(zip(X1, X2, X3)):
+        x12 = [start] + x1[:max_len] + [delimiter] + x2[:max_len] + [clf_token]
+        x13 = [start] + x1[:max_len] + [delimiter] + x3[:max_len] + [clf_token]
+        l12 = len(x12)
+        l13 = len(x13)
+        xmb[i, 0, :l12, 0] = x12
+        xmb[i, 1, :l13, 0] = x13
+        mmb[i, 0, :l12] = 1
+        mmb[i, 1, :l13] = 1
+    # Position information that is added to the input embeddings in the TransformerModel
+    xmb[:, :, :, 1] = np.arange(n_vocab + n_special, n_vocab + n_special + n_ctx)
+    return xmb, mmb
+
+
+def iter_apply(Xs, Ms, Ys):
+    # fns = [lambda x: np.concatenate(x, 0), lambda x: float(np.sum(x))]
+    logits = []
+    cost = 0
+    with torch.no_grad():
+        dh_model.eval()
+        for xmb, mmb, ymb in iter_data(Xs, Ms, Ys, n_batch=n_batch_train, truncate=False, verbose=True):
+            n = len(xmb)
+            XMB = torch.tensor(xmb, dtype=torch.long).to(device)
+            YMB = torch.tensor(ymb, dtype=torch.long).to(device)
+            MMB = torch.tensor(mmb).to(device)
+            _, clf_logits = dh_model(XMB)
+            clf_logits *= n
+            clf_losses = compute_loss_fct(XMB, YMB, MMB, clf_logits, only_return_losses=True)
+            clf_losses *= n
+            logits.append(clf_logits.to("cpu").numpy())
+            cost += clf_losses.sum().item()
+        logits = np.concatenate(logits, 0)
+    return logits, cost
+
+
+def iter_predict(Xs, Ms):
+    logits = []
+    with torch.no_grad():
+        dh_model.eval()
+        for xmb, mmb in iter_data(Xs, Ms, n_batch=n_batch_train, truncate=False, verbose=True):
+            n = len(xmb)
+            XMB = torch.tensor(xmb, dtype=torch.long).to(device)
+            MMB = torch.tensor(mmb).to(device)
+            _, clf_logits = dh_model(XMB)
+            logits.append(clf_logits.to("cpu").numpy())
+    logits = np.concatenate(logits, 0)
+    return logits
+
+
+def log(save_dir, desc):
+    global best_score
+    print("Logging")
+    tr_logits, tr_cost = iter_apply(trX[:n_valid], trM[:n_valid], trY[:n_valid])
+    va_logits, va_cost = iter_apply(vaX, vaM, vaY)
+    tr_cost = tr_cost / len(trY[:n_valid])
+    va_cost = va_cost / n_valid
+    tr_acc = accuracy_score(trY[:n_valid], np.argmax(tr_logits, 1)) * 100.
+    va_acc = accuracy_score(vaY, np.argmax(va_logits, 1)) * 100.
+    logger.log(n_epochs=n_epochs, n_updates=n_updates, tr_cost=tr_cost, va_cost=va_cost, tr_acc=tr_acc, va_acc=va_acc)
+    print('%d %d %.3f %.3f %.2f %.2f' % (n_epochs, n_updates, tr_cost, va_cost, tr_acc, va_acc))
+    if submit:
+        score = va_acc
+        if score > best_score:
+            best_score = score
+            path = os.path.join(save_dir, desc, 'best_params')
+            torch.save(dh_model.state_dict(), make_path(path))
+
+
+def predict(dataset, submission_dir):
+    filename = filenames[dataset]
+    pred_fn = pred_fns[dataset]
+    label_decoder = label_decoders[dataset]
+    predictions = pred_fn(iter_predict(teX, teM))
+    if label_decoder is not None:
+        predictions = [label_decoder[prediction] for prediction in predictions]
+    path = os.path.join(submission_dir, filename)
+    os.makedirs(os.path.dirname(path), exist_ok=True)
+    with open(path, 'w') as f:
+        f.write('{}\t{}\n'.format('index', 'prediction'))
+        for i, prediction in enumerate(predictions):
+            f.write('{}\t{}\n'.format(i, prediction))
+
+
+def run_epoch():
+    for xmb, mmb, ymb in iter_data(*shuffle(trX, trM, trYt, random_state=np.random),
+                                   n_batch=n_batch_train, truncate=True, verbose=True):
+        global n_updates
+        dh_model.train()
+        XMB = torch.tensor(xmb, dtype=torch.long).to(device)
+        YMB = torch.tensor(ymb, dtype=torch.long).to(device)
+        MMB = torch.tensor(mmb).to(device)
+        lm_logits, clf_logits = dh_model(XMB)
+        compute_loss_fct(XMB, YMB, MMB, clf_logits, lm_logits)
+        n_updates += 1
+        if n_updates in [1000, 2000, 4000, 8000, 16000, 32000] and n_epochs == 0:
+            log(save_dir, desc)
+
+
+argmax = lambda x: np.argmax(x, 1)
+
+pred_fns = {
+    'rocstories': argmax,
+}
+
+filenames = {
+    'rocstories': 'ROCStories.tsv',
+}
+
+label_decoders = {
+    'rocstories': None,
+}
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--desc', type=str, help="Description")
+    parser.add_argument('--dataset', type=str)
+    parser.add_argument('--log_dir', type=str, default='log/')
+    parser.add_argument('--save_dir', type=str, default='save/')
+    parser.add_argument('--data_dir', type=str, default='data/')
+    parser.add_argument('--submission_dir', type=str, default='submission/')
+    parser.add_argument('--submit', action='store_true')
+    parser.add_argument('--analysis', action='store_true')
+    parser.add_argument('--seed', type=int, default=42)
+    parser.add_argument('--n_iter', type=int, default=3)
+    parser.add_argument('--n_batch', type=int, default=8)
+    parser.add_argument('--max_grad_norm', type=int, default=1)
+    parser.add_argument('--lr', type=float, default=6.25e-5)
+    parser.add_argument('--lr_warmup', type=float, default=0.002)
+    parser.add_argument('--n_ctx', type=int, default=512)
+    parser.add_argument('--n_embd', type=int, default=768)
+    parser.add_argument('--n_head', type=int, default=12)
+    parser.add_argument('--n_layer', type=int, default=12)
+    parser.add_argument('--embd_pdrop', type=float, default=0.1)
+    parser.add_argument('--attn_pdrop', type=float, default=0.1)
+    parser.add_argument('--resid_pdrop', type=float, default=0.1)
+    parser.add_argument('--clf_pdrop', type=float, default=0.1)
+    parser.add_argument('--l2', type=float, default=0.01)
+    parser.add_argument('--vector_l2', action='store_true')
+    parser.add_argument('--opt', type=str, default='adam')
+    parser.add_argument('--afn', type=str, default='gelu')
+    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
+    parser.add_argument('--encoder_path', type=str, default='model/encoder_bpe_40000.json')
+    parser.add_argument('--bpe_path', type=str, default='model/vocab_40000.bpe')
+    parser.add_argument('--n_transfer', type=int, default=12)
+    parser.add_argument('--lm_coef', type=float, default=0.5)
+    parser.add_argument('--b1', type=float, default=0.9)
+    parser.add_argument('--b2', type=float, default=0.999)
+    parser.add_argument('--e', type=float, default=1e-8)
+    parser.add_argument('--n_valid', type=int, default=374)
+
+    args = parser.parse_args()
+    print(args)
+
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+    torch.cuda.manual_seed_all(args.seed)
+
+    # Constants
+    submit = args.submit
+    dataset = args.dataset
+    n_ctx = args.n_ctx
+    save_dir = args.save_dir
+    desc = args.desc
+    data_dir = args.data_dir
+    log_dir = args.log_dir
+    submission_dir = args.submission_dir
+
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    n_gpu = torch.cuda.device_count()
+    print("device", device, "n_gpu", n_gpu)
+
+    logger = ResultLogger(path=os.path.join(log_dir, '{}.jsonl'.format(desc)), **args.__dict__)
+    text_encoder = TextEncoder(args.encoder_path, args.bpe_path)
+    encoder = text_encoder.encoder
+    n_vocab = len(text_encoder.encoder)
+
+    print("Encoding dataset...")
+    ((trX1, trX2, trX3, trY),
+     (vaX1, vaX2, vaX3, vaY),
+     (teX1, teX2, teX3)) = encode_dataset(*rocstories(data_dir, n_valid=args.n_valid),
+                                          encoder=text_encoder)
+    encoder['_start_'] = len(encoder)
+    encoder['_delimiter_'] = len(encoder)
+    encoder['_classify_'] = len(encoder)
+    clf_token = encoder['_classify_']
+    n_special = 3
+    max_len = n_ctx // 2 - 2
+    n_ctx = min(max(
+        [len(x1[:max_len]) + max(len(x2[:max_len]),
+                                 len(x3[:max_len])) for x1, x2, x3 in zip(trX1, trX2, trX3)]
+        + [len(x1[:max_len]) + max(len(x2[:max_len]),
+                                   len(x3[:max_len])) for x1, x2, x3 in zip(vaX1, vaX2, vaX3)]
+        + [len(x1[:max_len]) + max(len(x2[:max_len]),
+                                   len(x3[:max_len])) for x1, x2, x3 in zip(teX1, teX2, teX3)]
+        ) + 3, n_ctx)
+    vocab = n_vocab + n_special + n_ctx
+    trX, trM = transform_roc(trX1, trX2, trX3)
+    vaX, vaM = transform_roc(vaX1, vaX2, vaX3)
+    if submit:
+        teX, teM = transform_roc(teX1, teX2, teX3)
+
+    n_train = len(trY)
+    n_valid = len(vaY)
+    n_batch_train = args.n_batch * max(n_gpu, 1)
+    n_updates_total = (n_train // n_batch_train) * args.n_iter
+
+    dh_model = DoubleHeadModel(args, clf_token, 'multiple_choice', vocab, n_ctx)
+
+    criterion = nn.CrossEntropyLoss(reduce=False)
+    model_opt = OpenAIAdam(dh_model.parameters(),
+                           lr=args.lr,
+                           schedule=args.lr_schedule,
+                           warmup=args.lr_warmup,
+                           t_total=n_updates_total,
+                           b1=args.b1,
+                           b2=args.b2,
+                           e=args.e,
+                           l2=args.l2,
+                           vector_l2=args.vector_l2,
+                           max_grad_norm=args.max_grad_norm)
+    compute_loss_fct = MultipleChoiceLossCompute(criterion,
+                                                 criterion,
+                                                 args.lm_coef,
+                                                 model_opt)
+    load_openai_pretrained_model(dh_model.transformer, n_ctx=n_ctx, n_special=n_special)
+
+    dh_model.to(device)
+    dh_model = nn.DataParallel(dh_model)
+
+    n_updates = 0
+    n_epochs = 0
+    if dataset != 'stsb':
+        trYt = trY
+    if submit:
+        path = os.path.join(save_dir, desc, 'best_params')
+        torch.save(dh_model.state_dict(), make_path(path))
+    best_score = 0
+    for i in range(args.n_iter):
+        print("running epoch", i)
+        run_epoch()
+        n_epochs += 1
+        log(save_dir, desc)
+    if submit:
+        path = os.path.join(save_dir, desc, 'best_params')
+        dh_model.load_state_dict(torch.load(path))
+        predict(dataset, args.submission_dir)
+        if args.analysis:
+            rocstories_analysis(data_dir, os.path.join(args.submission_dir, 'ROCStories.tsv'),
+                                os.path.join(log_dir, 'rocstories.jsonl'))
diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py
index 021d2334ca..6a05873b20 100644
--- a/pytorch_pretrained_bert/modeling.py
+++ b/pytorch_pretrained_bert/modeling.py
@@ -659,10 +659,10 @@ class BertForPreTraining(BertPreTrainedModel):
             selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
             input sequence length in the current batch. It's the mask that we typically use for attention when
             a batch has varying length sentences.
-        `masked_lm_labels`: masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
+        `masked_lm_labels`: optional masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
             with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
             is only computed for the labels set in [0, ..., vocab_size]
-        `next_sentence_label`: next sentence classification loss: torch.LongTensor of shape [batch_size]
+        `next_sentence_label`: optional next sentence classification label: torch.LongTensor of shape [batch_size]
             with indices selected in [0, 1].
             0 => next sentence is the continuation, 1 => next sentence is a random sentence.
 
diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py
index bde481c7b1..40557c9626 100644
--- a/pytorch_pretrained_bert/modeling_openai.py
+++ b/pytorch_pretrained_bert/modeling_openai.py
@@ -149,19 +149,19 @@ class Conv1D(nn.Module):
 
 
 class Attention(nn.Module):
-    def __init__(self, nx, n_ctx, cfg, scale=False):
+    def __init__(self, nx, n_ctx, config, scale=False):
         super(Attention, self).__init__()
         n_state = nx  # in Attention: n_state=768 (nx=n_embd)
         # [switch nx => n_state from Block to Attention to keep identical to TF implem]
-        assert n_state % cfg.n_head == 0
+        assert n_state % config.n_head == 0
         self.register_buffer('b', torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx))
-        self.n_head = cfg.n_head
+        self.n_head = config.n_head
         self.split_size = n_state
         self.scale = scale
         self.c_attn = Conv1D(n_state * 3, 1, nx)
         self.c_proj = Conv1D(n_state, 1, nx)
-        self.attn_dropout = nn.Dropout(cfg.attn_pdrop)
-        self.resid_dropout = nn.Dropout(cfg.resid_pdrop)
+        self.attn_dropout = nn.Dropout(config.attn_pdrop)
+        self.resid_dropout = nn.Dropout(config.resid_pdrop)
 
     def _attn(self, q, k, v):
         w = torch.matmul(q, k)
@@ -203,13 +203,13 @@ class Attention(nn.Module):
 
 
 class MLP(nn.Module):
-    def __init__(self, n_state, cfg):  # in MLP: n_state=3072 (4 * n_embd)
+    def __init__(self, n_state, config):  # in MLP: n_state=3072 (4 * n_embd)
         super(MLP, self).__init__()
-        nx = cfg.n_embd
+        nx = config.n_embd
         self.c_fc = Conv1D(n_state, 1, nx)
         self.c_proj = Conv1D(nx, 1, n_state)
-        self.act = ACT_FNS[cfg.afn]
-        self.dropout = nn.Dropout(cfg.resid_pdrop)
+        self.act = ACT_FNS[config.afn]
+        self.dropout = nn.Dropout(config.resid_pdrop)
 
     def forward(self, x):
         h = self.act(self.c_fc(x))
@@ -218,12 +218,12 @@ class MLP(nn.Module):
 
 
 class Block(nn.Module):
-    def __init__(self, n_ctx, cfg, scale=False):
+    def __init__(self, n_ctx, config, scale=False):
         super(Block, self).__init__()
-        nx = cfg.n_embd
-        self.attn = Attention(nx, n_ctx, cfg, scale)
+        nx = config.n_embd
+        self.attn = Attention(nx, n_ctx, config, scale)
         self.ln_1 = LayerNorm(nx)
-        self.mlp = MLP(4 * nx, cfg)
+        self.mlp = MLP(4 * nx, config)
         self.ln_2 = LayerNorm(nx)
 
     def forward(self, x):
@@ -237,9 +237,9 @@ class Block(nn.Module):
 class OpenAIGPTLMHead(nn.Module):
     """ Language Model Head for the transformer """
 
-    def __init__(self, model_embeddings_weights, cfg):
+    def __init__(self, model_embeddings_weights, config):
         super(OpenAIGPTLMHead, self).__init__()
-        self.n_embd = cfg.n_embd
+        self.n_embd = config.n_embd
         self.set_embeddings_weights(model_embeddings_weights)
 
     def set_embeddings_weights(self, model_embeddings_weights):
@@ -257,12 +257,12 @@ class OpenAIGPTLMHead(nn.Module):
 class OpenAIGPTMultipleChoiceHead(nn.Module):
     """ Classifier Head for the transformer """
 
-    def __init__(self, cfg):
+    def __init__(self, config):
         super(OpenAIGPTMultipleChoiceHead, self).__init__()
-        self.n_embd = cfg.n_embd
+        self.n_embd = config.n_embd
         # self.multiple_choice_token = multiple_choice_token
-        self.dropout = nn.Dropout2d(cfg.resid_pdrop)  # To reproduce the noise_shape parameter of TF implementation
-        self.linear = nn.Linear(cfg.n_embd, 1)
+        self.dropout = nn.Dropout2d(config.resid_pdrop)  # To reproduce the noise_shape parameter of TF implementation
+        self.linear = nn.Linear(config.n_embd, 1)
 
         nn.init.normal_(self.linear.weight, std = 0.02)
         nn.init.normal_(self.linear.bias, 0)
@@ -428,15 +428,63 @@ class OpenAIGPTPreTrainedModel(nn.Module):
 
 
 class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
-    """ OpenAI GPT model """
+    """OpenAI GPT model ("Improving Language Understanding by Generative Pre-Training").
 
-    def __init__(self, cfg):
-        super(OpenAIGPTModel, self).__init__(cfg)
-        total_embeddings_size = cfg.vocab_size + cfg.n_special + cfg.n_ctx
-        self.embed = nn.Embedding(total_embeddings_size, cfg.n_embd)
-        self.drop = nn.Dropout(cfg.embd_pdrop)
-        block = Block(cfg.n_ctx, cfg, scale=True)
-        self.h = nn.ModuleList([copy.deepcopy(block) for _ in range(cfg.n_layer)])
+    The main implementation difference between BERT and OpenAI GPT is the use, in OpenAI GPT, of a single embedding matrix
+    to store the word, special ([SEP], [CLS]...) and position embeddings.
+    The embeddings are ordered as follows in the word embedding matrix:
+        [0,                                                         ----------------------
+         ...                                                        -> word embeddings
+         config.vocab_size - 1,                                     ______________________
+         config.vocab_size,
+         ...                                                        -> special embeddings
+         config.vocab_size + config.n_special - 1,                  ______________________
+         config.vocab_size + config.n_special,
+         ...                                                        -> position embeddings
+         total_num_embeddings - 1]                                  ______________________
+
+    where total_num_embeddings can be obtained as config.total_num_embeddings and is:
+        total_num_embeddings = config.vocab_size + config.n_special + config.n_ctx
+    You should use the associated indices to index the embeddings.
+
+    The special embeddings ([SEP], [CLS]...) are not pre-trained and need to be trained during the fine-tuning if you use them.
+    The number of special embeddings can be controlled using the `set_num_special_tokens(num_special_tokens)` function.
+
+    Params:
+        config: an OpenAIGPTConfig class instance with the configuration to build a new model
+
+    Inputs:
+        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
+            where d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, config.vocab_size[
+        `position_ids`: an optional torch.LongTensor with the same shape as input_ids
+            with the position indices (selected in the range [config.vocab_size + config.n_special, config.vocab_size + config.n_special + config.n_ctx[).
+        `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
+            You can use it to add a third embedding (the previous two being the word and position embeddings)
+            to each token in the sentence.
+
+    Outputs:
+        `hidden_states`: the encoded-hidden-states at the top of the model
+            as a torch.FloatTensor of size [batch_size, sequence_length, hidden_size]
+            (or more generally [d_1, ..., d_n, hidden_size] where d_1 ... d_n are the dimensions of input_ids)
+
+    Example usage:
+    ```python
+    # Already been converted into BPE token ids
+    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+
+    config = modeling_openai.OpenAIGPTConfig()
+
+    model = modeling_openai.OpenAIGPTModel(config)
+    hidden_states = model(input_ids)
+    ```
+    """
+    def __init__(self, config):
+        super(OpenAIGPTModel, self).__init__(config)
+        total_embeddings_size = config.vocab_size + config.n_special + config.n_ctx
+        self.embed = nn.Embedding(total_embeddings_size, config.n_embd)
+        self.drop = nn.Dropout(config.embd_pdrop)
+        block = Block(config.n_ctx, config, scale=True)
+        self.h = nn.ModuleList([copy.deepcopy(block) for _ in range(config.n_layer)])
 
         self.apply(self.init_weights)
         # nn.init.normal_(self.embed.weight, std=0.02)
@@ -480,11 +528,67 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
         return hidden_states.view(*input_shape, hidden_states.size(-1))
 
 class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
-    """ OpenAI GPT model with language model and classification heads """
-    def __init__(self, cfg):
-        super(OpenAIGPTLMHeadModel, self).__init__(cfg)
-        self.transformer = OpenAIGPTModel(cfg)
-        self.lm_head = OpenAIGPTLMHead(self.transformer.embed.weight, cfg)
+    """OpenAI GPT model with a Language Modeling head ("Improving Language Understanding by Generative Pre-Training").
+
+    There are two main implementation differences between BERT and the OpenAI GPT:
+        - the use of an LM loss in OpenAI GPT which means the Transformer is trained to predict the NEXT token for each input token
+            vs. predict the SAME token for BERT (i.e. you need to shift your labels to the right)
+        - the use, in OpenAI GPT, of a single embedding matrix to store the word, special ([SEP], [CLS]...) and position embeddings.
+    The embeddings are ordered as follows in the word embeddings matrix:
+        [0,                                                         ----------------------
+         ...                                                        -> word embeddings
+         config.vocab_size - 1,                                     ______________________
+         config.vocab_size,
+         ...                                                        -> special embeddings
+         config.vocab_size + config.n_special - 1,                  ______________________
+         config.vocab_size + config.n_special,
+         ...                                                        -> position embeddings
+         total_num_embeddings - 1]                                  ______________________
+
+    where total_num_embeddings can be obtained as config.total_num_embeddings and is:
+        total_num_embeddings = config.vocab_size + config.n_special + config.n_ctx
+    You should use these indices to index the word, special and position embeddings.
+
+    The special embeddings ([SEP], [CLS]...) are not pre-trained and need to be trained during the fine-tuning if you use them.
+    The number of special embeddings can be controlled using the `set_num_special_tokens(num_special_tokens)` function.
+
+    Params:
+        config: a OpenAIGPTConfig class instance with the configuration to build a new model
+
+    Inputs:
+        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
+            where d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, config.vocab_size[
+        `position_ids`: an optional torch.LongTensor with the same shape as input_ids
+            with the position indices (selected in the range [config.vocab_size + config.n_special, config.vocab_size + config.n_special + config.n_ctx - 1]).
+        `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
+            You can use it to add a third embedding (the previous two being the word and position embeddings)
+            to each token in the sentence.
+        `lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
+            with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
+            is only computed for the labels set in [0, ..., vocab_size]
+
+    Outputs:
+        if `lm_labels` is not `None`:
+            Outputs the language modeling loss.
+        else:
+            `lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, sequence_length, total_num_embeddings]
+                (or more generally [d_1, ..., d_n, total_num_embeddings] where d_1 ... d_n are the dimensions of input_ids)
+
+    Example usage:
+    ```python
+    # Already been converted into BPE token ids
+    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+
+    config = modeling_openai.OpenAIGPTConfig()
+
+    model = modeling_openai.OpenAIGPTLMHeadModel(config)
+    lm_logits = model(input_ids)
+    ```
+    """
+    def __init__(self, config):
+        super(OpenAIGPTLMHeadModel, self).__init__(config)
+        self.transformer = OpenAIGPTModel(config)
+        self.lm_head = OpenAIGPTLMHead(self.transformer.embed.weight, config)
         self.apply(self.init_weights)
 
     def set_num_special_tokens(self, num_special_tokens):
@@ -502,12 +606,74 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
         return lm_logits
 
 class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
-    """ OpenAI GPT model with language model and classification heads """
-    def __init__(self, cfg):
-        super(OpenAIGPTDoubleHeadsModel, self).__init__(cfg)
-        self.transformer = OpenAIGPTModel(cfg)
-        self.lm_head = OpenAIGPTLMHead(self.transformer.embed.weight, cfg)
-        self.multiple_choice_head = OpenAIGPTMultipleChoiceHead(cfg)
+    """OpenAI GPT model with a Language Modeling and a Multiple Choice heads ("Improving Language Understanding by Generative Pre-Training").
+
+    There are two main implementation differences between BERT and the OpenAI GPT:
+        - the use of an LM loss in OpenAI GPT which means the Transformer is trained to predict the NEXT token for each input token
+            vs. predict the SAME token for BERT (i.e. you need to shift your labels to the right)
+        - the use, in OpenAI GPT, of a single embedding matrix to store the word, special ([SEP], [CLS]...) and position embeddings.
+    The embeddings are ordered as follows in the word embeddings matrix:
+        [0,                                                         ----------------------
+         ...                                                        -> word embeddings
+         config.vocab_size - 1,                                     ______________________
+         config.vocab_size,
+         ...                                                        -> special embeddings
+         config.vocab_size + config.n_special - 1,                  ______________________
+         config.vocab_size + config.n_special,
+         ...                                                        -> position embeddings
+         total_num_embeddings - 1]                                  ______________________
+
+    where total_num_embeddings can be obtained as config.total_num_embeddings and is:
+        total_num_embeddings = config.vocab_size + config.n_special + config.n_ctx
+    You should use these indices to index the word, special and position embeddings.
+
+    The special embeddings ([SEP], [CLS]...) are not pre-trained and need to be trained during the fine-tuning if you use them.
+    The number of special embeddings can be controlled using the `set_num_special_tokens(num_special_tokens)` function.
+
+    Params:
+        config: a OpenAIGPTConfig class instance with the configuration to build a new model
+
+    Inputs:
+        `input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length]
+            with the word BPE token indices selected in the range [0, config.vocab_size[
+        `multiple_choice_token_mask`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length]
+            with a value of 1 where the last hidden state is (usually the [CLS] token) and 0 otherwise.
+        `position_ids`: an optional torch.LongTensor with the same shape as input_ids
+            with the position indices (selected in the range [config.vocab_size + config.n_special,
+            config.vocab_size + config.n_special + config.n_ctx - 1[.
+        `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
+            You can use it to add a third embedding (the previous two being the word and position embeddings)
+            to each token in the sentence.
+        `lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, num_choices, sequence_length]
+            with indices selected in [-1, 0, ..., total_num_embeddings]. All labels set to -1 are ignored (masked), the loss
+            is only computed for the labels set in [0, ..., total_num_embeddings]
+        `multiple_choice_labels`: optional multiple choice labels: torch.LongTensor of shape [batch_size]
+            with indices selected in [0, ..., num_choices].
+
+    Outputs:
+        if `lm_labels` and `multiple_choice_labels` are not `None`:
+            Outputs a tuple of losses with the language modeling loss and the multiple choice loss.
+        else: a tuple with
+            `lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, num_choices, sequence_length, total_num_embeddings]
+            `multiple_choice_logits`: the multiple choice logits as a torch.FloatTensor of size [batch_size, num_choices]
+
+    Example usage:
+    ```python
+    # Already been converted into BPE token ids
+    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+    multiple_choice_token_mask = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+
+    config = modeling_openai.OpenAIGPTConfig()
+
+    model = modeling_openai.OpenAIGPTDoubleHeadsModel(config)
+    lm_logits, multiple_choice_logits = model(input_ids, multiple_choice_token_mask)
+    ```
+    """
+    def __init__(self, config):
+        super(OpenAIGPTDoubleHeadsModel, self).__init__(config)
+        self.transformer = OpenAIGPTModel(config)
+        self.lm_head = OpenAIGPTLMHead(self.transformer.embed.weight, config)
+        self.multiple_choice_head = OpenAIGPTMultipleChoiceHead(config)
         self.apply(self.init_weights)
 
     def set_num_special_tokens(self, num_special_tokens):
@@ -517,9 +683,6 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
 
     def forward(self, input_ids, multiple_choice_token_mask, position_ids=None, token_type_ids=None,
                 lm_labels=None, multiple_choice_labels=None):
-        """ input_ids should be of shape B x C x S
-            lm_labels can be masked using the -1 value
-        """
         hidden_states = self.transformer(input_ids, position_ids, token_type_ids)
         lm_logits = self.lm_head(hidden_states)
         multiple_choice_logits = self.multiple_choice_head(hidden_states, multiple_choice_token_mask)

From fa5222c29617edcca3662ebcdcbea79de10bd329 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Thu, 10 Jan 2019 01:25:28 +0100
Subject: [PATCH 06/82] update readme

---
 README.md                  | 224 ++++++++++++++++++++++++---
 examples/run_openai_gpt.py | 304 -------------------------------------
 2 files changed, 206 insertions(+), 322 deletions(-)
 delete mode 100644 examples/run_openai_gpt.py

diff --git a/README.md b/README.md
index 915ccf635a..54291e0779 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,14 @@
-# PyTorch Pretrained Bert
+# PyTorch Pretrained Bert - PyTorch Pretrained OpenAI GPT
 
 [![CircleCI](https://circleci.com/gh/huggingface/pytorch-pretrained-BERT.svg?style=svg)](https://circleci.com/gh/huggingface/pytorch-pretrained-BERT)
 
-This repository contains an op-for-op PyTorch reimplementation of [Google's TensorFlow repository for the BERT model](https://github.com/google-research/bert) that was released together with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
+This repository contains an op-for-op PyTorch reimplementation of [Google's TensorFlow repository for the BERT model](https://github.com/google-research/bert) and of [OpenAI's TensorFlow repository for the OpenAI GPT model](https://github.com/openai/finetune-transformer-lm)
 
-This implementation is provided with [Google's pre-trained models](https://github.com/google-research/bert), examples, notebooks and a command-line interface to load any pre-trained TensorFlow checkpoint for BERT is also provided.
+BERT was released together with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
+This PyTorch implementation of BERT is provided with [Google's pre-trained models](https://github.com/google-research/bert), examples, notebooks and a command-line interface to load any pre-trained TensorFlow checkpoint for BERT is also provided.
+
+OpenAI GPT was released together with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
+This PyTorch implementation of OpenAI GPT is provided with [OpenAI's pre-trained model](https://github.com/openai/finetune-transformer-lm) and a command-line interface that was used to convert the pre-trained NumPy checkpoint into the provided PyTorch model.
 
 ## Content
 
@@ -58,17 +62,31 @@ This package comprises the following classes that can be imported in Python and
   - [`BertForTokenClassification`](./pytorch_pretrained_bert/modeling.py#L949) - BERT Transformer with a token classification head on top (BERT Transformer is **pre-trained**, the token classification head **is only initialized and has to be trained**),
   - [`BertForQuestionAnswering`](./pytorch_pretrained_bert/modeling.py#L1015) - BERT Transformer with a token classification head on top (BERT Transformer is **pre-trained**, the token classification head **is only initialized and has to be trained**).
 
-- Three tokenizers (in the [`tokenization.py`](./pytorch_pretrained_bert/tokenization.py) file):
+- Three PyTorch models (`torch.nn.Module`) for OpenAI with pre-trained weights (in the [`modeling_openai.py`](./pytorch_pretrained_bert/modeling_openai.py) file):
+  - [`OpenAIGPTModel`](./pytorch_pretrained_bert/modeling_openai.py#L537) - raw OpenAI GPT Transformer model (**fully pre-trained**),
+  - [`OpenAIGPTLMHeadModel`](./pytorch_pretrained_bert/modeling_openai.py#L691) - OpenAI GPT Transformer with the tied language modeling head on top (**fully pre-trained**),
+  - [`OpenAIGPTDoubleHeadsModel`](./pytorch_pretrained_bert/modeling_openai.py#L752) - OpenAI GPT Transformer with the tied language modeling head and a multiple choice classification head on top (OpenAI GPT Transformer is **pre-trained**, the multiple choice classification head **is only initialized and has to be trained**),
+
+- Three tokenizers for BERT (in the [`tokenization.py`](./pytorch_pretrained_bert/tokenization.py) file):
   - `BasicTokenizer` - basic tokenization (punctuation splitting, lower casing, etc.),
   - `WordpieceTokenizer` - WordPiece tokenization,
   - `BertTokenizer` - perform end-to-end tokenization, i.e. basic tokenization followed by WordPiece tokenization.
 
-- One optimizer (in the [`optimization.py`](./pytorch_pretrained_bert/optimization.py) file):
+- One tokenizer for OpenAI GPT (in the [`tokenization_openai.py`](./pytorch_pretrained_bert/tokenization_openai.py) file):
+  - `OpenAIGPTTokenizer` - perform Byte-Pair-Encoding (BPE) tokenization,
+
+- One optimizer for BERT (in the [`optimization.py`](./pytorch_pretrained_bert/optimization.py) file):
   - `BertAdam` - Bert version of Adam algorithm with weight decay fix, warmup and linear decay of the learning rate.
 
-- A configuration class (in the [`modeling.py`](./pytorch_pretrained_bert/modeling.py) file):
+- One optimizer for OpenAI GPT (in the [`optimization_openai.py`](./pytorch_pretrained_bert/optimization_openai.py) file):
+  - `OpenAIGPTAdam` - OpenAI GPT version of Adam algorithm with weight decay fix, warmup and linear decay of the learning rate.
+
+- A configuration class for BERT (in the [`modeling.py`](./pytorch_pretrained_bert/modeling.py) file):
   - `BertConfig` - Configuration class to store the configuration of a `BertModel` with utilities to read and write from JSON configuration files.
 
+- A configuration class for OpenAI GPT (in the [`modeling_openai.py`](./pytorch_pretrained_bert/modeling_openai.py) file):
+  - `OpenAIGPTConfig` - Configuration class to store the configuration of a `OpenAIGPTModel` with utilities to read and write from JSON configuration files.
+
 The repository further comprises:
 
 - Five examples on how to use Bert (in the [`examples` folder](./examples)):
@@ -87,12 +105,14 @@ The repository further comprises:
 
   These notebooks are detailed in the [Notebooks](#notebooks) section of this readme.
 
-- A command-line interface to convert any TensorFlow checkpoint in a PyTorch dump:
+- A command-line interface to convert any TensorFlow checkpoint (BERT) and NumPy checkpoint (OpenAI) in a PyTorch dump:
 
   This CLI is detailed in the [Command-line interface](#Command-line-interface) section of this readme.
 
 ## Usage
 
+### BERT
+
 Here is a quick-start example using `BertTokenizer`, `BertModel` and `BertForMaskedLM` class with Google AI's pre-trained `Bert base uncased` model. See the [doc section](#doc) below for all the details on these classes.
 
 First let's prepare a tokenized input with `BertTokenizer`
@@ -152,20 +172,70 @@ predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
 assert predicted_token == 'henson'
 ```
 
+### OpenAI GPT
+
+Here is a quick-start example using `OpenAIGPTTokenizer`, `OpenAIGPTModel` and `OpenAIGPTLMHeadModel` class with OpenAI's pre-trained  model. See the [doc section](#doc) below for all the details on these classes.
+
+First let's prepare a tokenized input with `OpenAIGPTTokenizer`
+
+```python
+import torch
+from pytorch_pretrained_bert import OpenAIGPTTokenizer, OpenAIGPTModel, OpenAIGPTLMHeadModel
+
+# Load pre-trained model tokenizer (vocabulary)
+tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
+
+# Tokenized input
+text = "Who was Jim Henson ? Jim Henson was a puppeteer"
+tokenized_text = tokenizer.tokenize(text)
+
+# Convert token to vocabulary indices
+indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
+
+# Convert inputs to PyTorch tensors
+tokens_tensor = torch.tensor([indexed_tokens])
+```
+
+Let's see how to use `OpenAIGPTModel` to get hidden states
+
+```python
+# Load pre-trained model (weights)
+model = OpenAIGPTModel.from_pretrained('openai-gpt')
+model.eval()
+
+# Predict the hidden states at the top of the model
+hidden_states = model(tokens_tensor)
+```
+
+And how to use `OpenAIGPTLMHeadModel`
+
+```python
+# Load pre-trained model (weights)
+model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
+model.eval()
+
+# Predict all tokens
+predictions = model(tokens_tensor)
+
+# get the predicted last token
+predicted_index = torch.argmax(predictions[0, -1]).item()
+predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
+```
+
 ## Doc
 
 Here is a detailed documentation of the classes in the package and how to use them:
 
 | Sub-section | Description |
 |-|-|
-| [Loading Google AI's pre-trained weigths](#Loading-Google-AIs-pre-trained-weigths-and-PyTorch-dump) | How to load Google AI's pre-trained weight or a PyTorch saved instance |
+| [Loading Google AI's/OpenAI's pre-trained weights](#Loading-Google-AI-or-OpenAI-pre-trained-weigths-or-PyTorch-dump) | How to load Google AI's/OpenAI's pre-trained weights or a PyTorch saved instance |
 | [PyTorch models](#PyTorch-models) | API of the eight PyTorch model classes: `BertModel`, `BertForMaskedLM`, `BertForNextSentencePrediction`, `BertForPreTraining`, `BertForSequenceClassification`, `BertForMultipleChoice` or `BertForQuestionAnswering` |
 | [Tokenizer: `BertTokenizer`](#Tokenizer-BertTokenizer) | API of the `BertTokenizer` class|
 | [Optimizer: `BertAdam`](#Optimizer-BertAdam) |  API of the `BertAdam` class |
 
-### Loading Google AI's pre-trained weigths and PyTorch dump
+### Loading Google AI or OpenAI pre-trained weigths or PyTorch dump
 
-To load one of Google AI's pre-trained models or a PyTorch saved model (an instance of `BertForPreTraining` saved with `torch.save()`), the PyTorch model classes and the tokenizer can be instantiated as
+To load one of Google AI's, OpenAI's pre-trained models or a PyTorch saved model (an instance of `BertForPreTraining` saved with `torch.save()`), the PyTorch model classes and the tokenizer can be instantiated as
 
 ```python
 model = BERT_CLASS.from_pretrained(PRE_TRAINED_MODEL_NAME_OR_PATH, cache_dir=None)
@@ -173,10 +243,10 @@ model = BERT_CLASS.from_pretrained(PRE_TRAINED_MODEL_NAME_OR_PATH, cache_dir=Non
 
 where
 
-- `BERT_CLASS` is either the `BertTokenizer` class (to load the vocabulary) or one of the eight PyTorch model classes (to load the pre-trained weights): `BertModel`, `BertForMaskedLM`, `BertForNextSentencePrediction`, `BertForPreTraining`, `BertForSequenceClassification`, `BertForTokenClassification`, `BertForMultipleChoice` or `BertForQuestionAnswering`, and
+- `BERT_CLASS` is either a tokenizer to load the vocabulary (`BertTokenizer` or `OpenAIGPTTokenizer` classes) or one of the eight BERT or three OpenAI GPT PyTorch model classes (to load the pre-trained weights): `BertModel`, `BertForMaskedLM`, `BertForNextSentencePrediction`, `BertForPreTraining`, `BertForSequenceClassification`, `BertForTokenClassification`, `BertForMultipleChoice`, `BertForQuestionAnswering`, `OpenAIGPTModel`, `OpenAIGPTLMHeadModel` or `OpenAIGPTDoubleHeadsModel`, and
 - `PRE_TRAINED_MODEL_NAME_OR_PATH` is either:
 
-  - the shortcut name of a Google AI's pre-trained model selected in the list:
+  - the shortcut name of a Google AI's or OpenAI's pre-trained model selected in the list:
 
     - `bert-base-uncased`: 12-layer, 768-hidden, 12-heads, 110M parameters
     - `bert-large-uncased`: 24-layer, 1024-hidden, 16-heads, 340M parameters
@@ -185,11 +255,12 @@ where
     - `bert-base-multilingual-uncased`: (Orig, not recommended) 102 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
     - `bert-base-multilingual-cased`: **(New, recommended)** 104 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
     - `bert-base-chinese`: Chinese Simplified and Traditional, 12-layer, 768-hidden, 12-heads, 110M parameters
+    - `openai-gpt`: OpenAI English model, 12-layer, 768-hidden, 12-heads, 110M parameters
 
   - a path or url to a pretrained model archive containing:
 
-    - `bert_config.json` a configuration file for the model, and
-    - `pytorch_model.bin` a PyTorch dump of a pre-trained instance `BertForPreTraining` (saved with the usual `torch.save()`)
+    - `bert_config.json` or `openai_gpt_config.json` a configuration file for the model, and
+    - `pytorch_model.bin` a PyTorch dump of a pre-trained instance of `BertForPreTraining` or `OpenAIGPTModel` (saved with the usual `torch.save()`)
 
   If `PRE_TRAINED_MODEL_NAME_OR_PATH` is a shortcut name, the pre-trained weights will be downloaded from AWS S3 (see the links [here](pytorch_pretrained_bert/modeling.py)) and stored in a cache folder to avoid future download (the cache folder can be found at `~/.pytorch_pretrained_bert/`).
 - `cache_dir` can be an optional path to a specific directory to download and cache the pre-trained model weights. This option is useful in particular when you are using distributed training: to avoid concurrent access to the same weights you can set for example `cache_dir='./pretrained_model_{}'.format(args.local_rank)` (see the section on distributed training for more information).
@@ -198,10 +269,15 @@ where
 
 **When using an `uncased model`, make sure to pass `--do_lower_case` to the example training scripts (or pass `do_lower_case=True` to FullTokenizer if you're using your own script and loading the tokenizer your-self.).**
 
-Example:
+Examples:
 ```python
+# BERT
 tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
 model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
+
+# OpenAI GPT
+tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
+model = OpenAIGPTModel.from_pretrained('openai-gpt')
 ```
 
 ### PyTorch models
@@ -311,7 +387,78 @@ The token-level classifier takes as input the full sequence of the last hidden s
 
 An example on how to use this class is given in the [`run_squad.py`](./examples/run_squad.py) script which can be used to fine-tune a token classifier using BERT, for example for the SQuAD task.
 
-### Tokenizer: `BertTokenizer`
+#### 9. `OpenAIGPTModel`
+
+`OpenAIGPTModel` is the basic OpenAI GPT Transformer model with a layer of summed token and position embeddings followed by a series of 12 identical self-attention blocks.
+
+The main implementation difference between BERT and OpenAI GPT is the use, in OpenAI GPT, of a single embedding matrix to store the word, special (`[SEP]`, `[CLS]`...) token and position embeddings.
+The embeddings are ordered as follows in the word embeddings matrix:
+
+    [0,                                                         ----------------------
+      ...                                                        -> word embeddings
+      config.vocab_size - 1,                                     ______________________
+      config.vocab_size,
+      ...                                                        -> special embeddings
+      config.vocab_size + config.n_special - 1,                  ______________________
+      config.vocab_size + config.n_special,
+      ...                                                        -> position embeddings
+      total_num_embeddings - 1]                                  ______________________
+
+where total_num_embeddings can be obtained as config.total_num_embeddings and is:
+
+    total_num_embeddings = config.vocab_size + config.n_special + config.n_ctx
+You should use the associated indices to index the embeddings.
+
+The special tokens embeddings (`[SEP]`, `[CLS]`...) are not pre-trained and need to be trained during the fine-tuning if you use them.
+
+The number of special embeddings can be controlled using the `set_num_special_tokens(num_special_tokens)` function.
+
+The inputs and output are **identical to the TensorFlow model inputs and outputs**.
+
+We detail them here. This model takes as *inputs*:
+[`modeling_openai.py`](./pytorch_pretrained_bert/modeling_openai.py)
+- `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length] where d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, config.vocab_size[
+- `position_ids`: an optional torch.LongTensor with the same shape as input_ids with the position indices (selected in the range [config.vocab_size + config.n_special, config.vocab_size + config.n_special + config.n_ctx - 1]).
+- `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids. You can use it to add a third embedding (the previous two being the word and position embeddings) to each token in the sentence.
+
+This model *outputs*:
+- `hidden_states`: the encoded-hidden-states at the top of the model as a torch.FloatTensor of size [batch_size, sequence_length, hidden_size] (or more generally [d_1, ..., d_n, hidden_size] where d_1 ... d_n are the dimensions of input_ids)
+
+#### 10. `OpenAIGPTLMHeadModel`
+
+`OpenAIGPTLMHeadModel` includes the `OpenAIGPTModel` Transformer followed by a language modeling head with weights tied to the input embeddings (no additional parameters).
+
+*Inputs* are the same as the inputs of the [`OpenAIGPTModel`](#9-openaigptmodel) class plus optional labels:
+- `lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss is only computed for the labels set in [0, ..., vocab_size].
+
+*Outputs*:
+- if `lm_labels` is not `None`:
+  Outputs the language modeling loss.
+- else:
+  Outputs `lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, sequence_length, total_num_embeddings] (or more generally [d_1, ..., d_n, total_num_embeddings] where d_1 ... d_n are the dimensions of input_ids)
+
+#### 11. `OpenAIGPTDoubleHeadsModel`
+
+`OpenAIGPTDoubleHeadsModel` includes the `OpenAIGPTModel` Transformer followed by two heads:
+- a language modeling head with weights tied to the input embeddings (no additional parameters) and:
+- a multiple choice classifier (linear layer).
+
+*Inputs* are the same as the inputs of the [`OpenAIGPTModel`](#9-openaigptmodel) class plus a classification mask and two optional labels:
+- `multiple_choice_token_mask`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length] with a value of 1 where the last hidden state is (usually the [CLS] token) and 0 otherwise.
+- `lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss is only computed for the labels set in [0, ..., vocab_size].
+- `multiple_choice_labels`: optional multiple choice labels: torch.LongTensor of shape [batch_size] with indices selected in [0, ..., num_choices].
+
+*Outputs*:
+- if `lm_labels` and `multiple_choice_labels` are not `None`:
+  Outputs a tuple of losses with the language modeling loss and the multiple choice loss.
+- else Outputs a tuple with:
+  - `lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, num_choices, sequence_length, total_num_embeddings]
+  - `multiple_choice_logits`: the multiple choice logits as a torch.FloatTensor of size [batch_size, num_choices]
+
+
+### Tokenizers:
+
+#### `BertTokenizer`
 
 `BertTokenizer` perform end-to-end tokenization, i.e. basic tokenization followed by WordPiece tokenization.
 
@@ -328,7 +475,26 @@ and three methods:
 
 Please refer to the doc strings and code in [`tokenization.py`](./pytorch_pretrained_bert/tokenization.py) for the details of the `BasicTokenizer` and `WordpieceTokenizer` classes. In general it is recommended to use `BertTokenizer` unless you know what you are doing.
 
-### Optimizer: `BertAdam`
+#### `OpenAIGPTTokenizer`
+
+`OpenAIGPTTokenizer` performs Byte-Pair-Encoding (BPE) tokenization.
+
+This class has two arguments:
+
+- `vocab_file`: path to a vocabulary file.
+- `merges_file`: path to a file containing the BPE merges.
+
+and three methods:
+
+- `tokenize(text)`: convert a `str` in a list of `str` tokens by performing Byte-Pair-Encoding (BPE) tokenization.
+- `convert_tokens_to_ids(tokens)`: convert a list of `str` tokens in a list of `int` indices in the vocabulary.
+- `convert_ids_to_tokens(tokens)`: convert a list of `int` indices in a list of `str` tokens in the vocabulary.
+
+Please refer to the doc strings and code in [`tokenization_openai.py`](./pytorch_pretrained_bert/tokenization_openai.py) for the details of the `OpenAIGPTTokenizer`.
+
+### Optimizers:
+
+#### `BertAdam`
 
 `BertAdam` is a `torch.optimizer` adapted to be closer to the optimizer used in the TensorFlow implementation of Bert. The differences with PyTorch Adam optimizer are the following:
 
@@ -348,6 +514,13 @@ The optimizer accepts the following arguments:
 - `weight_decay:` Weight decay. Default : `0.01`
 - `max_grad_norm` : Maximum norm for the gradients (`-1` means no clipping). Default : `1.0`
 
+#### `OpenAIGPTAdam`
+
+`OpenAIGPTAdam` is similar to `BertAdam`.
+The difference with `BertAdam` is that `OpenAIGPTAdam` compensates for bias as in the regular Adam optimizer.
+
+`OpenAIGPTAdam` accepts the same arguments as `BertAdam`.
+
 ## Examples
 
 | Sub-section | Description |
@@ -587,7 +760,9 @@ Please follow the instructions given in the notebooks to run and modify them.
 
 ## Command-line interface
 
-A command-line interface is provided to convert a TensorFlow checkpoint in a PyTorch dump of the `BertForPreTraining` class  (see above).
+A command-line interface is provided to convert a TensorFlow checkpoint into a PyTorch dump of the `BertForPreTraining` class (for BERT) or a NumPy checkpoint into a PyTorch dump of the `OpenAIGPTModel` class (for OpenAI GPT).
+
+### BERT
 
 You can convert any TensorFlow checkpoint for BERT (in particular [the pre-trained models released by Google](https://github.com/google-research/bert#pre-trained-models)) in a PyTorch save file by using the [`./pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py`](convert_tf_checkpoint_to_pytorch.py) script.
 
@@ -610,6 +785,19 @@ pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch \
 
 You can download Google's pre-trained models for the conversion [here](https://github.com/google-research/bert#pre-trained-models).
 
+### OpenAI GPT
+
+Here is an example of the conversion process for a pre-trained OpenAI GPT model, assuming that your NumPy checkpoint is saved in the same format as the OpenAI pre-trained model (see [here](https://github.com/openai/finetune-transformer-lm)).
+
+```shell
+export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights
+
+pytorch_pretrained_bert convert_openai_checkpoint \
+  $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \
+  $PYTORCH_DUMP_OUTPUT \
+  [OPENAI_GPT_CONFIG]
+```
+
 ## TPU
 
 TPU support and pretraining scripts
diff --git a/examples/run_openai_gpt.py b/examples/run_openai_gpt.py
deleted file mode 100644
index 4d00276d88..0000000000
--- a/examples/run_openai_gpt.py
+++ /dev/null
@@ -1,304 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-" Run OpenAI GPT on RocStories"
-import argparse
-import os
-import random
-import logging
-
-from sklearn.metrics import accuracy_score
-from sklearn.utils import shuffle
-
-# from analysis import rocstories as rocstories_analysis
-# from datasets import rocstories
-# from model_pytorch import DoubleHeadModel, load_openai_pretrained_model
-# from opt import OpenAIAdam
-# from text_utils import TextEncoder
-# from utils import (encode_dataset, iter_data,
-#                    ResultLogger, make_path)
-# from loss import MultipleChoiceLossCompute
-
-import numpy as np
-import torch
-from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
-from torch.utils.data.distributed import DistributedSampler
-
-from pytorch_pretrained_bert.tokenization_openai import OpenAIGPTTokenizer
-from pytorch_pretrained_bert.modeling_openai import OpenAIGPTDoubleHeadsModel
-from pytorch_pretrained_bert.optimization_openai import OpenAIAdam
-from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
-
-logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
-                    datefmt = '%m/%d/%Y %H:%M:%S',
-                    level = logging.INFO)
-logger = logging.getLogger(__name__)
-
-def transform_roc(X1, X2, X3):
-    n_batch = len(X1)
-    xmb = np.zeros((n_batch, 2, n_ctx, 2), dtype=np.int32)
-    mmb = np.zeros((n_batch, 2, n_ctx), dtype=np.float32)
-    start = encoder['_start_']
-    delimiter = encoder['_delimiter_']
-    for i, (x1, x2, x3), in enumerate(zip(X1, X2, X3)):
-        x12 = [start] + x1[:max_len] + [delimiter] + x2[:max_len] + [clf_token]
-        x13 = [start] + x1[:max_len] + [delimiter] + x3[:max_len] + [clf_token]
-        l12 = len(x12)
-        l13 = len(x13)
-        xmb[i, 0, :l12, 0] = x12
-        xmb[i, 1, :l13, 0] = x13
-        mmb[i, 0, :l12] = 1
-        mmb[i, 1, :l13] = 1
-    # Position information that is added to the input embeddings in the TransformerModel
-    xmb[:, :, :, 1] = np.arange(n_vocab + n_special, n_vocab + n_special + n_ctx)
-    return xmb, mmb
-
-
-def iter_apply(Xs, Ms, Ys):
-    # fns = [lambda x: np.concatenate(x, 0), lambda x: float(np.sum(x))]
-    logits = []
-    cost = 0
-    with torch.no_grad():
-        dh_model.eval()
-        for xmb, mmb, ymb in iter_data(Xs, Ms, Ys, n_batch=n_batch_train, truncate=False, verbose=True):
-            n = len(xmb)
-            XMB = torch.tensor(xmb, dtype=torch.long).to(device)
-            YMB = torch.tensor(ymb, dtype=torch.long).to(device)
-            MMB = torch.tensor(mmb).to(device)
-            _, clf_logits = dh_model(XMB)
-            clf_logits *= n
-            clf_losses = compute_loss_fct(XMB, YMB, MMB, clf_logits, only_return_losses=True)
-            clf_losses *= n
-            logits.append(clf_logits.to("cpu").numpy())
-            cost += clf_losses.sum().item()
-        logits = np.concatenate(logits, 0)
-    return logits, cost
-
-
-def iter_predict(Xs, Ms):
-    logits = []
-    with torch.no_grad():
-        dh_model.eval()
-        for xmb, mmb in iter_data(Xs, Ms, n_batch=n_batch_train, truncate=False, verbose=True):
-            n = len(xmb)
-            XMB = torch.tensor(xmb, dtype=torch.long).to(device)
-            MMB = torch.tensor(mmb).to(device)
-            _, clf_logits = dh_model(XMB)
-            logits.append(clf_logits.to("cpu").numpy())
-    logits = np.concatenate(logits, 0)
-    return logits
-
-
-def log(save_dir, desc):
-    global best_score
-    print("Logging")
-    tr_logits, tr_cost = iter_apply(trX[:n_valid], trM[:n_valid], trY[:n_valid])
-    va_logits, va_cost = iter_apply(vaX, vaM, vaY)
-    tr_cost = tr_cost / len(trY[:n_valid])
-    va_cost = va_cost / n_valid
-    tr_acc = accuracy_score(trY[:n_valid], np.argmax(tr_logits, 1)) * 100.
-    va_acc = accuracy_score(vaY, np.argmax(va_logits, 1)) * 100.
-    logger.log(n_epochs=n_epochs, n_updates=n_updates, tr_cost=tr_cost, va_cost=va_cost, tr_acc=tr_acc, va_acc=va_acc)
-    print('%d %d %.3f %.3f %.2f %.2f' % (n_epochs, n_updates, tr_cost, va_cost, tr_acc, va_acc))
-    if submit:
-        score = va_acc
-        if score > best_score:
-            best_score = score
-            path = os.path.join(save_dir, desc, 'best_params')
-            torch.save(dh_model.state_dict(), make_path(path))
-
-
-def predict(dataset, submission_dir):
-    filename = filenames[dataset]
-    pred_fn = pred_fns[dataset]
-    label_decoder = label_decoders[dataset]
-    predictions = pred_fn(iter_predict(teX, teM))
-    if label_decoder is not None:
-        predictions = [label_decoder[prediction] for prediction in predictions]
-    path = os.path.join(submission_dir, filename)
-    os.makedirs(os.path.dirname(path), exist_ok=True)
-    with open(path, 'w') as f:
-        f.write('{}\t{}\n'.format('index', 'prediction'))
-        for i, prediction in enumerate(predictions):
-            f.write('{}\t{}\n'.format(i, prediction))
-
-
-def run_epoch():
-    for xmb, mmb, ymb in iter_data(*shuffle(trX, trM, trYt, random_state=np.random),
-                                   n_batch=n_batch_train, truncate=True, verbose=True):
-        global n_updates
-        dh_model.train()
-        XMB = torch.tensor(xmb, dtype=torch.long).to(device)
-        YMB = torch.tensor(ymb, dtype=torch.long).to(device)
-        MMB = torch.tensor(mmb).to(device)
-        lm_logits, clf_logits = dh_model(XMB)
-        compute_loss_fct(XMB, YMB, MMB, clf_logits, lm_logits)
-        n_updates += 1
-        if n_updates in [1000, 2000, 4000, 8000, 16000, 32000] and n_epochs == 0:
-            log(save_dir, desc)
-
-
-argmax = lambda x: np.argmax(x, 1)
-
-pred_fns = {
-    'rocstories': argmax,
-}
-
-filenames = {
-    'rocstories': 'ROCStories.tsv',
-}
-
-label_decoders = {
-    'rocstories': None,
-}
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--desc', type=str, help="Description")
-    parser.add_argument('--dataset', type=str)
-    parser.add_argument('--log_dir', type=str, default='log/')
-    parser.add_argument('--save_dir', type=str, default='save/')
-    parser.add_argument('--data_dir', type=str, default='data/')
-    parser.add_argument('--submission_dir', type=str, default='submission/')
-    parser.add_argument('--submit', action='store_true')
-    parser.add_argument('--analysis', action='store_true')
-    parser.add_argument('--seed', type=int, default=42)
-    parser.add_argument('--n_iter', type=int, default=3)
-    parser.add_argument('--n_batch', type=int, default=8)
-    parser.add_argument('--max_grad_norm', type=int, default=1)
-    parser.add_argument('--lr', type=float, default=6.25e-5)
-    parser.add_argument('--lr_warmup', type=float, default=0.002)
-    parser.add_argument('--n_ctx', type=int, default=512)
-    parser.add_argument('--n_embd', type=int, default=768)
-    parser.add_argument('--n_head', type=int, default=12)
-    parser.add_argument('--n_layer', type=int, default=12)
-    parser.add_argument('--embd_pdrop', type=float, default=0.1)
-    parser.add_argument('--attn_pdrop', type=float, default=0.1)
-    parser.add_argument('--resid_pdrop', type=float, default=0.1)
-    parser.add_argument('--clf_pdrop', type=float, default=0.1)
-    parser.add_argument('--l2', type=float, default=0.01)
-    parser.add_argument('--vector_l2', action='store_true')
-    parser.add_argument('--opt', type=str, default='adam')
-    parser.add_argument('--afn', type=str, default='gelu')
-    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
-    parser.add_argument('--encoder_path', type=str, default='model/encoder_bpe_40000.json')
-    parser.add_argument('--bpe_path', type=str, default='model/vocab_40000.bpe')
-    parser.add_argument('--n_transfer', type=int, default=12)
-    parser.add_argument('--lm_coef', type=float, default=0.5)
-    parser.add_argument('--b1', type=float, default=0.9)
-    parser.add_argument('--b2', type=float, default=0.999)
-    parser.add_argument('--e', type=float, default=1e-8)
-    parser.add_argument('--n_valid', type=int, default=374)
-
-    args = parser.parse_args()
-    print(args)
-
-    random.seed(args.seed)
-    np.random.seed(args.seed)
-    torch.manual_seed(args.seed)
-    torch.cuda.manual_seed_all(args.seed)
-
-    # Constants
-    submit = args.submit
-    dataset = args.dataset
-    n_ctx = args.n_ctx
-    save_dir = args.save_dir
-    desc = args.desc
-    data_dir = args.data_dir
-    log_dir = args.log_dir
-    submission_dir = args.submission_dir
-
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    n_gpu = torch.cuda.device_count()
-    print("device", device, "n_gpu", n_gpu)
-
-    logger = ResultLogger(path=os.path.join(log_dir, '{}.jsonl'.format(desc)), **args.__dict__)
-    text_encoder = TextEncoder(args.encoder_path, args.bpe_path)
-    encoder = text_encoder.encoder
-    n_vocab = len(text_encoder.encoder)
-
-    print("Encoding dataset...")
-    ((trX1, trX2, trX3, trY),
-     (vaX1, vaX2, vaX3, vaY),
-     (teX1, teX2, teX3)) = encode_dataset(*rocstories(data_dir, n_valid=args.n_valid),
-                                          encoder=text_encoder)
-    encoder['_start_'] = len(encoder)
-    encoder['_delimiter_'] = len(encoder)
-    encoder['_classify_'] = len(encoder)
-    clf_token = encoder['_classify_']
-    n_special = 3
-    max_len = n_ctx // 2 - 2
-    n_ctx = min(max(
-        [len(x1[:max_len]) + max(len(x2[:max_len]),
-                                 len(x3[:max_len])) for x1, x2, x3 in zip(trX1, trX2, trX3)]
-        + [len(x1[:max_len]) + max(len(x2[:max_len]),
-                                   len(x3[:max_len])) for x1, x2, x3 in zip(vaX1, vaX2, vaX3)]
-        + [len(x1[:max_len]) + max(len(x2[:max_len]),
-                                   len(x3[:max_len])) for x1, x2, x3 in zip(teX1, teX2, teX3)]
-        ) + 3, n_ctx)
-    vocab = n_vocab + n_special + n_ctx
-    trX, trM = transform_roc(trX1, trX2, trX3)
-    vaX, vaM = transform_roc(vaX1, vaX2, vaX3)
-    if submit:
-        teX, teM = transform_roc(teX1, teX2, teX3)
-
-    n_train = len(trY)
-    n_valid = len(vaY)
-    n_batch_train = args.n_batch * max(n_gpu, 1)
-    n_updates_total = (n_train // n_batch_train) * args.n_iter
-
-    dh_model = DoubleHeadModel(args, clf_token, 'multiple_choice', vocab, n_ctx)
-
-    criterion = nn.CrossEntropyLoss(reduce=False)
-    model_opt = OpenAIAdam(dh_model.parameters(),
-                           lr=args.lr,
-                           schedule=args.lr_schedule,
-                           warmup=args.lr_warmup,
-                           t_total=n_updates_total,
-                           b1=args.b1,
-                           b2=args.b2,
-                           e=args.e,
-                           l2=args.l2,
-                           vector_l2=args.vector_l2,
-                           max_grad_norm=args.max_grad_norm)
-    compute_loss_fct = MultipleChoiceLossCompute(criterion,
-                                                 criterion,
-                                                 args.lm_coef,
-                                                 model_opt)
-    load_openai_pretrained_model(dh_model.transformer, n_ctx=n_ctx, n_special=n_special)
-
-    dh_model.to(device)
-    dh_model = nn.DataParallel(dh_model)
-
-    n_updates = 0
-    n_epochs = 0
-    if dataset != 'stsb':
-        trYt = trY
-    if submit:
-        path = os.path.join(save_dir, desc, 'best_params')
-        torch.save(dh_model.state_dict(), make_path(path))
-    best_score = 0
-    for i in range(args.n_iter):
-        print("running epoch", i)
-        run_epoch()
-        n_epochs += 1
-        log(save_dir, desc)
-    if submit:
-        path = os.path.join(save_dir, desc, 'best_params')
-        dh_model.load_state_dict(torch.load(path))
-        predict(dataset, args.submission_dir)
-        if args.analysis:
-            rocstories_analysis(data_dir, os.path.join(args.submission_dir, 'ROCStories.tsv'),
-                                os.path.join(log_dir, 'rocstories.jsonl'))

From e5c78c6684b29b0954f326c4f07926987921ba38 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Thu, 10 Jan 2019 01:40:00 +0100
Subject: [PATCH 07/82] update readme and few typos

---
 README.md                           | 8 ++++----
 examples/extract_features.py        | 4 ++--
 pytorch_pretrained_bert/modeling.py | 2 +-
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 54291e0779..be0765f4bb 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# PyTorch Pretrained Bert - PyTorch Pretrained OpenAI GPT
+# PyTorch Pretrained Bert (also with PyTorch Pretrained OpenAI GPT)
 
 [![CircleCI](https://circleci.com/gh/huggingface/pytorch-pretrained-BERT.svg?style=svg)](https://circleci.com/gh/huggingface/pytorch-pretrained-BERT)
 
@@ -125,18 +125,18 @@ from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
 tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
 
 # Tokenized input
-text = "Who was Jim Henson ? Jim Henson was a puppeteer"
+text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
 tokenized_text = tokenizer.tokenize(text)
 
 # Mask a token that we will try to predict back with `BertForMaskedLM`
 masked_index = 6
 tokenized_text[masked_index] = '[MASK]'
-assert tokenized_text == ['who', 'was', 'jim', 'henson', '?', 'jim', '[MASK]', 'was', 'a', 'puppet', '##eer']
+assert tokenized_text == ['[CLS]', 'who', 'was', 'jim', 'henson', '?', '[SEP]', 'jim', '[MASK]', 'was', 'a', 'puppet', '##eer', '[SEP]']
 
 # Convert token to vocabulary indices
 indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
 # Define sentence A and B indices associated to 1st and 2nd sentences (see paper)
-segments_ids = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
+segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
 
 # Convert inputs to PyTorch tensors
 tokens_tensor = torch.tensor([indexed_tokens])
diff --git a/examples/extract_features.py b/examples/extract_features.py
index 9d05d7905d..593576bdcb 100644
--- a/examples/extract_features.py
+++ b/examples/extract_features.py
@@ -80,10 +80,10 @@ def convert_examples_to_features(examples, seq_length, tokenizer):
         # The convention in BERT is:
         # (a) For sequence pairs:
         #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
-        #  type_ids: 0   0  0    0    0     0       0 0    1  1  1  1   1 1
+        #  type_ids:   0   0  0    0    0     0      0   0    1  1  1   1  1   1
         # (b) For single sequences:
         #  tokens:   [CLS] the dog is hairy . [SEP]
-        #  type_ids: 0   0   0   0  0     0 0
+        #  type_ids:   0   0   0   0  0     0   0
         #
         # Where "type_ids" are used to indicate whether this is the first
         # sequence or the second sequence. The embedding vectors for `type=0` and
diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py
index 6a05873b20..591082f7ce 100644
--- a/pytorch_pretrained_bert/modeling.py
+++ b/pytorch_pretrained_bert/modeling.py
@@ -584,7 +584,7 @@ class BertModel(BertPreTrainedModel):
                 to the last attention block of shape [batch_size, sequence_length, hidden_size],
         `pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a
             classifier pretrained on top of the hidden state associated to the first character of the
-            input (`CLF`) to train on the Next-Sentence task (see BERT's paper).
+            input (`CLS`) to train on the Next-Sentence task (see BERT's paper).
 
     Example usage:
     ```python

From 3a9c88377fed787d97f98e1a59ed33ab413e3705 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 15 Jan 2019 12:59:38 +0100
Subject: [PATCH 08/82] adding Transformer XL

---
 ...onvert_transfo_xl_checkpoint_to_pytorch.py |  125 ++
 pytorch_pretrained_bert/modeling_openai.py    |   17 +
 .../modeling_transfo_xl.py                    | 1432 +++++++++++++++++
 .../modeling_transfo_xl_utilities.py          |  314 ++++
 .../tokenization_transfo_xl.py                |  508 ++++++
 5 files changed, 2396 insertions(+)
 create mode 100755 pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py
 create mode 100644 pytorch_pretrained_bert/modeling_transfo_xl.py
 create mode 100644 pytorch_pretrained_bert/modeling_transfo_xl_utilities.py
 create mode 100644 pytorch_pretrained_bert/tokenization_transfo_xl.py

diff --git a/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py b/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py
new file mode 100755
index 0000000000..03f71defd6
--- /dev/null
+++ b/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py
@@ -0,0 +1,125 @@
+# coding=utf-8
+# Copyright 2018 The HugginFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Convert Transformer XL checkpoint."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import re
+import argparse
+import tensorflow as tf
+import torch
+import numpy as np
+
+from .modeling_transfo_xl import TransfoXLConfig, TransfoXLModel, CONFIG_NAME, WEIGHTS_NAME
+
+
+def convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path,
+                                             transfo_xl_config_file,
+                                             pytorch_dump_folder_path):
+    config_path = os.path.abspath(transfo_xl_config_file)
+    tf_path = os.path.abspath(tf_checkpoint_path)
+
+    print("Converting Transformer XL checkpoint from {} with config at {}".format(tf_path, config_path))
+    # Load weights from TF model
+    init_vars = tf.train.list_variables(tf_path)
+    names = []
+    arrays = []
+    for name, shape in init_vars:
+        print("Loading TF weight {} with shape {}".format(name, shape))
+        array = tf.train.load_variable(tf_path, name)
+        names.append(name)
+        arrays.append(array)
+
+    # Initialise PyTorch model
+    # Construct model
+    if transfo_xl_config_file == "":
+        config = TransfoXLConfig()
+    else:
+        config = TransfoXLConfig(transfo_xl_config_file)
+    print("Building PyTorch model from configuration: {}".format(str(config)))
+    model = TransfoXLModel(config)
+
+    for name, array in zip(names, arrays):
+        name = name.split('/')
+        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v
+        # which are not required for using pretrained model
+        if any(n in ["adam_v", "adam_m"] for n in name):
+            print("Skipping {}".format("/".join(name)))
+            continue
+        pointer = model
+        for m_name in name:
+            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
+                l = re.split(r'_(\d+)', m_name)
+            else:
+                l = [m_name]
+            if l[0] == 'kernel' or l[0] == 'gamma':
+                pointer = getattr(pointer, 'weight')
+            elif l[0] == 'output_bias' or l[0] == 'beta':
+                pointer = getattr(pointer, 'bias')
+            elif l[0] == 'output_weights':
+                pointer = getattr(pointer, 'weight')
+            else:
+                pointer = getattr(pointer, l[0])
+            if len(l) >= 2:
+                num = int(l[1])
+                pointer = pointer[num]
+        if m_name[-11:] == '_embeddings':
+            pointer = getattr(pointer, 'weight')
+        elif m_name == 'kernel':
+            array = np.transpose(array)
+        try:
+            assert pointer.shape == array.shape
+        except AssertionError as e:
+            e.args += (pointer.shape, array.shape)
+            raise
+        print("Initialize PyTorch weight {}".format(name))
+        pointer.data = torch.from_numpy(array)
+
+    # Save pytorch-model
+    pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME
+    pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME
+    print("Save PyTorch model to {}".format(pytorch_weights_dump_path))
+    torch.save(model.state_dict(), pytorch_weights_dump_path)
+    print("Save configuration file to {}".format(pytorch_config_dump_path))
+    with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
+        f.write(config.to_json_string())
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    ## Required parameters
+    parser.add_argument("--tf_checkpoint_path",
+                        default = None,
+                        type = str,
+                        required = True,
+                        help = "Path the TensorFlow checkpoint path.")
+    parser.add_argument("--transfo_xl_config_file",
+                        default = None,
+                        type = str,
+                        required = True,
+                        help = "The config json file corresponding to the pre-trained BERT model. \n"
+                            "This specifies the model architecture.")
+    parser.add_argument("--pytorch_dump_folder_path",
+                        default = None,
+                        type = str,
+                        required = True,
+                        help = "Path to the output PyTorch model.")
+    args = parser.parse_args()
+    convert_transfo_xl_checkpoint_to_pytorch(args.tf_checkpoint_path,
+                                     args.transfo_xl_config_file,
+                                     args.pytorch_dump_folder_path)
diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py
index 40557c9626..c3cd165e68 100644
--- a/pytorch_pretrained_bert/modeling_openai.py
+++ b/pytorch_pretrained_bert/modeling_openai.py
@@ -1,3 +1,20 @@
+# coding=utf-8
+# Copyright 2018 The OpenAI Team Authors and HugginFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch OpenAI GPT model."""
+
 import os
 import copy
 import json
diff --git a/pytorch_pretrained_bert/modeling_transfo_xl.py b/pytorch_pretrained_bert/modeling_transfo_xl.py
new file mode 100644
index 0000000000..fccd4616e4
--- /dev/null
+++ b/pytorch_pretrained_bert/modeling_transfo_xl.py
@@ -0,0 +1,1432 @@
+# coding=utf-8
+# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HugginFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch Transformer XL model.
+    Directly adapted from https://github.com/kimiyoung/transformer-xl.
+    In particular https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/mem_transformer.py
+"""
+
+import os
+import copy
+import json
+import math
+import logging
+import tarfile
+import tempfile
+import shutil
+import collections
+
+import torch
+import torch.nn as nn
+from torch.nn import CrossEntropyLoss
+from torch.nn.parameter import Parameter
+
+from .modeling import BertLayerNorm as LayerNorm
+from .file_utils import cached_path
+
+logger = logging.getLogger(__name__)
+
+PRETRAINED_MODEL_ARCHIVE_MAP = {
+    'transfo-xl': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl.tar.gz",
+}
+CONFIG_NAME = 'transfo_xl_config.json'
+WEIGHTS_NAME = 'pytorch_model.bin'
+
+class TransfoXLConfig(object):
+    """Configuration class to store the configuration of a `TransfoXLModel`.
+    """
+    def __init__(self,
+                 vocab_size_or_config_json_file=267735,
+                 cutoffs=[20000, 40000, 200000],
+                 d_model=410,
+                 d_embed=410,
+                 d_head=41,
+                 d_inner=2100,
+                 div_val=1.0,
+                 pre_lnorm=False,
+                 n_layer=16,
+                 n_head=10,
+                 tgt_len=150,
+                 ext_len=0,
+                 mem_len=150,
+                 same_length=False,
+                 attn_type=0,
+                 clamp_len=-1,
+                 sample_softmax=-1,
+                 adaptive=True,
+                 tied=True,
+                 dropout=0.1,
+                 dropatt=0.0,
+                 init="normal",
+                 init_range=0.01,
+                 proj_init_std=0.01,
+                 init_std=0.02):
+        """Constructs TransfoXLConfig.
+
+        Args:
+            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `TransfoXLModel` or a configuration json file.
+            cutoffs: cutoffs for the adaptive softmax
+            d_model: Dimensionality of the model's hidden states.
+            d_embed: Dimensionality of the embeddings
+            d_head: Dimensionality of the model's heads.
+            div_val: divident value for adapative input and softmax
+            pre_lnorm: apply LayerNorm to the input instead of the output
+            d_inner: Inner dimension in FF
+            n_layer: Number of hidden layers in the Transformer encoder.
+            n_head: Number of attention heads for each attention layer in
+                the Transformer encoder.
+            tgt_len: number of tokens to predict
+            ext_len: length of the extended context
+            mem_len: length of the retained previous heads
+            same_length: use the same attn length for all tokens
+            attn_type: attention type. 0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al.
+            clamp_len: use the same pos embeddings after clamp_len
+            sample_softmax: number of samples in sampled softmax
+            adaptive: use adaptive softmax
+            tied: tie the word embedding and softmax weights
+            dropout: The dropout probability for all fully connected
+                layers in the embeddings, encoder, and pooler.
+            dropatt: The dropout ratio for the attention probabilities.
+            embd_pdrop: The dropout ratio for the embeddings.
+            init: parameter initializer to use
+            init_range: parameters initialized by U(-init_range, init_range).
+            proj_init_std: parameters initialized by N(0, init_std)
+            init_std: parameters initialized by N(0, init_std)
+        """
+        if isinstance(vocab_size_or_config_json_file, str):
+            with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
+                json_config = json.loads(reader.read())
+            for key, value in json_config.items():
+                self.__dict__[key] = value
+        elif isinstance(vocab_size_or_config_json_file, int):
+            self.vocab_size = vocab_size_or_config_json_file
+            self.cutoffs = []
+            self.cutoffs.extend(cutoffs)
+            self.tie_projs = [False] + [True] * len(self.cutoffs)
+            self.d_model = d_model
+            self.d_embed = d_embed
+            self.d_head = d_head
+            self.d_inner = d_inner
+            self.div_val = div_val
+            self.pre_lnorm = pre_lnorm
+            self.n_layer = n_layer
+            self.n_head = n_head
+            self.tgt_len = tgt_len
+            self.ext_len = ext_len
+            self.mem_len = mem_len
+            self.same_length = same_length
+            self.attn_type = attn_type
+            self.clamp_len = clamp_len
+            self.sample_softmax = sample_softmax
+            self.adaptive = adaptive
+            self.tied = tied
+            self.dropout = dropout
+            self.dropatt = dropatt
+            self.init = init
+            self.init_range = init_range
+            self.proj_init_std = proj_init_std
+            self.init_std = init_std
+        else:
+            raise ValueError("First argument must be either a vocabulary size (int)"
+                             "or the path to a pretrained model config file (str)")
+
+    @property
+    def total_num_embeddings(self):
+        """Total number of embedding rows: vocabulary plus optional extras.
+
+        `n_special` and `n_ctx` are not assigned when the config is built
+        from an integer vocabulary size, so they are read with a default of
+        0; a config that does define them contributes their values.
+        """
+        n_special = getattr(self, 'n_special', 0)
+        n_ctx = getattr(self, 'n_ctx', 0)
+        return self.vocab_size + n_special + n_ctx
+
+    @classmethod
+    def from_dict(cls, json_object):
+        """Constructs a `TransfoXLConfig` from a Python dictionary of parameters.
+
+        Args:
+            json_object: mapping of attribute name to value; every entry is
+                written straight into the new config's `__dict__`.
+
+        Returns:
+            An instance of `cls` (subclasses get an instance of themselves)
+            carrying the given attributes on top of whatever the constructor
+            sets for the placeholder vocabulary size.
+        """
+        # -1 is only a placeholder that selects the integer branch of the constructor.
+        config = cls(vocab_size_or_config_json_file=-1)
+        for key, value in json_object.items():
+            config.__dict__[key] = value
+        return config
+
+    @classmethod
+    def from_json_file(cls, json_file):
+        """Constructs a `TransfoXLConfig` from a json file of parameters.
+
+        Args:
+            json_file: path of a UTF-8 encoded JSON file.
+
+        Returns:
+            Whatever `cls.from_dict` builds from the parsed file content.
+        """
+        with open(json_file, "r", encoding='utf-8') as handle:
+            raw = handle.read()
+        parsed = json.loads(raw)
+        return cls.from_dict(parsed)
+
+    def __repr__(self):
+        """Use the JSON serialization of the config as its representation."""
+        serialized = self.to_json_string()
+        return str(serialized)
+
+    def to_dict(self):
+        """Return a deep copy of the instance attributes as a plain dictionary."""
+        return copy.deepcopy(self.__dict__)
+
+    def to_json_string(self):
+        """Return the config as indented, key-sorted JSON ending with a newline."""
+        as_dict = self.to_dict()
+        serialized = json.dumps(as_dict, indent=2, sort_keys=True)
+        return serialized + "\n"
+
+
+class PositionalEmbedding(nn.Module):
+    """Sinusoidal embedding of a sequence of positions.
+
+    For every position the sine features come first and the cosine features
+    second (the two halves are concatenated, not interleaved).
+    """
+
+    def __init__(self, demb):
+        """
+        Args:
+            demb: embedding width; one frequency is created for every second
+                index in `[0, demb)`.
+        """
+        super(PositionalEmbedding, self).__init__()
+
+        self.demb = demb
+
+        exponents = torch.arange(0.0, demb, 2.0) / demb
+        # A buffer, not a parameter: follows the module across devices but is not trained.
+        self.register_buffer('inv_freq', 1 / (10000 ** exponents))
+
+    def forward(self, pos_seq, bsz=None):
+        """
+        Args:
+            pos_seq: 1-D tensor of positions (it is the first operand of an outer product).
+            bsz: if given, the singleton batch axis is expanded to this size.
+
+        Returns:
+            Tensor of shape [len(pos_seq), 1 or bsz, 2 * len(inv_freq)].
+        """
+        angles = torch.ger(pos_seq, self.inv_freq)
+        features = torch.cat([angles.sin(), angles.cos()], dim=-1).unsqueeze(1)
+
+        if bsz is None:
+            return features
+        return features.expand(-1, bsz, -1)
+
+
+class PositionwiseFF(nn.Module):
+    """Two-layer feed-forward block with a residual connection and layer norm.
+
+    With `pre_lnorm` the input is normalized before the feed-forward network
+    and the raw input is added back; otherwise the sum of input and
+    feed-forward output is normalized.
+    """
+
+    def __init__(self, d_model, d_inner, dropout, pre_lnorm=False):
+        """
+        Args:
+            d_model: width of the input and output.
+            d_inner: width of the hidden layer.
+            dropout: dropout probability applied after each linear layer.
+            pre_lnorm: apply layer norm before (True) or after (False) the block.
+        """
+        super(PositionwiseFF, self).__init__()
+
+        self.d_model, self.d_inner, self.dropout = d_model, d_inner, dropout
+
+        stages = [
+            nn.Linear(d_model, d_inner),
+            nn.ReLU(inplace=True),
+            nn.Dropout(dropout),
+            nn.Linear(d_inner, d_model),
+            nn.Dropout(dropout),
+        ]
+        self.CoreNet = nn.Sequential(*stages)
+
+        self.layer_norm = nn.LayerNorm(d_model)
+
+        self.pre_lnorm = pre_lnorm
+
+    def forward(self, inp):
+        """Apply the block to `inp`; the result has the same shape as `inp`."""
+        if not self.pre_lnorm:
+            # post-norm: feed-forward, residual, then layer normalization
+            return self.layer_norm(inp + self.CoreNet(inp))
+
+        # pre-norm: layer normalization, feed-forward, then residual
+        return self.CoreNet(self.layer_norm(inp)) + inp
+
+class MultiHeadAttn(nn.Module):
+    """Multi-head attention without relative position terms.
+
+    Queries come from the current segment, keys and values from the optional
+    memory concatenated with the current segment. The attention output is
+    projected, dropped out and added back to the input.
+    """
+
+    def __init__(self, n_head, d_model, d_head, dropout, dropatt=0,
+                 pre_lnorm=False):
+        """
+        Args:
+            n_head: number of attention heads.
+            d_model: width of the input and output.
+            d_head: width of each head.
+            dropout: dropout probability on the projected attention output.
+            dropatt: dropout probability on the attention probabilities.
+            pre_lnorm: normalize the key/value input instead of the output.
+        """
+        super(MultiHeadAttn, self).__init__()
+
+        self.n_head, self.d_model, self.d_head = n_head, d_model, d_head
+        self.dropout = dropout
+
+        inner = n_head * d_head
+        self.q_net = nn.Linear(d_model, inner, bias=False)
+        self.kv_net = nn.Linear(d_model, 2 * inner, bias=False)
+
+        self.drop = nn.Dropout(dropout)
+        self.dropatt = nn.Dropout(dropatt)
+        self.o_net = nn.Linear(inner, d_model, bias=False)
+
+        self.layer_norm = nn.LayerNorm(d_model)
+
+        self.scale = 1 / (d_head ** 0.5)
+
+        self.pre_lnorm = pre_lnorm
+
+    def forward(self, h, attn_mask=None, mems=None):
+        """
+        Args:
+            h: current segment, [qlen x bsz x d_model].
+            attn_mask: optional mask whose nonzero entries are excluded from
+                attention. A 2-D mask is broadcast over the query and head
+                axes; a 3-D mask is broadcast over the head axis only.
+            mems: optional cached states prepended to `h` along dim 0 to form
+                the keys and values.
+
+        Returns:
+            Tensor shaped like `h`.
+        """
+        context = h if mems is None else torch.cat([mems, h], 0)
+
+        if self.pre_lnorm:
+            # only the key/value input is normalized; queries use `h` as given
+            context = self.layer_norm(context)
+
+        # [qlen x bsz x n_head x d_head]
+        queries = self.q_net(h).view(h.size(0), h.size(1), self.n_head, self.d_head)
+
+        # [klen x bsz x n_head x d_head] each
+        keys, values = torch.chunk(self.kv_net(context), 2, -1)
+        keys = keys.view(context.size(0), context.size(1), self.n_head, self.d_head)
+        values = values.view(context.size(0), context.size(1), self.n_head, self.d_head)
+
+        # [qlen x klen x bsz x n_head]
+        scores = torch.einsum('ibnd,jbnd->ijbn', (queries, keys))
+        scores.mul_(self.scale)
+        if attn_mask is not None and attn_mask.any().item():
+            mask_rank = attn_mask.dim()
+            if mask_rank == 2:
+                scores.masked_fill_(attn_mask[None,:,:,None], -float('inf'))
+            elif mask_rank == 3:
+                scores.masked_fill_(attn_mask[:,:,:,None], -float('inf'))
+
+        # softmax over the key axis, still [qlen x klen x bsz x n_head]
+        probs = self.dropatt(F.softmax(scores, dim=1))
+
+        # [qlen x klen x bsz x n_head] + [klen x bsz x n_head x d_head] -> [qlen x bsz x n_head x d_head]
+        mixed = torch.einsum('ijbn,jbnd->ibnd', (probs, values))
+        mixed = mixed.contiguous().view(
+            mixed.size(0), mixed.size(1), self.n_head * self.d_head)
+
+        # output projection followed by dropout
+        attn_out = self.drop(self.o_net(mixed))
+
+        residual = h + attn_out
+        if self.pre_lnorm:
+            return residual
+        # post-norm: normalize after the residual connection
+        return self.layer_norm(residual)
+
+class RelMultiHeadAttn(nn.Module):
+    """Shared base for the relative-position attention layers.
+
+    Owns the fused query/key/value projection, the output projection, the two
+    dropouts and the layer norm, plus the index-shifting helpers. `forward`
+    is left to subclasses.
+    """
+
+    def __init__(self, n_head, d_model, d_head, dropout, dropatt=0,
+                 tgt_len=None, ext_len=None, mem_len=None, pre_lnorm=False):
+        """
+        Args:
+            n_head: number of attention heads.
+            d_model: width of the input and output.
+            d_head: width of each head.
+            dropout: dropout probability on the projected attention output.
+            dropatt: dropout probability on the attention probabilities.
+            tgt_len, ext_len, mem_len: accepted but not used by this class.
+            pre_lnorm: stored for subclasses to choose pre- or post-norm.
+        """
+        super(RelMultiHeadAttn, self).__init__()
+
+        self.n_head = n_head
+        self.d_model = d_model
+        self.d_head = d_head
+        self.dropout = dropout
+
+        self.qkv_net = nn.Linear(d_model, 3 * n_head * d_head, bias=False)
+
+        self.drop = nn.Dropout(dropout)
+        self.dropatt = nn.Dropout(dropatt)
+        self.o_net = nn.Linear(n_head * d_head, d_model, bias=False)
+
+        self.layer_norm = nn.LayerNorm(d_model)
+
+        self.scale = 1 / (d_head ** 0.5)
+
+        self.pre_lnorm = pre_lnorm
+
+    def _parallelogram_mask(self, h, w, left=False):
+        """Build an (h, w) uint8 mask for `_shift`.
+
+        The top-left square keeps its upper triangle and the bottom-right
+        square its lower triangle; the result is flipped along dim 0 unless
+        `left` is set. The mask is created on the default device.
+        """
+        mask = torch.ones((h, w)).byte()
+        m = min(h, w)
+        mask[:m,:m] = torch.triu(mask[:m,:m])
+        mask[-m:,-m:] = torch.tril(mask[-m:,-m:])
+
+        if left:
+            return mask
+        else:
+            return mask.flip(0)
+
+    def _shift(self, x, qlen, klen, mask, left=False):
+        """Pad `x` with `qlen - 1` zeros along dim 1 and gather the entries
+        selected by `mask` into a [qlen x klen x ...] tensor.
+
+        With `left` the padding goes in front of `x` and the mask is flipped
+        along dim 1; otherwise the padding is appended.
+        """
+        if qlen > 1:
+            zero_pad = torch.zeros((x.size(0), qlen-1, x.size(2), x.size(3)),
+                                    device=x.device, dtype=x.dtype)
+        else:
+            # nothing to pad: an empty tensor is concatenated instead
+            zero_pad = torch.zeros(0, device=x.device, dtype=x.dtype)
+
+        if left:
+            mask = mask.flip(1)
+            x_padded = torch.cat([zero_pad, x], dim=1).expand(qlen, -1, -1, -1)
+        else:
+            x_padded = torch.cat([x, zero_pad], dim=1).expand(qlen, -1, -1, -1)
+
+        x = x_padded.masked_select(mask[:,:,None,None]) \
+                    .view(qlen, klen, x.size(2), x.size(3))
+
+        return x
+
+    def _rel_shift(self, x, zero_triu=False):
+        """Shift row `i` of `x` (dims 0 and 1) left by `x.size(0) - 1 - i`.
+
+        This is done by prepending one zero column, reinterpreting the
+        buffer with dims 0 and 1 swapped in size, and dropping the first row.
+
+        Args:
+            x: tensor with at least two dims; trailing dims are carried along.
+            zero_triu: if set, additionally zero every entry above diagonal
+                `x.size(1) - x.size(0)`.
+
+        Returns:
+            Tensor with the shape, dtype and device of `x`.
+        """
+        zero_pad = torch.zeros((x.size(0), 1, *x.size()[2:]),
+                               device=x.device, dtype=x.dtype)
+        x_padded = torch.cat([zero_pad, x], dim=1)
+
+        x_padded = x_padded.view(x.size(1) + 1, x.size(0), *x.size()[2:])
+
+        x = x_padded[1:].view_as(x)
+
+        if zero_triu:
+            # Build the mask next to `x`: a default-device float32 mask fails
+            # for CUDA inputs and promotes half-precision scores to float32.
+            ones = torch.ones((x.size(0), x.size(1)), device=x.device)
+            keep = torch.tril(ones, x.size(1) - x.size(0)).to(x.dtype)
+            x = x * keep[:,:,None,None]
+
+        return x
+
+    def forward(self, w, r, attn_mask=None, mems=None):
+        """Not implemented here; subclasses define the attention computation."""
+        raise NotImplementedError
+
+class RelPartialLearnableMultiHeadAttn(RelMultiHeadAttn):
+    """Relative attention whose positional keys are a learned projection of
+    externally supplied position encodings, with two query biases passed in
+    by the caller.
+    """
+
+    def __init__(self, *args, **kwargs):
+        """Takes the same arguments as `RelMultiHeadAttn`."""
+        super(RelPartialLearnableMultiHeadAttn, self).__init__(*args, **kwargs)
+
+        # projects the position encodings into per-head key space
+        self.r_net = nn.Linear(self.d_model, self.n_head * self.d_head, bias=False)
+
+    def forward(self, w, r, r_w_bias, r_r_bias, attn_mask=None, mems=None):
+        """
+        Args:
+            w: current segment, [qlen x bsz x d_model].
+            r: position encodings; after projection they are viewed as
+                [rlen x n_head x d_head].
+            r_w_bias: bias added to the queries for the content term.
+            r_r_bias: bias added to the queries for the position term.
+            attn_mask: optional mask whose nonzero entries are excluded from
+                attention; 2-D masks are broadcast over the query and head
+                axes, 3-D masks over the head axis only.
+            mems: optional cached states prepended to `w` along dim 0.
+
+        Returns:
+            Tensor shaped like `w`.
+        """
+        qlen, rlen, bsz = w.size(0), r.size(0), w.size(1)
+
+        segment = w if mems is None else torch.cat([mems, w], 0)
+        if self.pre_lnorm:
+            segment = self.layer_norm(segment)
+
+        q, k, v = torch.chunk(self.qkv_net(segment), 3, dim=-1)
+        if mems is not None:
+            # queries are only needed for the current segment, not the memory
+            q = q[-qlen:]
+        pos_k = self.r_net(r)
+
+        klen = k.size(0)
+
+        q = q.view(qlen, bsz, self.n_head, self.d_head)         # qlen x bsz x n_head x d_head
+        k = k.view(klen, bsz, self.n_head, self.d_head)         # klen x bsz x n_head x d_head
+        v = v.view(klen, bsz, self.n_head, self.d_head)         # klen x bsz x n_head x d_head
+
+        pos_k = pos_k.view(rlen, self.n_head, self.d_head)      # rlen x n_head x d_head
+
+        #### compute attention score
+        # content term: biased queries against the content keys
+        AC = torch.einsum('ibnd,jbnd->ijbn', (q + r_w_bias, k))       # qlen x klen x bsz x n_head
+
+        # position term: biased queries against the positional keys, then shifted
+        BD = torch.einsum('ibnd,jnd->ijbn', (q + r_r_bias, pos_k))    # qlen x rlen x bsz x n_head
+        BD = self._rel_shift(BD)
+
+        # [qlen x klen x bsz x n_head]
+        scores = AC + BD
+        scores.mul_(self.scale)
+
+        #### compute attention probability
+        if attn_mask is not None and attn_mask.any().item():
+            mask_rank = attn_mask.dim()
+            # the fill is done in float32 and cast back to the score dtype
+            if mask_rank == 2:
+                scores = scores.float().masked_fill(
+                    attn_mask[None,:,:,None], -float('inf')).type_as(scores)
+            elif mask_rank == 3:
+                scores = scores.float().masked_fill(
+                    attn_mask[:,:,:,None], -float('inf')).type_as(scores)
+
+        # softmax over the key axis, [qlen x klen x bsz x n_head]
+        probs = self.dropatt(F.softmax(scores, dim=1))
+
+        #### compute attention vector
+        mixed = torch.einsum('ijbn,jbnd->ibnd', (probs, v))
+
+        # [qlen x bsz x n_head x d_head] -> [qlen x bsz x n_head * d_head]
+        mixed = mixed.contiguous().view(
+            mixed.size(0), mixed.size(1), self.n_head * self.d_head)
+
+        ##### linear projection
+        attn_out = self.drop(self.o_net(mixed))
+
+        residual = w + attn_out
+        if self.pre_lnorm:
+            ##### residual connection only
+            return residual
+        ##### residual connection + layer normalization
+        return self.layer_norm(residual)
+
+class RelLearnableMultiHeadAttn(RelMultiHeadAttn):
+    """Relative attention whose positional embeddings and biases are supplied
+    directly by the caller instead of being projected inside the layer.
+    """
+
+    def __init__(self, *args, **kwargs):
+        """Takes the same arguments as `RelMultiHeadAttn`."""
+        super(RelLearnableMultiHeadAttn, self).__init__(*args, **kwargs)
+
+    def forward(self, w, r_emb, r_w_bias, r_bias, attn_mask=None, mems=None):
+        """
+        Args:
+            w: current segment, [qlen x bsz x d_model].
+            r_emb: [klen, n_head, d_head], used for term B.
+            r_w_bias: [n_head, d_head], used for term C.
+            r_bias: [klen, n_head], used for term D.
+            attn_mask: optional mask whose nonzero entries are excluded from
+                attention; 2-D masks are broadcast over the query and head
+                axes, 3-D masks over the head axis only.
+            mems: optional cached states prepended to `w` along dim 0.
+
+        Returns:
+            Tensor shaped like `w`.
+        """
+        qlen, bsz = w.size(0), w.size(1)
+
+        segment = w if mems is None else torch.cat([mems, w], 0)
+        if self.pre_lnorm:
+            segment = self.layer_norm(segment)
+
+        q, k, v = torch.chunk(self.qkv_net(segment), 3, dim=-1)
+        if mems is not None:
+            # queries are only needed for the current segment, not the memory
+            q = q[-qlen:]
+
+        klen = k.size(0)
+
+        q = q.view(qlen, bsz, self.n_head, self.d_head)
+        k = k.view(klen, bsz, self.n_head, self.d_head)
+        v = v.view(klen, bsz, self.n_head, self.d_head)
+
+        n_pos = r_emb.size(0)
+        if klen > n_pos:
+            # too few positions: repeat the first entry in front
+            emb_fill = r_emb[0:1].expand(klen - n_pos, -1, -1)
+            r_emb = torch.cat([emb_fill, r_emb], 0)
+            bias_fill = r_bias[0:1].expand(klen - r_bias.size(0), -1)
+            r_bias = torch.cat([bias_fill, r_bias], 0)
+        else:
+            # enough positions: keep the last klen of them
+            r_emb = r_emb[-klen:]
+            r_bias = r_bias[-klen:]
+
+        #### compute attention score
+        biased_q = q + r_w_bias[None]                                     # qlen x bsz x n_head x d_head
+
+        AC = torch.einsum('ibnd,jbnd->ijbn', (biased_q, k))               # qlen x klen x bsz x n_head
+        B_ = torch.einsum('ibnd,jnd->ijbn', (q, r_emb))                   # qlen x klen x bsz x n_head
+        D_ = r_bias[None, :, None]                                        # 1    x klen x 1   x n_head
+        BD = self._rel_shift(B_ + D_)
+
+        # [qlen x klen x bsz x n_head]
+        scores = AC + BD
+        scores.mul_(self.scale)
+
+        #### compute attention probability
+        if attn_mask is not None and attn_mask.any().item():
+            mask_rank = attn_mask.dim()
+            if mask_rank == 2:
+                scores.masked_fill_(attn_mask[None,:,:,None], -float('inf'))
+            elif mask_rank == 3:
+                scores.masked_fill_(attn_mask[:,:,:,None], -float('inf'))
+
+        # softmax over the key axis, [qlen x klen x bsz x n_head]
+        probs = self.dropatt(F.softmax(scores, dim=1))
+
+        #### compute attention vector
+        mixed = torch.einsum('ijbn,jbnd->ibnd', (probs, v))
+
+        # [qlen x bsz x n_head x d_head] -> [qlen x bsz x n_head * d_head]
+        mixed = mixed.contiguous().view(
+            mixed.size(0), mixed.size(1), self.n_head * self.d_head)
+
+        ##### linear projection
+        attn_out = self.drop(self.o_net(mixed))
+
+        residual = w + attn_out
+        if self.pre_lnorm:
+            ##### residual connection only
+            return residual
+        ##### residual connection + layer normalization
+        return self.layer_norm(residual)
+
+class DecoderLayer(nn.Module):
+    """Decoder layer: `MultiHeadAttn` followed by `PositionwiseFF`."""
+
+    def __init__(self, n_head, d_model, d_head, d_inner, dropout, **kwargs):
+        """Extra keyword arguments are forwarded to the attention module;
+        `pre_lnorm`, if present among them, is also given to the feed-forward block.
+        """
+        super(DecoderLayer, self).__init__()
+
+        attention = MultiHeadAttn(n_head, d_model, d_head, dropout, **kwargs)
+        feed_forward = PositionwiseFF(d_model, d_inner, dropout,
+                                      pre_lnorm=kwargs.get('pre_lnorm'))
+        self.dec_attn = attention
+        self.pos_ff = feed_forward
+
+    def forward(self, dec_inp, dec_attn_mask=None, mems=None):
+        """Run attention over `dec_inp` (and `mems`), then the feed-forward block."""
+        attended = self.dec_attn(dec_inp, attn_mask=dec_attn_mask, mems=mems)
+        return self.pos_ff(attended)
+
+class RelLearnableDecoderLayer(nn.Module):
+    """Decoder layer: `RelLearnableMultiHeadAttn` followed by `PositionwiseFF`."""
+
+    def __init__(self, n_head, d_model, d_head, d_inner, dropout,
+                 **kwargs):
+        """Extra keyword arguments are forwarded to the attention module;
+        `pre_lnorm`, if present among them, is also given to the feed-forward block.
+        """
+        super(RelLearnableDecoderLayer, self).__init__()
+
+        attention = RelLearnableMultiHeadAttn(n_head, d_model, d_head, dropout,
+                                              **kwargs)
+        feed_forward = PositionwiseFF(d_model, d_inner, dropout,
+                                      pre_lnorm=kwargs.get('pre_lnorm'))
+        self.dec_attn = attention
+        self.pos_ff = feed_forward
+
+    def forward(self, dec_inp, r_emb, r_w_bias, r_bias, dec_attn_mask=None, mems=None):
+        """Run relative attention with the given embeddings and biases, then
+        the feed-forward block.
+        """
+        attended = self.dec_attn(dec_inp, r_emb, r_w_bias, r_bias,
+                                 attn_mask=dec_attn_mask, mems=mems)
+        return self.pos_ff(attended)
+
+class RelPartialLearnableDecoderLayer(nn.Module):
+    """Decoder layer: `RelPartialLearnableMultiHeadAttn` followed by `PositionwiseFF`."""
+
+    def __init__(self, n_head, d_model, d_head, d_inner, dropout,
+                 **kwargs):
+        """Extra keyword arguments are forwarded to the attention module;
+        `pre_lnorm`, if present among them, is also given to the feed-forward block.
+        """
+        super(RelPartialLearnableDecoderLayer, self).__init__()
+
+        attention = RelPartialLearnableMultiHeadAttn(n_head, d_model,
+                                                     d_head, dropout, **kwargs)
+        feed_forward = PositionwiseFF(d_model, d_inner, dropout,
+                                      pre_lnorm=kwargs.get('pre_lnorm'))
+        self.dec_attn = attention
+        self.pos_ff = feed_forward
+
+    def forward(self, dec_inp, r, r_w_bias, r_r_bias, dec_attn_mask=None, mems=None):
+        """Run relative attention with the given position encodings and
+        biases, then the feed-forward block.
+        """
+        attended = self.dec_attn(dec_inp, r, r_w_bias, r_r_bias,
+                                 attn_mask=dec_attn_mask, mems=mems)
+        return self.pos_ff(attended)
+
+
+class AdaptiveEmbedding(nn.Module):
+    """Token embedding that can give each vocabulary cluster its own width.
+
+    With `div_val == 1` a single embedding table covers the whole vocabulary
+    (projected to `d_proj` only when the widths differ). Otherwise the
+    vocabulary is split at `cutoffs`, cluster `i` uses width
+    `d_embed // div_val ** i`, and every cluster is projected to `d_proj`.
+    The result is scaled by `sqrt(d_proj)`.
+    """
+
+    def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1,
+                 sample_softmax=False):
+        """
+        Args:
+            n_token: vocabulary size.
+            d_embed: embedding width of the first cluster.
+            d_proj: width of the returned embeddings.
+            cutoffs: list of cluster boundaries; `n_token` is appended to it.
+            div_val: width divisor applied per cluster.
+            sample_softmax: a positive value makes the single-table embedding sparse.
+        """
+        super(AdaptiveEmbedding, self).__init__()
+
+        self.n_token = n_token
+        self.d_embed = d_embed
+
+        self.cutoffs = cutoffs + [n_token]
+        self.div_val = div_val
+        self.d_proj = d_proj
+
+        self.emb_scale = d_proj ** 0.5
+
+        self.cutoff_ends = [0] + self.cutoffs
+
+        self.emb_layers = nn.ModuleList()
+        self.emb_projs = nn.ParameterList()
+        if div_val != 1:
+            # one table and one projection per cluster
+            bounds = zip(self.cutoff_ends[:-1], self.cutoff_ends[1:])
+            for cluster, (lo, hi) in enumerate(bounds):
+                width = d_embed // (div_val ** cluster)
+                self.emb_layers.append(nn.Embedding(hi - lo, width))
+                self.emb_projs.append(nn.Parameter(torch.Tensor(d_proj, width)))
+        else:
+            self.emb_layers.append(
+                nn.Embedding(n_token, d_embed, sparse=sample_softmax>0)
+            )
+            if d_proj != d_embed:
+                self.emb_projs.append(nn.Parameter(torch.Tensor(d_proj, d_embed)))
+
+    def forward(self, inp):
+        """Embed the token ids in `inp`; returns a tensor of shape
+        `inp.size() + (d_proj,)`.
+        """
+        if self.div_val == 1:
+            embed = self.emb_layers[0](inp)
+            if self.d_proj != self.d_embed:
+                embed = F.linear(embed, self.emb_projs[0])
+        else:
+            # any parameter serves as the reference for dtype and device
+            ref = next(self.parameters())
+            flat_ids = inp.view(-1)
+            flat_emb = torch.zeros([flat_ids.size(0), self.d_proj],
+                dtype=ref.dtype, device=ref.device)
+            for cluster, layer in enumerate(self.emb_layers):
+                lo, hi = self.cutoff_ends[cluster], self.cutoff_ends[cluster + 1]
+
+                in_cluster = (flat_ids >= lo) & (flat_ids < hi)
+                rows = in_cluster.nonzero().squeeze()
+
+                if rows.numel() == 0:
+                    continue
+
+                # ids are re-based so that each cluster table starts at 0
+                local_ids = flat_ids.index_select(0, rows) - lo
+                projected = F.linear(layer(local_ids), self.emb_projs[cluster])
+
+                flat_emb.index_copy_(0, rows, projected)
+
+            embed = flat_emb.view(*inp.size(), self.d_proj)
+
+        embed.mul_(self.emb_scale)
+
+        return embed
+
+class MemTransformerLM(nn.Module):
+    """Language model made of an adaptive embedding, a stack of decoder
+    layers that attend over cached hidden states ("mems"), and a loss head.
+
+    `attn_type` selects the layer class and the positional scheme:
+        0: RelPartialLearnableDecoderLayer with sinusoidal relative positions
+        1: RelLearnableDecoderLayer with learned relative embeddings and biases
+        2: DecoderLayer with sinusoidal positions added to the layer input
+        3: DecoderLayer with learned per-layer embeddings added to the layer input
+    """
+
+    def __init__(self, n_token, n_layer, n_head, d_model, d_head, d_inner,
+                 dropout, dropatt, tie_weight=True, d_embed=None, 
+                 div_val=1, tie_projs=[False], pre_lnorm=False,
+                 tgt_len=None, ext_len=None, mem_len=None, 
+                 cutoffs=[], adapt_inp=False,
+                 same_length=False, attn_type=0, clamp_len=-1, 
+                 sample_softmax=-1):
+        """
+        Args:
+            n_token: vocabulary size.
+            n_layer, n_head, d_model, d_head, d_inner: layer stack dimensions.
+            dropout: dropout probability for embeddings and layer outputs.
+            dropatt: dropout probability for attention probabilities.
+            tie_weight: share the output weights with the input embedding tables.
+            d_embed: embedding width; defaults to `d_model`.
+            div_val: per-cluster width divisor of the adaptive embedding/softmax.
+            tie_projs: per-cluster flags to share output and embedding projections.
+            pre_lnorm: use pre- instead of post-layer-normalization.
+            tgt_len, ext_len, mem_len: segment, extended-context and memory
+                lengths. They are summed here, so all three must be numbers.
+            cutoffs: cluster boundaries of the adaptive embedding/softmax.
+            adapt_inp: accepted but not used by this class.
+            same_length: give every query the same attention span.
+            attn_type: see the class docstring.
+            clamp_len: if positive, upper bound applied to positions.
+            sample_softmax: if positive, use a sampled softmax with this many samples.
+        """
+        super(MemTransformerLM, self).__init__()
+        self.n_token = n_token
+
+        d_embed = d_model if d_embed is None else d_embed
+        self.d_embed = d_embed
+        self.d_model = d_model
+        self.n_head = n_head
+        self.d_head = d_head
+
+        self.word_emb = AdaptiveEmbedding(n_token, d_embed, d_model, cutoffs, 
+                                          div_val=div_val)
+
+        self.drop = nn.Dropout(dropout)
+
+        self.n_layer = n_layer
+
+        self.tgt_len = tgt_len
+        self.mem_len = mem_len
+        self.ext_len = ext_len
+        # longest key sequence a layer can see; sizes the learned position tables
+        self.max_klen = tgt_len + ext_len + mem_len
+
+        self.attn_type = attn_type
+
+        self.layers = nn.ModuleList()
+        if attn_type == 0: # the default attention
+            for i in range(n_layer):
+                self.layers.append(
+                    RelPartialLearnableDecoderLayer(
+                        n_head, d_model, d_head, d_inner, dropout,
+                        tgt_len=tgt_len, ext_len=ext_len, mem_len=mem_len,
+                        dropatt=dropatt, pre_lnorm=pre_lnorm)
+                )
+        elif attn_type == 1: # learnable embeddings
+            for i in range(n_layer):
+                self.layers.append(
+                    RelLearnableDecoderLayer(
+                        n_head, d_model, d_head, d_inner, dropout,
+                        tgt_len=tgt_len, ext_len=ext_len, mem_len=mem_len,
+                        dropatt=dropatt, pre_lnorm=pre_lnorm)
+                )
+        elif attn_type in [2, 3]: # absolute embeddings
+            for i in range(n_layer):
+                self.layers.append(
+                    DecoderLayer(
+                        n_head, d_model, d_head, d_inner, dropout,
+                        dropatt=dropatt, pre_lnorm=pre_lnorm)
+                )
+
+        self.sample_softmax = sample_softmax
+        # use sampled softmax
+        if sample_softmax > 0:
+            self.out_layer = nn.Linear(d_model, n_token)
+            if tie_weight:
+                # NOTE(review): AdaptiveEmbedding keeps its tables in
+                # `emb_layers` and defines no `weight` attribute of its own,
+                # so this lookup looks like it would fail - confirm.
+                self.out_layer.weight = self.word_emb.weight
+            self.tie_weight = tie_weight
+            self.sampler = LogUniformSampler(n_token, sample_softmax)
+
+        # use adaptive softmax (including standard softmax)
+        else:
+            self.crit = ProjectedAdaptiveLogSoftmax(n_token, d_embed, d_model, 
+                                                    cutoffs, div_val=div_val)
+
+            if tie_weight:
+                for i in range(len(self.crit.out_layers)):
+                    self.crit.out_layers[i].weight = self.word_emb.emb_layers[i].weight
+
+            if tie_projs:
+                for i, tie_proj in enumerate(tie_projs):
+                    if tie_proj and div_val == 1 and d_model != d_embed:
+                        # a single shared projection exists when div_val == 1
+                        self.crit.out_projs[i] = self.word_emb.emb_projs[0]
+                    elif tie_proj and div_val != 1:
+                        self.crit.out_projs[i] = self.word_emb.emb_projs[i]
+
+        self.same_length = same_length
+        self.clamp_len = clamp_len
+
+        self._create_params()
+
+    def backward_compatible(self):
+        """Switch the model to the non-sampled loss path by resetting `sample_softmax`."""
+        self.sample_softmax = -1
+
+    def _create_params(self):
+        """Create the positional parameters required by `self.attn_type`.
+
+        The parameters are allocated uninitialized; initialization is left
+        to the caller.
+        """
+        if self.attn_type == 0: # default attention
+            self.pos_emb = PositionalEmbedding(self.d_model)
+            self.r_w_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head))
+            self.r_r_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head))
+        elif self.attn_type == 1: # learnable
+            self.r_emb = nn.Parameter(torch.Tensor(
+                    self.n_layer, self.max_klen, self.n_head, self.d_head))
+            self.r_w_bias = nn.Parameter(torch.Tensor(
+                    self.n_layer, self.n_head, self.d_head))
+            self.r_bias = nn.Parameter(torch.Tensor(
+                    self.n_layer, self.max_klen, self.n_head))
+        elif self.attn_type == 2: # absolute standard
+            self.pos_emb = PositionalEmbedding(self.d_model)
+        elif self.attn_type == 3: # absolute deeper SA
+            self.r_emb = nn.Parameter(torch.Tensor(
+                    self.n_layer, self.max_klen, self.n_head, self.d_head))
+
+    def reset_length(self, tgt_len, ext_len, mem_len):
+        """Change the segment, extended-context and memory lengths.
+
+        `max_klen` and the parameters sized from it are not updated.
+        """
+        self.tgt_len = tgt_len
+        self.mem_len = mem_len
+        self.ext_len = ext_len
+
+    def init_mems(self):
+        """Return one empty memory tensor per hidden state (`n_layer + 1`),
+        or None when `mem_len` is not positive.
+        """
+        if self.mem_len > 0:
+            mems = []
+            # any parameter serves as the reference for dtype and device
+            param = next(self.parameters())
+            for i in range(self.n_layer+1):
+                empty = torch.empty(0, dtype=param.dtype, device=param.device)
+                mems.append(empty)
+
+            return mems
+        else:
+            return None
+
+    def _update_mems(self, hids, mems, qlen, mlen):
+        """Build the memory for the next segment.
+
+        Args:
+            hids: hidden states of the current segment, one per memory slot.
+            mems: current memory tensors, or None.
+            qlen: length of the current segment.
+            mlen: length of the current memory.
+
+        Returns:
+            A list of detached tensors, or None if `mems` is None.
+        """
+        # does not deal with None
+        if mems is None: return None
+
+        # mems is not None
+        assert len(hids) == len(mems), 'len(hids) != len(mems)'
+
+        # There are `mlen + qlen` steps that can be cached into mems
+        # For the next step, the last `ext_len` of the `qlen` tokens
+        # will be used as the extended context. Hence, we only cache
+        # the tokens from `mlen + qlen - self.ext_len - self.mem_len`
+        # to `mlen + qlen - self.ext_len`.
+        with torch.no_grad():
+            new_mems = []
+            end_idx = mlen + max(0, qlen - 0 - self.ext_len)
+            beg_idx = max(0, end_idx - self.mem_len)
+            for i in range(len(hids)):
+
+                cat = torch.cat([mems[i], hids[i]], dim=0)
+                new_mems.append(cat[beg_idx:end_idx].detach())
+
+        return new_mems
+
+    def _forward(self, dec_inp, mems=None):
+        """Run the layer stack on one segment.
+
+        Args:
+            dec_inp: token ids, [qlen x bsz].
+            mems: list of memory tensors (one per hidden state) or None.
+
+        Returns:
+            (core_out, new_mems): the output of the last layer after dropout,
+            and the updated memory list (None if `mems` is None).
+        """
+        qlen, bsz = dec_inp.size()
+
+        word_emb = self.word_emb(dec_inp)
+
+        mlen = mems[0].size(0) if mems is not None else 0
+        klen = mlen + qlen
+        if self.same_length:
+            # besides future positions, also mask the oldest keys so that
+            # every query attends to the same number of positions
+            all_ones = word_emb.new_ones(qlen, klen)
+            mask_len = klen - self.mem_len
+            if mask_len > 0:
+                mask_shift_len = qlen - mask_len
+            else:
+                mask_shift_len = qlen
+            dec_attn_mask = (torch.triu(all_ones, 1+mlen)
+                    + torch.tril(all_ones, -mask_shift_len)).byte()[:, :, None] # -1
+        else:
+            # causal mask: query i may not see keys after position mlen + i
+            dec_attn_mask = torch.triu(
+                word_emb.new_ones(qlen, klen), diagonal=1+mlen).byte()[:,:,None]
+
+        hids = []
+        if self.attn_type == 0: # default
+            # positions count down from klen - 1 to 0
+            pos_seq = torch.arange(klen-1, -1, -1.0, device=word_emb.device, 
+                                   dtype=word_emb.dtype)
+            if self.clamp_len > 0:
+                pos_seq.clamp_(max=self.clamp_len)
+            pos_emb = self.pos_emb(pos_seq)
+
+            core_out = self.drop(word_emb)
+            pos_emb = self.drop(pos_emb)
+
+            hids.append(core_out)
+            for i, layer in enumerate(self.layers):
+                mems_i = None if mems is None else mems[i]
+                core_out = layer(core_out, pos_emb, self.r_w_bias,
+                        self.r_r_bias, dec_attn_mask=dec_attn_mask, mems=mems_i)
+                hids.append(core_out)
+        elif self.attn_type == 1: # learnable
+            core_out = self.drop(word_emb)
+            hids.append(core_out)
+            for i, layer in enumerate(self.layers):
+                if self.clamp_len > 0:
+                    r_emb = self.r_emb[i][-self.clamp_len :]
+                    r_bias = self.r_bias[i][-self.clamp_len :]
+                else:
+                    r_emb, r_bias = self.r_emb[i], self.r_bias[i]
+
+                mems_i = None if mems is None else mems[i]
+                core_out = layer(core_out, r_emb, self.r_w_bias[i],
+                        r_bias, dec_attn_mask=dec_attn_mask, mems=mems_i)
+                hids.append(core_out)
+        elif self.attn_type == 2: # absolute
+            pos_seq = torch.arange(klen - 1, -1, -1.0, device=word_emb.device,
+                                   dtype=word_emb.dtype)
+            if self.clamp_len > 0:
+                pos_seq.clamp_(max=self.clamp_len)
+            pos_emb = self.pos_emb(pos_seq)
+
+            core_out = self.drop(word_emb + pos_emb[-qlen:])
+
+            hids.append(core_out)
+            for i, layer in enumerate(self.layers):
+                mems_i = None if mems is None else mems[i]
+                if mems_i is not None and i == 0:
+                    # in-place add: this also changes the tensor held in `mems`
+                    mems_i += pos_emb[:mlen]
+                core_out = layer(core_out, dec_attn_mask=dec_attn_mask,
+                                 mems=mems_i)
+                hids.append(core_out)
+        elif self.attn_type == 3:
+            core_out = self.drop(word_emb)
+
+            hids.append(core_out)
+            for i, layer in enumerate(self.layers):
+                mems_i = None if mems is None else mems[i]
+                if mems_i is not None and mlen > 0:
+                    cur_emb = self.r_emb[i][:-qlen]
+                    cur_size = cur_emb.size(0)
+                    if cur_size < mlen:
+                        # too few positions: repeat the first entry in front
+                        cur_emb_pad = cur_emb[0:1].expand(mlen-cur_size, -1, -1)
+                        cur_emb = torch.cat([cur_emb_pad, cur_emb], 0)
+                    else:
+                        cur_emb = cur_emb[-mlen:]
+                    # in-place add: this also changes the tensor held in `mems`
+                    mems_i += cur_emb.view(mlen, 1, -1)
+                core_out += self.r_emb[i][-qlen:].view(qlen, 1, -1)
+
+                core_out = layer(core_out, dec_attn_mask=dec_attn_mask,
+                                 mems=mems_i)
+                hids.append(core_out)
+
+        core_out = self.drop(core_out)
+
+        # Pass by keyword: `_update_mems` takes (qlen, mlen) in that order, and
+        # swapping them caches the wrong window when mlen < ext_len.
+        new_mems = self._update_mems(hids, mems, qlen=qlen, mlen=mlen)
+
+        return core_out, new_mems
+
+    def forward(self, data, target, *mems):
+        """Compute the per-token loss of `target` given `data`.
+
+        Args:
+            data: input token ids, [len x bsz].
+            target: target token ids; its first dim sets how many of the last
+                positions are scored.
+            *mems: memory tensors from the previous call, if any.
+
+        Returns:
+            A list holding the loss followed by the new memory tensors (only
+            the loss when no memory is used).
+        """
+        # nn.DataParallel does not allow size(0) tensors to be broadcasted.
+        # So, have to initialize size(0) mems inside the model forward.
+        # Moreover, have to return new_mems to allow nn.DataParallel to piece
+        # them together.
+        if not mems: mems = self.init_mems()
+
+        tgt_len = target.size(0)
+        hidden, new_mems = self._forward(data, mems=mems)
+
+        pred_hid = hidden[-tgt_len:]
+        if self.sample_softmax > 0 and self.training:
+            assert self.tie_weight
+            logit = sample_logits(self.word_emb,
+                self.out_layer.bias, target, pred_hid, self.sampler)
+            loss = -F.log_softmax(logit, -1)[:, :, 0]
+        else:
+            loss = self.crit(pred_hid.view(-1, pred_hid.size(-1)), target.view(-1))
+            loss = loss.view(tgt_len, -1)
+
+        if new_mems is None:
+            return [loss]
+        else:
+            return [loss] + new_mems
+
+
+class TransfoXLPreTrainedModel(nn.Module):
+    """ An abstract class to handle weights initialization and
+        a simple interface for dowloading and loading pretrained models.
+    """
+    def __init__(self, config, *inputs, **kwargs):
+        """Store `config` on the model.
+
+        Raises:
+            ValueError: if `config` is not a `TransfoXLConfig` instance.
+        """
+        super(TransfoXLPreTrainedModel, self).__init__()
+        if isinstance(config, TransfoXLConfig):
+            self.config = config
+            return
+
+        cls_name = self.__class__.__name__
+        raise ValueError(
+            "Parameter config in `{}(config)` should be an instance of class `TransfoXLConfig`. "
+            "To create a model from a pretrained model use "
+            "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
+                cls_name, cls_name
+            ))
+
+    def init_weight(self, weight):
+        """Initialize `weight` in place as selected by `self.config.init`.
+
+        'uniform' draws from U(-init_range, init_range) and 'normal' from
+        N(0, init_std); any other value leaves `weight` untouched.
+        """
+        if self.config.init == 'uniform':
+            nn.init.uniform_(weight, -self.config.init_range, self.config.init_range)
+        elif self.config.init == 'normal':
+            nn.init.normal_(weight, 0.0, self.config.init_std)
+
+    @staticmethod
+    def init_bias(bias):
+        """Fill `bias` with zeros in place.
+
+        A static method, so it works both as `self.init_bias(b)` and as a
+        plain call on the class.
+        """
+        nn.init.constant_(bias, 0.0)
+
+    def init_weights(self, m):
+        """ Initialize the parameters of module `m`, chosen by its class name.
+
+        The class name is matched by substring, in this order: 'Linear',
+        'AdaptiveEmbedding', 'Embedding', 'ProjectedAdaptiveLogSoftmax',
+        'LayerNorm', 'TransformerLM'. Modules matching none are left alone.
+        """
+        name = m.__class__.__name__
+        proj_std = self.config.proj_init_std if 'Adaptive' in name else None
+
+        if 'Linear' in name:
+            if getattr(m, 'weight', None) is not None:
+                self.init_weight(m.weight)
+            if getattr(m, 'bias', None) is not None:
+                self.init_bias(m.bias)
+        elif 'AdaptiveEmbedding' in name:
+            # only the projections; the embedding tables are visited as their own modules
+            if hasattr(m, 'emb_projs'):
+                for proj in m.emb_projs:
+                    if proj is not None:
+                        nn.init.normal_(proj, 0.0, proj_std)
+        elif 'Embedding' in name:
+            if hasattr(m, 'weight'):
+                self.init_weight(m.weight)
+        elif 'ProjectedAdaptiveLogSoftmax' in name:
+            if getattr(m, 'cluster_weight', None) is not None:
+                self.init_weight(m.cluster_weight)
+            if getattr(m, 'cluster_bias', None) is not None:
+                self.init_bias(m.cluster_bias)
+            if hasattr(m, 'out_projs'):
+                for proj in m.out_projs:
+                    if proj is not None:
+                        nn.init.normal_(proj, 0.0, proj_std)
+        elif 'LayerNorm' in name:
+            # gains are drawn around 1, biases are zeroed
+            if hasattr(m, 'weight'):
+                nn.init.normal_(m.weight, 1.0, self.config.init_std)
+            if getattr(m, 'bias', None) is not None:
+                self.init_bias(m.bias)
+        elif 'TransformerLM' in name:
+            for attr in ('r_emb', 'r_w_bias', 'r_r_bias'):
+                if hasattr(m, attr):
+                    self.init_weight(getattr(m, attr))
+            if hasattr(m, 'r_bias'):
+                self.init_bias(m.r_bias)
+
+    def set_num_special_tokens(self, num_special_tokens):
+        pass
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name, num_special_tokens=0, state_dict=None, cache_dir=None,
+                        *inputs, **kwargs):
+        """
+        Instantiate a TransfoXLPreTrainedModel from a pre-trained model file or a pytorch state dict.
+        Download and cache the pre-trained model file if needed.
+
+        Params:
+            pretrained_model_name: either:
+                - a str with the name of a pre-trained model to load selected in the list of:
+                    . `transfo-xl`
+                - a path or url to a pretrained model archive containing:
+                    . `transfo_xl_config.json` a configuration file for the model
+                    . `pytorch_model.bin` a PyTorch dump of a TransfoXLModel instance
+            cache_dir: an optional path to a folder in which the pre-trained models will be cached.
+            state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of pre-trained models
+            *inputs, **kwargs: additional input for the specific Bert class
+                (ex: num_labels for BertForSequenceClassification)
+        """
+        if pretrained_model_name in PRETRAINED_MODEL_ARCHIVE_MAP:
+            archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name]
+        else:
+            archive_file = pretrained_model_name
+        # redirect to the cache, if necessary
+        try:
+            resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
+        except FileNotFoundError:
+            logger.error(
+                "Model name '{}' was not found in model name list ({}). "
+                "We assumed '{}' was a path or url but couldn't find any file "
+                "associated to this path or url.".format(
+                    pretrained_model_name,
+                    ', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()),
+                    archive_file))
+            return None
+        if resolved_archive_file == archive_file:
+            logger.info("loading archive file {}".format(archive_file))
+        else:
+            logger.info("loading archive file {} from cache at {}".format(
+                archive_file, resolved_archive_file))
+        tempdir = None
+        if os.path.isdir(resolved_archive_file):
+            serialization_dir = resolved_archive_file
+        else:
+            # Extract archive to temp dir
+            tempdir = tempfile.mkdtemp()
+            logger.info("extracting archive file {} to temp dir {}".format(
+                resolved_archive_file, tempdir))
+            with tarfile.open(resolved_archive_file, 'r:gz') as archive:
+                archive.extractall(tempdir)
+            serialization_dir = tempdir
+        # Load config
+        config_file = os.path.join(serialization_dir, CONFIG_NAME)
+        config = TransfoXLConfig.from_json_file(config_file)
+        logger.info("Model config {}".format(config))
+        # Instantiate model.
+        model = cls(config, *inputs, **kwargs)
+        if state_dict is None:
+            weights_path = os.path.join(serialization_dir, WEIGHTS_NAME)
+            state_dict = torch.load(weights_path)
+
+        old_keys = []
+        new_keys = []
+        for key in state_dict.keys():
+            new_key = None
+            if 'gamma' in key:
+                new_key = key.replace('gamma', 'weight')
+            if 'beta' in key:
+                new_key = key.replace('beta', 'bias')
+            if new_key:
+                old_keys.append(key)
+                new_keys.append(new_key)
+        for old_key, new_key in zip(old_keys, new_keys):
+            state_dict[new_key] = state_dict.pop(old_key)
+
+        missing_keys = []
+        unexpected_keys = []
+        error_msgs = []
+        # copy state_dict so _load_from_state_dict can modify it
+        metadata = getattr(state_dict, '_metadata', None)
+        state_dict = state_dict.copy()
+        if metadata is not None:
+            state_dict._metadata = metadata
+
+        def load(module, prefix=''):
+            local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
+            module._load_from_state_dict(
+                state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)
+            for name, child in module._modules.items():
+                if child is not None:
+                    load(child, prefix + name + '.')
+        load(model.transformer if hasattr(model, 'transformer') else model, prefix='')
+        if len(missing_keys) > 0:
+            logger.info("Weights of {} not initialized from pretrained model: {}".format(
+                model.__class__.__name__, missing_keys))
+        if len(unexpected_keys) > 0:
+            logger.info("Weights from pretrained model not used in {}: {}".format(
+                model.__class__.__name__, unexpected_keys))
+        if len(error_msgs) > 0:
+            raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
+                               model.__class__.__name__, "\n\t".join(error_msgs)))
+        # Add additional embeddings for special tokens if needed
+        if num_special_tokens != config.n_special:
+            model.set_num_special_tokens(num_special_tokens)
+        if tempdir:
+            # Clean up temp dir
+            shutil.rmtree(tempdir)
+        return model
+
+
+
+
+
+
+###################
+
+
+
+
+class TransfoXLLMHead(nn.Module):
+    """ Language modeling head: projects hidden states onto the embedding
+        matrix, which it shares with the input embeddings.
+    """
+
+    def __init__(self, model_embeddings_weights, config):
+        super(TransfoXLLMHead, self).__init__()
+        self.n_embd = config.n_embd
+        self.set_embeddings_weights(model_embeddings_weights)
+
+    def set_embeddings_weights(self, model_embeddings_weights):
+        """ (Re)build the decoder so that it reuses `model_embeddings_weights`. """
+        in_features = model_embeddings_weights.shape[1]
+        out_features = model_embeddings_weights.shape[0]
+        self.decoder = nn.Linear(in_features, out_features, bias=False)
+        # Weight tying: the decoder holds the very same parameter object.
+        self.decoder.weight = model_embeddings_weights
+
+    def forward(self, hidden_state):
+        """ Return the language modeling logits computed from `hidden_state`. """
+        return self.decoder(hidden_state)
+
+
+class TransfoXLMultipleChoiceHead(nn.Module):
+    """ Classifier Head for the transformer """
+
+    def __init__(self, config):
+        super(TransfoXLMultipleChoiceHead, self).__init__()
+        self.n_embd = config.n_embd
+        # NOTE(review): this layer is constructed but `forward` does not apply it.
+        self.dropout = nn.Dropout2d(config.resid_pdrop)  # To reproduce the noise_shape parameter of TF implementation
+        self.linear = nn.Linear(config.n_embd, 1)
+
+        nn.init.normal_(self.linear.weight, std=0.02)
+        # Zero the bias. `nn.init.normal_(bias, 0)` would only set the mean and
+        # sample the bias from N(0, 1).
+        nn.init.constant_(self.linear.bias, 0)
+
+    def forward(self, hidden_states, multiple_choice_token_mask):
+        """ Compute one classification logit per sequence.
+
+        Params:
+            hidden_states: tensor whose last two dimensions are (sequence, hidden)
+            multiple_choice_token_mask: tensor with the shape of `hidden_states`
+                minus its last dimension; non-zero entries select the positions
+                whose hidden states are summed and fed to the classifier
+
+        Returns:
+            The logits, with the shape of `hidden_states` minus its last two dimensions.
+        """
+        # Cast the mask so that an integer mask can be multiplied with floating point hidden states.
+        mask = multiple_choice_token_mask.unsqueeze(-1).to(hidden_states.dtype)
+        # Zero out the non-selected positions, then collapse the sequence dimension.
+        multiple_choice_h = (hidden_states * mask).sum(dim=-2)
+        # Dropout is not applied here. Replicating the noise_shape argument of the
+        # tensorflow implementation requires a double transposition around Dropout2d, see
+        # https://github.com/huggingface/pytorch-openai-transformer-lm/issues/11
+        multiple_choice_logits = self.linear(multiple_choice_h).squeeze(-1)
+        return multiple_choice_logits
+
+
+class TransfoXLModel(TransfoXLPreTrainedModel):
+    """OpenAI GPT model ("Improving Language Understanding by Generative Pre-Training").
+
+    NOTE(review): this description refers to the OpenAI GPT model although the class is named
+    TransfoXLModel; confirm that it matches the intended Transformer-XL behaviour.
+
+    The main implementation difference between BERT and the OpenAI is the use, in OpenAI GPT, of a single embedding matrix
+    to store the word, special ([SEP], [CLS]...) and position embeddings.
+    The embeddings are ordered as follows in the word embeddings matrix:
+        [0,                                                         ----------------------
+         ...                                                        -> word embeddings
+         config.vocab_size - 1,                                     ______________________
+         config.vocab_size,
+         ...                                                        -> special embeddings
+         config.vocab_size + config.n_special - 1,                  ______________________
+         config.vocab_size + config.n_special,
+         ...                                                        -> position embeddings
+         total_num_embeddings - 1]                                  ______________________
+
+    where total_num_embeddings can be obtained as config.total_num_embeddings and is:
+        total_num_embeddings = config.vocab_size + config.n_special + config.n_ctx
+    You should use the associated indices to index the embeddings.
+
+    The special embeddings ([SEP], [CLS]...) are not pre-trained and need to be trained during the fine-tuning if you use them.
+    The number of special embeddings can be controlled using the `set_num_special_tokens(num_special_tokens)` function.
+
+    Params:
+        config: a TransfoXLConfig class instance with the configuration to build a new model
+
+    Inputs:
+        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
+            where d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, config.vocab_size[
+        `position_ids`: an optional torch.LongTensor with the same shape as input_ids
+            with the position indices (selected in the range [config.vocab_size + config.n_special, config.vocab_size + config.n_special + config.n_ctx - 1[.
+        `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
+            You can use it to add a third embedding (the previous two being the word and position embeddings)
+            to each token in the sentence.
+
+    Outputs:
+        `hidden_states`: the encoded-hidden-states at the top of the model
+            as a torch.FloatTensor of size [batch_size, sequence_length, hidden_size]
+            (or more generally [d_1, ..., d_n, hidden_size] where d_1 ... d_n are the dimensions of input_ids)
+
+    Example usage:
+    ```python
+    # Already been converted into BPE token ids
+    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+
+    config = modeling_transfo_xl.TransfoXLConfig()
+
+    model = modeling_transfo_xl.TransfoXLModel(config)
+    hidden_states = model(input_ids)
+    ```
+    """
+    def __init__(self, config):
+        super(TransfoXLModel, self).__init__(config)
+        # One embedding table holds the word, special and position embeddings.
+        total_embeddings_size = config.vocab_size + config.n_special + config.n_ctx
+        self.embed = nn.Embedding(total_embeddings_size, config.n_embd)
+        # NOTE(review): `self.drop` is created here but `forward` does not apply it.
+        self.drop = nn.Dropout(config.embd_pdrop)
+        # Build one block and deep-copy it so each layer owns its own parameters.
+        block = Block(config.n_ctx, config, scale=True)
+        self.h = nn.ModuleList([copy.deepcopy(block) for _ in range(config.n_layer)])
+
+        self.apply(self.init_weights)
+        # nn.init.normal_(self.embed.weight, std=0.02)
+
+    def set_num_special_tokens(self, num_special_tokens):
+        " Update input embeddings with new embedding matrix "
+        # Update config
+        self.config.n_special = num_special_tokens
+        # # Build new embeddings and initialize
+        old_embed = self.embed
+        self.embed = nn.Embedding(self.config.total_num_embeddings, self.config.n_embd)
+        # Initialize all new embeddings (in particular the special tokens)
+        self.init_weights(self.embed)
+        # Copy word and positional embeddings from the previous weights
+        # (the first `vocab_size` rows and the last `n_ctx` rows respectively).
+        self.embed.weight.data[:self.config.vocab_size, :] = old_embed.weight.data[:self.config.vocab_size, :]
+        self.embed.weight.data[-self.config.n_ctx:, :] = old_embed.weight.data[-self.config.n_ctx:, :]
+
+    def forward(self, input_ids, position_ids=None, token_type_ids=None):
+        if position_ids is None:
+            # Default positions: consecutive indices starting right after the
+            # word and special embeddings, one per element of the last dimension.
+            start = self.config.vocab_size + self.config.n_special
+            end = start + input_ids.size(-1)
+            position_ids = torch.arange(start, end, dtype=torch.long, device=input_ids.device)
+            position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
+
+        # Flatten all leading dimensions; the original shape is restored on return.
+        input_shape = input_ids.size()
+        input_ids = input_ids.view(-1, input_ids.size(-1))
+        position_ids = position_ids.view(-1, position_ids.size(-1))
+
+        # Words, positions and token types are all looked up in the same table.
+        inputs_embeds = self.embed(input_ids)
+        position_embeds = self.embed(position_ids)
+        if token_type_ids is not None:
+            token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1))
+            token_type_embeds = self.embed(token_type_ids)
+        else:
+            # Adding 0 leaves the sum below unchanged.
+            token_type_embeds = 0
+        # Add the position information to the input embeddings
+        # h = e.sum(dim=2)
+        hidden_states = inputs_embeds + position_embeds + token_type_embeds
+        for block in self.h:
+            hidden_states = block(hidden_states)
+        return hidden_states.view(*input_shape, hidden_states.size(-1))
+
+class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
+    """OpenAI GPT model with a Language Modeling head ("Improving Language Understanding by Generative Pre-Training").
+
+    NOTE(review): this description refers to the OpenAI GPT model although the class is named
+    TransfoXLLMHeadModel; confirm that it matches the intended Transformer-XL behaviour.
+
+    There are two main implementation differences between BERT and the OpenAI GPT:
+        - the use of an LM loss in OpenAI GPT which means the Transformer is trained to predict the NEXT token for each input token
+            vs. predict the SAME token for BERT (i.e. you need to shift your labels to the right)
+        - the use, in OpenAI GPT, of a single embedding matrix to store the word, special ([SEP], [CLS]...) and position embeddings.
+    The embeddings are ordered as follows in the word embeddings matrix:
+        [0,                                                         ----------------------
+         ...                                                        -> word embeddings
+         config.vocab_size - 1,                                     ______________________
+         config.vocab_size,
+         ...                                                        -> special embeddings
+         config.vocab_size + config.n_special - 1,                  ______________________
+         config.vocab_size + config.n_special,
+         ...                                                        -> position embeddings
+         total_num_embeddings - 1]                                  ______________________
+
+    where total_num_embeddings can be obtained as config.total_num_embeddings and is:
+        total_num_embeddings = config.vocab_size + config.n_special + config.n_ctx
+    You should use these indices to index the word, special and position embeddings.
+
+    The special embeddings ([SEP], [CLS]...) are not pre-trained and need to be trained during the fine-tuning if you use them.
+    The number of special embeddings can be controlled using the `set_num_special_tokens(num_special_tokens)` function.
+
+    Params:
+        config: a TransfoXLConfig class instance with the configuration to build a new model
+
+    Inputs:
+        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
+            where d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, config.vocab_size[
+        `position_ids`: an optional torch.LongTensor with the same shape as input_ids
+            with the position indices (selected in the range [config.vocab_size + config.n_special, config.vocab_size + config.n_special + config.n_ctx - 1[.
+        `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
+            You can use it to add a third embedding (the previous two being the word and position embeddings)
+            to each token in the sentence.
+        `lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
+            with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
+            is only computed for the labels set in [0, ..., vocab_size]
+
+    Outputs:
+        if `lm_labels` is not `None`:
+            Outputs the language modeling loss.
+        else:
+            `lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, sequence_length, total_num_embeddings]
+                (or more generally [d_1, ..., d_n, total_num_embeddings] where d_1 ... d_n are the dimensions of input_ids)
+
+    Example usage:
+    ```python
+    # Already been converted into BPE token ids
+    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+
+    config = modeling_transfo_xl.TransfoXLConfig()
+
+    model = modeling_transfo_xl.TransfoXLLMHeadModel(config)
+    lm_logits = model(input_ids)
+    ```
+    """
+    def __init__(self, config):
+        super(TransfoXLLMHeadModel, self).__init__(config)
+        self.transformer = TransfoXLModel(config)
+        # The head receives the transformer's embedding weight so both share one matrix.
+        self.lm_head = TransfoXLLMHead(self.transformer.embed.weight, config)
+        self.apply(self.init_weights)
+
+    def set_num_special_tokens(self, num_special_tokens):
+        " Update input and output embeddings with new embedding matrix "
+        self.transformer.set_num_special_tokens(num_special_tokens)
+        # The transformer now owns a new embedding module: hand its weight to the head again.
+        self.lm_head.set_embeddings_weights(self.transformer.embed.weight)
+
+    def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None):
+        hidden_states = self.transformer(input_ids, position_ids, token_type_ids)
+        lm_logits = self.lm_head(hidden_states)
+        if lm_labels is not None:
+            # Labels equal to -1 are excluded from the loss.
+            loss_fct = CrossEntropyLoss(ignore_index=-1)
+            # Flatten logits to [N, num_classes] and labels to [N] for the loss.
+            loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1))
+            return loss
+        return lm_logits
+
+class TransfoXLDoubleHeadsModel(TransfoXLPreTrainedModel):
+    """OpenAI GPT model with a Language Modeling and a Multiple Choice heads ("Improving Language Understanding by Generative Pre-Training").
+
+    NOTE(review): this description refers to the OpenAI GPT model although the class is named
+    TransfoXLDoubleHeadsModel; confirm that it matches the intended Transformer-XL behaviour.
+
+    There are two main implementation differences between BERT and the OpenAI GPT:
+        - the use of an LM loss in OpenAI GPT which means the Transformer is trained to predict the NEXT token for each input token
+            vs. predict the SAME token for BERT (i.e. you need to shift your labels to the right)
+        - the use, in OpenAI GPT, of a single embedding matrix to store the word, special ([SEP], [CLS]...) and position embeddings.
+    The embeddings are ordered as follows in the word embeddings matrix:
+        [0,                                                         ----------------------
+         ...                                                        -> word embeddings
+         config.vocab_size - 1,                                     ______________________
+         config.vocab_size,
+         ...                                                        -> special embeddings
+         config.vocab_size + config.n_special - 1,                  ______________________
+         config.vocab_size + config.n_special,
+         ...                                                        -> position embeddings
+         total_num_embeddings - 1]                                  ______________________
+
+    where total_num_embeddings can be obtained as config.total_num_embeddings and is:
+        total_num_embeddings = config.vocab_size + config.n_special + config.n_ctx
+    You should use these indices to index the word, special and position embeddings.
+
+    The special embeddings ([SEP], [CLS]...) are not pre-trained and need to be trained during the fine-tuning if you use them.
+    The number of special embeddings can be controlled using the `set_num_special_tokens(num_special_tokens)` function.
+
+    Params:
+        config: a TransfoXLConfig class instance with the configuration to build a new model
+
+    Inputs:
+        `input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length]
+            with the word BPE token indices selected in the range [0, config.vocab_size[
+        `multiple_choice_token_mask`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length]
+            with a value of 1 where the last hidden state is (usually the [CLS] token) and 0 otherwise.
+        `position_ids`: an optional torch.LongTensor with the same shape as input_ids
+            with the position indices (selected in the range [config.vocab_size + config.n_special,
+            config.vocab_size + config.n_special + config.n_ctx - 1[.
+        `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
+            You can use it to add a third embedding (the previous two being the word and position embeddings)
+            to each token in the sentence.
+        `lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, num_choices, sequence_length]
+            with indices selected in [-1, 0, ..., total_num_embeddings]. All labels set to -1 are ignored (masked), the loss
+            is only computed for the labels set in [0, ..., total_num_embeddings]
+        `multiple_choice_labels`: optional multiple choice labels: torch.LongTensor of shape [batch_size]
+            with indices selected in [0, ..., num_choices].
+
+    Outputs:
+        if `lm_labels` and/or `multiple_choice_labels` is not `None`:
+            Outputs a list holding the loss for each set of labels that was given
+            (the language modeling loss first, then the multiple choice loss).
+        else: a tuple with
+            `lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, num_choices, sequence_length, total_num_embeddings]
+            `multiple_choice_logits`: the multiple choice logits as a torch.FloatTensor of size [batch_size, num_choices]
+
+    Example usage:
+    ```python
+    # Already been converted into BPE token ids
+    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+    multiple_choice_token_mask = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+
+    config = modeling_transfo_xl.TransfoXLConfig()
+
+    model = modeling_transfo_xl.TransfoXLDoubleHeadsModel(config)
+    lm_logits, multiple_choice_logits = model(input_ids, multiple_choice_token_mask)
+    ```
+    """
+    def __init__(self, config):
+        super(TransfoXLDoubleHeadsModel, self).__init__(config)
+        self.transformer = TransfoXLModel(config)
+        # The LM head receives the transformer's embedding weight so both share one matrix.
+        self.lm_head = TransfoXLLMHead(self.transformer.embed.weight, config)
+        self.multiple_choice_head = TransfoXLMultipleChoiceHead(config)
+        self.apply(self.init_weights)
+
+    def set_num_special_tokens(self, num_special_tokens):
+        " Update input and output embeddings with new embedding matrix "
+        self.transformer.set_num_special_tokens(num_special_tokens)
+        # The transformer now owns a new embedding module: hand its weight to the head again.
+        self.lm_head.set_embeddings_weights(self.transformer.embed.weight)
+
+    def forward(self, input_ids, multiple_choice_token_mask, position_ids=None, token_type_ids=None,
+                lm_labels=None, multiple_choice_labels=None):
+        hidden_states = self.transformer(input_ids, position_ids, token_type_ids)
+        lm_logits = self.lm_head(hidden_states)
+        multiple_choice_logits = self.multiple_choice_head(hidden_states, multiple_choice_token_mask)
+        # Collect a loss for each kind of label that was supplied.
+        losses = []
+        if lm_labels is not None:
+            # Labels equal to -1 are excluded from the language modeling loss.
+            loss_fct = CrossEntropyLoss(ignore_index=-1)
+            losses.append(loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1)))
+        if multiple_choice_labels is not None:
+            loss_fct = CrossEntropyLoss()
+            losses.append(loss_fct(multiple_choice_logits, multiple_choice_labels.view(-1)))
+        if losses:
+            return losses
+        # No labels at all: return the raw logits of both heads.
+        return lm_logits, multiple_choice_logits
diff --git a/pytorch_pretrained_bert/modeling_transfo_xl_utilities.py b/pytorch_pretrained_bert/modeling_transfo_xl_utilities.py
new file mode 100644
index 0000000000..a9ead38faf
--- /dev/null
+++ b/pytorch_pretrained_bert/modeling_transfo_xl_utilities.py
@@ -0,0 +1,314 @@
+# coding=utf-8
+# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Utilities for PyTorch Transformer XL model.
+    Directly adapted from https://github.com/kimiyoung/transformer-xl.
+"""
+
+from collections import defaultdict
+
+import numpy as np
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+# CUDA_MAJOR = int(torch.version.cuda.split('.')[0])
+# CUDA_MINOR = int(torch.version.cuda.split('.')[1])
+
+class ProjectedAdaptiveLogSoftmax(nn.Module):
+    """ Adaptive log-softmax with optional per-cluster projections.
+
+    The vocabulary is split at `cutoffs` into a shortlist (the first range) and
+    `n_clusters` tail clusters. The head scores the shortlist tokens plus one
+    entry per tail cluster; tokens of a tail cluster are scored by that
+    cluster's own output layer. `forward` returns the negative log-likelihood
+    of each target.
+    """
+    def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1,
+                 keep_order=False):
+        super(ProjectedAdaptiveLogSoftmax, self).__init__()
+
+        self.n_token = n_token
+        self.d_embed = d_embed
+        self.d_proj = d_proj
+
+        # `cutoffs` gets n_token appended as the upper bound of the last range;
+        # `cutoff_ends` additionally starts at 0, so consecutive pairs delimit each range.
+        self.cutoffs = cutoffs + [n_token]
+        self.cutoff_ends = [0] + self.cutoffs
+        self.div_val = div_val
+
+        self.shortlist_size = self.cutoffs[0]
+        self.n_clusters = len(self.cutoffs) - 1
+        self.head_size = self.shortlist_size + self.n_clusters
+
+        # The cluster parameters only exist when there is at least one tail cluster.
+        if self.n_clusters > 0:
+            self.cluster_weight = nn.Parameter(torch.zeros(self.n_clusters, self.d_embed))
+            self.cluster_bias = nn.Parameter(torch.zeros(self.n_clusters))
+
+        self.out_layers = nn.ModuleList()
+        self.out_projs = nn.ParameterList()
+
+        if div_val == 1:
+            # One output layer covers the whole vocabulary (it is sliced per range in
+            # `forward`); one projection entry is stored per range, None when no
+            # projection is needed.
+            for i in range(len(self.cutoffs)):
+                if d_proj != d_embed:
+                    # torch.Tensor(...) allocates without initializing: the values
+                    # must be set elsewhere before use.
+                    self.out_projs.append(
+                        nn.Parameter(torch.Tensor(d_proj, d_embed))
+                    )
+                else:
+                    self.out_projs.append(None)
+
+            self.out_layers.append(nn.Linear(d_embed, n_token))
+        else:
+            # Each range gets its own projection and output layer, with an
+            # embedding size divided by div_val for every further range.
+            for i in range(len(self.cutoffs)):
+                l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i+1]
+                d_emb_i = d_embed // (div_val ** i)
+
+                self.out_projs.append(
+                    nn.Parameter(torch.Tensor(d_proj, d_emb_i))
+                )
+
+                self.out_layers.append(nn.Linear(d_emb_i, r_idx-l_idx))
+
+        self.keep_order = keep_order
+
+    def _compute_logit(self, hidden, weight, bias, proj):
+        # Apply the linear layer (weight, bias) to `hidden`, first mapping
+        # `hidden` through `proj` when a projection is given.
+        if proj is None:
+            logit = F.linear(hidden, weight, bias=bias)
+        else:
+            # if CUDA_MAJOR <= 9 and CUDA_MINOR <= 1:
+            proj_hid = F.linear(hidden, proj.t().contiguous())
+            logit = F.linear(proj_hid, weight, bias=bias)
+            # else:
+            #     logit = torch.einsum('bd,de,ev->bv', (hidden, proj, weight.t()))
+            #     if bias is not None:
+            #         logit = logit + bias
+
+        return logit
+
+    def forward(self, hidden, target, keep_order=False):
+        '''
+            hidden :: [len*bsz x d_proj]
+            target :: [len*bsz]
+
+            Returns the negative log-likelihood of each target as a tensor
+            shaped like `target`. Unless `keep_order` (argument or attribute)
+            is set, the values are stored grouped by vocabulary range, in
+            range order, rather than in the order of `target`.
+        '''
+
+        if hidden.size(0) != target.size(0):
+            raise RuntimeError('Input and target should have the same size '
+                               'in the batch dimension.')
+
+        if self.n_clusters == 0:
+            # No tail cluster: plain log-softmax over the full vocabulary.
+            logit = self._compute_logit(hidden, self.out_layers[0].weight,
+                                        self.out_layers[0].bias, self.out_projs[0])
+            nll = -F.log_softmax(logit, dim=-1) \
+                    .gather(1, target.unsqueeze(1)).squeeze(1)
+        else:
+            # construct weights and biases
+            weights, biases = [], []
+            for i in range(len(self.cutoffs)):
+                if self.div_val == 1:
+                    # Slice the rows of the shared output layer that belong to range i.
+                    l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
+                    weight_i = self.out_layers[0].weight[l_idx:r_idx]
+                    bias_i = self.out_layers[0].bias[l_idx:r_idx]
+                else:
+                    weight_i = self.out_layers[i].weight
+                    bias_i = self.out_layers[i].bias
+
+                if i == 0:
+                    # The head scores the shortlist tokens followed by one entry per tail cluster.
+                    weight_i = torch.cat(
+                        [weight_i, self.cluster_weight], dim=0)
+                    bias_i = torch.cat(
+                        [bias_i, self.cluster_bias], dim=0)
+
+                weights.append(weight_i)
+                biases.append(bias_i)
+
+            head_weight, head_bias, head_proj = weights[0], biases[0], self.out_projs[0]
+
+            head_logit = self._compute_logit(hidden, head_weight, head_bias, head_proj)
+            head_logprob = F.log_softmax(head_logit, dim=1)
+
+            nll = torch.zeros_like(target,
+                    dtype=hidden.dtype, device=hidden.device)
+
+            # `offset` is the next write position in `nll` when the original order is not kept.
+            offset = 0
+            cutoff_values = [0] + self.cutoffs
+            for i in range(len(cutoff_values) - 1):
+                l_idx, r_idx = cutoff_values[i], cutoff_values[i + 1]
+
+                # Select the positions whose target falls into range i.
+                mask_i = (target >= l_idx) & (target < r_idx)
+                # NOTE(review): with exactly one match squeeze() yields a 0-dim index;
+                # confirm the index_select/index_copy_ calls below accept it.
+                indices_i = mask_i.nonzero().squeeze()
+
+                if indices_i.numel() == 0:
+                    continue
+
+                # Targets are re-based so they index into range i.
+                target_i = target.index_select(0, indices_i) - l_idx
+                head_logprob_i = head_logprob.index_select(0, indices_i)
+
+                if i == 0:
+                    # Shortlist tokens are read directly from the head distribution.
+                    logprob_i = head_logprob_i.gather(1, target_i[:,None]).squeeze(1)
+                else:
+                    weight_i, bias_i, proj_i = weights[i], biases[i], self.out_projs[i]
+
+                    hidden_i = hidden.index_select(0, indices_i)
+
+                    tail_logit_i = self._compute_logit(hidden_i, weight_i, bias_i, proj_i)
+                    tail_logprob_i = F.log_softmax(tail_logit_i, dim=1)
+
+                    # log p(token) = log p(cluster) + log p(token | cluster); the cluster
+                    # log-probability is read from head column -i (counted from the end).
+                    logprob_i = head_logprob_i[:, -i] \
+                              + tail_logprob_i.gather(1, target_i[:,None]).squeeze(1)
+
+                if (hasattr(self, 'keep_order') and self.keep_order) or keep_order:
+                    # Write each value back at the position of its target.
+                    nll.index_copy_(0, indices_i, -logprob_i)
+                else:
+                    # Append the values of this range after those of the previous ranges.
+                    nll[offset:offset+logprob_i.size(0)].copy_(-logprob_i)
+
+                offset += logprob_i.size(0)
+
+        return nll
+
+class LogUniformSampler(object):
+    """Sample negative classes from a log-uniform distribution over [0, range_max)."""
+
+    def __init__(self, range_max, n_sample):
+        """
+        Reference : https://github.com/tensorflow/tensorflow/blob/r1.10/tensorflow/python/ops/candidate_sampling_ops.py
+            `P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)`
+
+        expected count can be approximated by 1 - (1 - p)^n
+        and we use a numerically stable version -expm1(num_tries * log1p(-p))
+
+        Our implementation fixes num_tries at 2 * n_sample, and the actual #samples will vary from run to run
+
+        Args:
+            range_max: number of classes; `self.dist` and `self.log_q` hold one entry per class.
+            n_sample: nominal number of negatives; `sample` performs 2 * n_sample draws.
+        """
+        with torch.no_grad():
+            self.range_max = range_max
+            log_indices = torch.arange(1., range_max+2., 1.).log_()
+            self.dist = (log_indices[1:] - log_indices[:-1]) / log_indices[-1]
+
+            # log(1 - (1 - p)^num_tries) in double precision. The minus sign must be
+            # applied to p *before* log1p: -log1p(p) is a different quantity.
+            num_tries = 2 * n_sample
+            neg_p = -self.dist.double()
+            self.log_q = (-(neg_p.log1p_() * num_tries).expm1_()).log_().float()
+
+        self.n_sample = n_sample
+
+    def sample(self, labels):
+        """
+            labels: [b1, b2]
+        Return
+            true_log_probs: [b1, b2]
+            samp_log_probs: [n_unique], n_unique <= 2 * n_sample (draws are de-duplicated)
+            neg_samples: [n_unique]
+        """
+        # draw with replacement, then de-duplicate: the sample count varies per call
+        n_tries = 2 * self.n_sample
+
+        with torch.no_grad():
+            neg_samples = torch.multinomial(self.dist, n_tries, replacement=True).unique()
+            device = labels.device
+            neg_samples = neg_samples.to(device)
+            # move the lookup table to the labels' device before indexing, so the
+            # table and the index tensors always live on the same device
+            log_q = self.log_q.to(device)
+            true_log_probs = log_q[labels]
+            samp_log_probs = log_q[neg_samples]
+            return true_log_probs, samp_log_probs, neg_samples
+
+def sample_logits(embedding, bias, labels, inputs, sampler):
+    """
+    Compute sampled-softmax logits: the true-label logit followed by one logit per negative.
+
+    Each logit is `w . input + b - log_q`, with `log_q` taken from `sampler.sample`.
+
+        embedding: an nn.Embedding layer
+        bias: [n_vocab]
+        labels: [b1, b2]
+        inputs: [b1, b2, n_emb]
+        sampler: you may use a LogUniformSampler
+    Return
+        logits: [b1, b2, 1 + n_sample], where n_sample is the number of negatives
+            returned by `sampler.sample`; index 0 on the last axis is the true label
+    """
+    true_log_probs, samp_log_probs, neg_samples = sampler.sample(labels)
+    n_sample = neg_samples.size(0)
+    b1, b2 = labels.size(0), labels.size(1)
+    # one embedding/bias lookup for true labels and negatives together, split below
+    all_ids = torch.cat([labels.view(-1), neg_samples])
+    all_w = embedding(all_ids)
+    true_w = all_w[: -n_sample].view(b1, b2, -1)
+    sample_w = all_w[- n_sample:].view(n_sample, -1)
+
+    all_b = bias[all_ids]
+    true_b = all_b[: -n_sample].view(b1, b2)
+    sample_b = all_b[- n_sample:]
+
+    # positions where a sampled negative equals the true label
+    hit = (labels[:, :, None] == neg_samples).detach()
+
+    true_logits = torch.einsum('ijk,ijk->ij',
+        [true_w, inputs]) + true_b - true_log_probs
+    sample_logits = torch.einsum('lk,ijk->ijl',
+        [sample_w, inputs]) + sample_b - samp_log_probs
+    # push those colliding negatives to a very large negative logit
+    sample_logits.masked_fill_(hit, -1e30)
+    logits = torch.cat([true_logits[:, :, None], sample_logits], -1)
+
+    return logits
+
+
+# class LogUniformSampler(object):
+#     def __init__(self, range_max, unique=False):
+#         """
+#         Reference : https://github.com/tensorflow/tensorflow/blob/r1.10/tensorflow/python/ops/candidate_sampling_ops.py
+#             `P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)`
+#         """
+#         self.range_max = range_max
+#         log_indices = torch.arange(1., range_max+2., 1.).log_()
+#         self.dist = (log_indices[1:] - log_indices[:-1]) / log_indices[-1]
+
+#         self.unique = unique
+
+#         if self.unique:
+#             self.exclude_mask = torch.ByteTensor(range_max).fill_(0)
+
+#     def sample(self, n_sample, labels):
+#         pos_sample, new_labels = labels.unique(return_inverse=True)
+#         n_pos_sample = pos_sample.size(0)
+#         n_neg_sample = n_sample - n_pos_sample
+
+#         if self.unique:
+#             self.exclude_mask.index_fill_(0, pos_sample, 1)
+#             sample_dist = self.dist.clone().masked_fill_(self.exclude_mask, 0)
+#             self.exclude_mask.index_fill_(0, pos_sample, 0)
+#         else:
+#             sample_dist = self.dist
+
+#         neg_sample = torch.multinomial(sample_dist, n_neg_sample)
+
+#         sample = torch.cat([pos_sample, neg_sample])
+#         sample_prob = self.dist[sample]
+
+#         return new_labels, sample, sample_prob
+
+
+if __name__ == '__main__':
+    # Smoke test: build sampled-softmax logits for random labels and inputs.
+    S, B = 3, 4
+    n_vocab = 10000
+    n_sample = 5
+    H = 32
+
+    labels = torch.LongTensor(S, B).random_(0, n_vocab)
+
+    # LogUniformSampler takes (range_max, n_sample); it has no `unique` argument.
+    sampler = LogUniformSampler(n_vocab, n_sample)
+
+    embedding = nn.Embedding(n_vocab, H)
+    bias = torch.zeros(n_vocab)
+    inputs = torch.Tensor(S, B, H).normal_()
+
+    # sample_logits takes no n_sample argument and returns only the logits tensor.
+    logits = sample_logits(embedding, bias, labels, inputs, sampler)
+    print('logits', logits.detach().numpy().tolist())
+    print('logits shape', logits.size())
+
diff --git a/pytorch_pretrained_bert/tokenization_transfo_xl.py b/pytorch_pretrained_bert/tokenization_transfo_xl.py
new file mode 100644
index 0000000000..1d278abcb2
--- /dev/null
+++ b/pytorch_pretrained_bert/tokenization_transfo_xl.py
@@ -0,0 +1,508 @@
+# coding=utf-8
+# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HugginFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Tokenization classes for Transformer XL model.
+    Directly adapted from https://github.com/kimiyoung/transformer-xl.
+"""
+
+import glob
+import json
+import logging
+import os
+import pickle
+import re
+from collections import Counter, OrderedDict
+
+import numpy as np
+import torch
+from tqdm import tqdm
+
+from .file_utils import cached_path
+
+logger = logging.getLogger(__name__)
+
+# NOTE(review): the URL below names the OpenAI GPT vocabulary file, and the two maps
+# after it are keyed by 'openai-gpt' rather than 'transfo-xl'. These look like
+# leftovers from the GPT tokenizer -- confirm the intended Transformer-XL values.
+PRETRAINED_VOCAB_ARCHIVE_MAP = {
+    'transfo-xl': "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json",
+}
+PRETRAINED_MERGES_ARCHIVE_MAP = {
+    'openai-gpt': "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt",
+}
+PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
+    'openai-gpt': 512,
+}
+# File names looked up inside a local model directory.
+VOCAB_NAME = 'vocab.json'
+MERGES_NAME = 'merges.txt'
+
+class TransfoXLTokenizer(object):
+    """
+    Transformer-XL tokenizer adapted from Vocab class in https://github.com/kimiyoung/transformer-xl
+
+    Counts symbols, builds the symbol <-> index tables and converts text to index tensors.
+    """
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
+        """
+        Instantiate a TransfoXLTokenizer.
+        Download and cache the vocabulary if needed.
+
+        `pretrained_model_name_or_path` is either a key of PRETRAINED_VOCAB_ARCHIVE_MAP or a
+        directory containing a VOCAB_NAME file. Returns None (after logging an error) when
+        the vocabulary file cannot be found.
+        """
+        # This tokenizer has no merges file: only the vocabulary is resolved.
+        if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
+            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
+        else:
+            vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)
+        # redirect to the cache, if necessary
+        try:
+            resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
+        except FileNotFoundError:
+            logger.error(
+                "Model name '{}' was not found in model name list ({}). "
+                "We assumed '{}' was a path or url but couldn't find file {} "
+                "at this path or url.".format(
+                    pretrained_model_name_or_path,
+                    ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
+                    pretrained_model_name_or_path,
+                    vocab_file))
+            return None
+        if resolved_vocab_file == vocab_file:
+            logger.info("loading vocabulary file {}".format(vocab_file))
+        else:
+            logger.info("loading vocabulary file {} from cache at {}".format(
+                vocab_file, resolved_vocab_file))
+        # Instantiate tokenizer. The path goes in by keyword: the constructor's first
+        # positional parameters are `special` and `min_freq`, not file paths.
+        # NOTE(review): the file must be in the one-symbol-per-line format read by
+        # `_build_from_file` (including a '<UNK>' entry) -- confirm for the hosted file.
+        kwargs['vocab_file'] = resolved_vocab_file
+        tokenizer = cls(*inputs, **kwargs)
+        tokenizer.build_vocab()
+        return tokenizer
+
+    def __init__(self, special=None, min_freq=0, max_size=None, lower_case=True,
+                 delimiter=None, vocab_file=None):
+        """
+        Args:
+            special: symbols added first by `build_vocab` (each also gets a `<name>_idx` attribute).
+            min_freq: `build_vocab` stops at the first symbol counted fewer times than this.
+            max_size: passed to `Counter.most_common`; None keeps every counted symbol.
+            lower_case: lower-case each line in `tokenize`.
+            delimiter: passed to `str.split`; '' splits the line into characters.
+            vocab_file: if set, `build_vocab` reads the symbols from this file instead.
+        """
+        self.counter = Counter()
+        # copy so instances never share one default list or alias the caller's list
+        self.special = list(special) if special is not None else []
+        self.min_freq = min_freq
+        self.max_size = max_size
+        self.lower_case = lower_case
+        self.delimiter = delimiter
+        self.vocab_file = vocab_file
+
+    def count_file(self, path, verbose=False, add_eos=False):
+        """Tokenize every line of `path`, count the symbols and return the tokenized lines."""
+        if verbose: print('counting file {} ...'.format(path))
+        assert os.path.exists(path)
+
+        sents = []
+        with open(path, 'r', encoding='utf-8') as f:
+            for idx, line in enumerate(f):
+                if verbose and idx > 0 and idx % 500000 == 0:
+                    print('    line {}'.format(idx))
+                symbols = self.tokenize(line, add_eos=add_eos)
+                self.counter.update(symbols)
+                sents.append(symbols)
+
+        return sents
+
+    def count_sents(self, sents, verbose=False):
+        """
+            sents : a list of sentences, each a list of tokenized symbols
+        """
+        if verbose: print('counting {} sents ...'.format(len(sents)))
+        for idx, symbols in enumerate(sents):
+            if verbose and idx > 0 and idx % 500000 == 0:
+                print('    line {}'.format(idx))
+            self.counter.update(symbols)
+
+    def _build_from_file(self, vocab_file):
+        """Load the vocabulary from `vocab_file`: the first field of each line is a symbol."""
+        self.idx2sym = []
+        self.sym2idx = OrderedDict()
+
+        with open(vocab_file, 'r', encoding='utf-8') as f:
+            for line in f:
+                fields = line.strip().split()
+                if not fields:
+                    # skip blank lines instead of failing on fields[0]
+                    continue
+                self.add_symbol(fields[0])
+        # raises KeyError when the file has no '<UNK>' entry
+        self.unk_idx = self.sym2idx['<UNK>']
+
+    def build_vocab(self):
+        """Build the index tables from `self.vocab_file` if set, else from the counted symbols."""
+        if self.vocab_file:
+            print('building vocab from {}'.format(self.vocab_file))
+            self._build_from_file(self.vocab_file)
+            print('final vocab size {}'.format(len(self)))
+        else:
+            print('building vocab with min_freq={}, max_size={}'.format(
+                self.min_freq, self.max_size))
+            self.idx2sym = []
+            self.sym2idx = OrderedDict()
+
+            for sym in self.special:
+                self.add_special(sym)
+
+            # most_common is sorted by count, so the first rare symbol ends the loop
+            for sym, cnt in self.counter.most_common(self.max_size):
+                if cnt < self.min_freq: break
+                self.add_symbol(sym)
+
+            print('final vocab size {} from {} unique tokens'.format(
+                len(self), len(self.counter)))
+
+    def encode_file(self, path, ordered=False, verbose=False, add_eos=True,
+            add_double_eos=False):
+        """Encode each line of `path` as a LongTensor; `ordered=True` concatenates them into one."""
+        if verbose: print('encoding file {} ...'.format(path))
+        assert os.path.exists(path)
+        encoded = []
+        with open(path, 'r', encoding='utf-8') as f:
+            for idx, line in enumerate(f):
+                if verbose and idx > 0 and idx % 500000 == 0:
+                    print('    line {}'.format(idx))
+                symbols = self.tokenize(line, add_eos=add_eos,
+                    add_double_eos=add_double_eos)
+                encoded.append(self.convert_to_tensor(symbols))
+
+        if ordered:
+            encoded = torch.cat(encoded)
+
+        return encoded
+
+    def encode_sents(self, sents, ordered=False, verbose=False):
+        """Encode already-tokenized sentences; `ordered=True` concatenates them into one tensor."""
+        if verbose: print('encoding {} sents ...'.format(len(sents)))
+        encoded = []
+        for idx, symbols in enumerate(sents):
+            if verbose and idx > 0 and idx % 500000 == 0:
+                print('    line {}'.format(idx))
+            encoded.append(self.convert_to_tensor(symbols))
+
+        if ordered:
+            encoded = torch.cat(encoded)
+
+        return encoded
+
+    def add_special(self, sym):
+        """Add `sym` if new and expose its index as `self.<name>_idx` (name = sym without '<>')."""
+        if sym not in self.sym2idx:
+            self.idx2sym.append(sym)
+            self.sym2idx[sym] = len(self.idx2sym) - 1
+            setattr(self, '{}_idx'.format(sym.strip('<>')), self.sym2idx[sym])
+
+    def add_symbol(self, sym):
+        """Append `sym` to the vocabulary unless it is already present."""
+        if sym not in self.sym2idx:
+            self.idx2sym.append(sym)
+            self.sym2idx[sym] = len(self.idx2sym) - 1
+
+    def get_sym(self, idx):
+        """Return the symbol stored at index `idx`."""
+        assert 0 <= idx < len(self), 'Index {} out of range'.format(idx)
+        return self.idx2sym[idx]
+
+    def get_idx(self, sym):
+        """Return the index of `sym`, or `self.unk_idx` when `sym` is not in the vocabulary."""
+        if sym in self.sym2idx:
+            return self.sym2idx[sym]
+        # unknown symbol: '<eos>' must never map to unk, and an unk index must exist
+        assert '<eos>' not in sym
+        assert hasattr(self, 'unk_idx')
+        return self.unk_idx
+
+    def convert_ids_to_tokens(self, indices):
+        """Converts a sequence of indices in symbols using the vocab."""
+        return [self.get_sym(idx) for idx in indices]
+
+    def convert_tokens_to_ids(self, symbols):
+        """Converts a sequence of symbols into ids using the vocab."""
+        return [self.get_idx(sym) for sym in symbols]
+
+    def convert_to_tensor(self, symbols):
+        """Convert a sequence of symbols into a LongTensor of ids."""
+        return torch.LongTensor(self.convert_tokens_to_ids(symbols))
+
+    def decode(self, indices, exclude=None):
+        """Converts a sequence of indices in a string."""
+        if exclude is None:
+            return ' '.join([self.get_sym(idx) for idx in indices])
+        else:
+            return ' '.join([self.get_sym(idx) for idx in indices if idx not in exclude])
+
+    def __len__(self):
+        return len(self.idx2sym)
+
+    def tokenize(self, line, add_eos=False, add_double_eos=False):
+        """
+        Split `line` into a list of symbols.
+
+        `add_double_eos` wraps the symbols in '<S>' markers and takes precedence over
+        `add_eos`, which appends '<eos>'.
+        """
+        line = line.strip()
+        # convert to lower case
+        if self.lower_case:
+            line = line.lower()
+
+        # empty delimiter '' will evaluate False
+        if self.delimiter == '':
+            # character-level: build a list so the list concatenations below work
+            symbols = list(line)
+        else:
+            symbols = line.split(self.delimiter)
+
+        if add_double_eos: # lm1b
+            return ['<S>'] + symbols + ['<S>']
+        elif add_eos:
+            return symbols + ['<eos>']
+        else:
+            return symbols
+
+class LMOrderedIterator(object):
+    """Iterate over one contiguous token stream as `bsz` parallel columns, `bptt` steps at a time."""
+
+    def __init__(self, data, bsz, bptt, device='cpu', ext_len=None):
+        """
+            data -- LongTensor -- the LongTensor is strictly ordered
+            bsz -- number of parallel columns the stream is cut into
+            bptt -- default number of time steps per batch
+            ext_len -- extra steps of left context returned with each batch (None means 0)
+        """
+        self.bsz = bsz
+        self.bptt = bptt
+        self.ext_len = ext_len if ext_len is not None else 0
+
+        self.device = device
+
+        # Work out how cleanly we can divide the dataset into bsz parts.
+        self.n_step = data.size(0) // bsz
+
+        # Trim off any extra elements that wouldn't cleanly fit (remainders).
+        data = data.narrow(0, 0, self.n_step * bsz)
+
+        # Evenly divide the data across the bsz batches.
+        # After the transpose, self.data is [n_step x bsz].
+        self.data = data.view(bsz, -1).t().contiguous().to(device)
+
+        # Number of mini-batches
+        self.n_batch = (self.n_step + self.bptt - 1) // self.bptt
+
+    def get_batch(self, i, bptt=None):
+        """Return (data, target, seq_len) starting at step `i`; target is data shifted by one step."""
+        if bptt is None: bptt = self.bptt
+        # leave one step at the end so every input position has a target
+        seq_len = min(bptt, self.data.size(0) - 1 - i)
+
+        end_idx = i + seq_len
+        # prepend up to ext_len steps of earlier context
+        beg_idx = max(0, i - self.ext_len)
+
+        data = self.data[beg_idx:end_idx]
+        target = self.data[i+1:i+1+seq_len]
+
+        return data, target, seq_len
+
+    def get_fixlen_iter(self, start=0):
+        """Yield batches at fixed `bptt` strides starting from step `start`."""
+        for i in range(start, self.data.size(0) - 1, self.bptt):
+            yield self.get_batch(i)
+
+    def get_varlen_iter(self, start=0, std=5, min_len=5, max_deviation=3):
+        """Yield batches whose length is drawn at random around `bptt` and clamped to a range."""
+        max_len = self.bptt + max_deviation * std
+        i = start
+        while True:
+            # 95% of the time centre on bptt, otherwise on bptt / 2
+            bptt = self.bptt if np.random.random() < 0.95 else self.bptt / 2.
+            bptt = min(max_len, max(min_len, int(np.random.normal(bptt, std))))
+            data, target, seq_len = self.get_batch(i, bptt)
+            i += seq_len
+            yield data, target, seq_len
+            if i >= self.data.size(0) - 2:
+                break
+
+    def __iter__(self):
+        return self.get_fixlen_iter()
+
+
+class LMShuffledIterator(object):
+    """Pack independent sentences into `bsz` parallel streams and yield `bptt`-step batches."""
+
+    def __init__(self, data, bsz, bptt, device='cpu', ext_len=None, shuffle=False):
+        """
+            data -- list[LongTensor] -- there is no order among the LongTensors
+            bsz -- number of parallel streams per batch
+            bptt -- number of time steps per batch
+            ext_len -- steps of the previous batch kept at the front of `data` (None means 0)
+            shuffle -- visit the sentences in a random order
+        """
+        self.data = data
+
+        self.bsz = bsz
+        self.bptt = bptt
+        self.ext_len = ext_len if ext_len is not None else 0
+
+        self.device = device
+        self.shuffle = shuffle
+
+    def get_sent_stream(self):
+        """Yield the sentences of `self.data`, in random order when `self.shuffle` is set."""
+        # index iterator
+        epoch_indices = np.random.permutation(len(self.data)) if self.shuffle \
+            else np.array(range(len(self.data)))
+
+        # sentence iterator
+        for idx in epoch_indices:
+            yield self.data[idx]
+
+    def stream_iterator(self, sent_stream):
+        """
+        Yield (data, target, bptt) batches filled from `sent_stream`.
+
+        Stops without yielding a partial batch as soon as `sent_stream` is exhausted.
+        """
+        # streams for each data in the batch
+        streams = [None] * self.bsz
+
+        data = torch.LongTensor(self.bptt, self.bsz)
+        target = torch.LongTensor(self.bptt, self.bsz)
+
+        n_retain = 0
+
+        while True:
+            # data   : [n_retain+bptt x bsz]
+            # target : [bptt x bsz]
+            # -1 marks positions not (yet) filled
+            data[n_retain:].fill_(-1)
+            target.fill_(-1)
+
+            valid_batch = True
+
+            for i in range(self.bsz):
+                n_filled = 0
+                try:
+                    while n_filled < self.bptt:
+                        # a stream with <= 1 token left has no (input, target) pair to give
+                        if streams[i] is None or len(streams[i]) <= 1:
+                            streams[i] = next(sent_stream)
+                        # number of new tokens to fill in
+                        n_new = min(len(streams[i]) - 1, self.bptt - n_filled)
+                        # first n_retain tokens are retained from last batch
+                        data[n_retain+n_filled:n_retain+n_filled+n_new, i] = \
+                            streams[i][:n_new]
+                        target[n_filled:n_filled+n_new, i] = \
+                            streams[i][1:n_new+1]
+                        # keep the last consumed target as the next input
+                        streams[i] = streams[i][n_new:]
+                        n_filled += n_new
+                except StopIteration:
+                    valid_batch = False
+                    break
+
+            if not valid_batch:
+                return
+
+            data = data.to(self.device)
+            target = target.to(self.device)
+
+            yield data, target, self.bptt
+
+            # carry the last ext_len steps over to the front of the next batch
+            n_retain = min(data.size(0), self.ext_len)
+            if n_retain > 0:
+                data[:n_retain] = data[-n_retain:]
+            data.resize_(n_retain + self.bptt, data.size(1))
+
+    def __iter__(self):
+        # sent_stream is an iterator
+        sent_stream = self.get_sent_stream()
+
+        for batch in self.stream_iterator(sent_stream):
+            yield batch
+
+
+class LMMultiFileIterator(LMShuffledIterator):
+    """Like LMShuffledIterator, but encodes the sentences from a list of files, one file at a time."""
+
+    def __init__(self, paths, vocab, bsz, bptt, device='cpu', ext_len=None,
+        shuffle=False):
+        """
+            paths -- list of text files to read
+            vocab -- object whose `encode_file(path, add_double_eos=True)` returns the sentences
+            bsz, bptt, device, ext_len, shuffle -- stored as in LMShuffledIterator
+        """
+        # note: the parent __init__ is not called; every attribute is set here
+
+        self.paths = paths
+        self.vocab = vocab
+
+        self.bsz = bsz
+        self.bptt = bptt
+        self.ext_len = ext_len if ext_len is not None else 0
+
+        self.device = device
+        self.shuffle = shuffle
+
+    def get_sent_stream(self, path):
+        """Encode the file at `path` and return an iterator over its sentences."""
+        sents = self.vocab.encode_file(path, add_double_eos=True)
+        if self.shuffle:
+            np.random.shuffle(sents)
+        sent_stream = iter(sents)
+
+        return sent_stream
+
+    def __iter__(self):
+        if self.shuffle:
+            # shuffles self.paths in place
+            np.random.shuffle(self.paths)
+
+        for path in self.paths:
+            # sent_stream is an iterator
+            sent_stream = self.get_sent_stream(path)
+            for batch in self.stream_iterator(sent_stream):
+                yield batch
+
+
+class Corpus(object):
+    """A vocabulary plus the encoded train/valid/test splits of one language-modeling dataset."""
+
+    def __init__(self, path, dataset, *args, **kwargs):
+        """
+        Build the vocabulary for `dataset` from the files under `path` and encode its splits.
+
+        Extra positional and keyword arguments are forwarded to TransfoXLTokenizer.
+        """
+        self.dataset = dataset
+        # the tokenizer class defined in this module is TransfoXLTokenizer, not `Vocab`
+        self.vocab = TransfoXLTokenizer(*args, **kwargs)
+
+        if self.dataset in ['ptb', 'wt2', 'enwik8', 'text8']:
+            self.vocab.count_file(os.path.join(path, 'train.txt'))
+            self.vocab.count_file(os.path.join(path, 'valid.txt'))
+            self.vocab.count_file(os.path.join(path, 'test.txt'))
+        elif self.dataset == 'wt103':
+            self.vocab.count_file(os.path.join(path, 'train.txt'))
+        elif self.dataset == 'lm1b':
+            train_path_pattern = os.path.join(
+                path, '1-billion-word-language-modeling-benchmark-r13output',
+                'training-monolingual.tokenized.shuffled', 'news.en-*')
+            train_paths = glob.glob(train_path_pattern)
+            # the vocab will load from file when build_vocab() is called
+
+        self.vocab.build_vocab()
+
+        if self.dataset in ['ptb', 'wt2', 'wt103']:
+            self.train = self.vocab.encode_file(
+                os.path.join(path, 'train.txt'), ordered=True)
+            self.valid = self.vocab.encode_file(
+                os.path.join(path, 'valid.txt'), ordered=True)
+            self.test  = self.vocab.encode_file(
+                os.path.join(path, 'test.txt'), ordered=True)
+        elif self.dataset in ['enwik8', 'text8']:
+            self.train = self.vocab.encode_file(
+                os.path.join(path, 'train.txt'), ordered=True, add_eos=False)
+            self.valid = self.vocab.encode_file(
+                os.path.join(path, 'valid.txt'), ordered=True, add_eos=False)
+            self.test  = self.vocab.encode_file(
+                os.path.join(path, 'test.txt'), ordered=True, add_eos=False)
+        elif self.dataset == 'lm1b':
+            # training files are kept as paths and encoded lazily by the iterator
+            self.train = train_paths
+            self.valid = self.vocab.encode_file(
+                os.path.join(path, 'valid.txt'), ordered=False, add_double_eos=True)
+            self.test  = self.vocab.encode_file(
+                os.path.join(path, 'test.txt'), ordered=False, add_double_eos=True)
+
+    def get_iterator(self, split, *args, **kwargs):
+        """
+        Return a batch iterator over `split` ('train', 'valid' or 'test').
+
+        Extra arguments are forwarded to the iterator class. Raises ValueError when no
+        iterator is defined for this split / dataset combination.
+        """
+        data_iter = None
+        if split == 'train':
+            if self.dataset in ['ptb', 'wt2', 'wt103', 'enwik8', 'text8']:
+                data_iter = LMOrderedIterator(self.train, *args, **kwargs)
+            elif self.dataset == 'lm1b':
+                kwargs['shuffle'] = True
+                data_iter = LMMultiFileIterator(self.train, self.vocab, *args, **kwargs)
+        elif split in ['valid', 'test']:
+            data = self.valid if split == 'valid' else self.test
+            if self.dataset in ['ptb', 'wt2', 'wt103', 'enwik8', 'text8']:
+                data_iter = LMOrderedIterator(data, *args, **kwargs)
+            elif self.dataset == 'lm1b':
+                data_iter = LMShuffledIterator(data, *args, **kwargs)
+
+        if data_iter is None:
+            # fail with a clear message instead of an unbound-local error
+            raise ValueError("no iterator for split '{}' of dataset '{}'".format(
+                split, self.dataset))
+        return data_iter
+
+
+def get_lm_corpus(datadir, dataset):
+    """
+    Return the Corpus for `dataset`, using a cache stored in `datadir` when one exists.
+
+    Looks for `cache.pt` (torch.save format) first, then `cache.pkl` (pickle). When
+    neither exists, the corpus is built from the files in `datadir` and saved to `cache.pt`.
+    """
+    fn = os.path.join(datadir, 'cache.pt')
+    fn_pickle = os.path.join(datadir, 'cache.pkl')
+    if os.path.exists(fn):
+        print('Loading cached dataset...')
+        # load the file whose existence was just checked
+        corpus = torch.load(fn)
+    elif os.path.exists(fn_pickle):
+        print('Loading cached dataset from pickle...')
+        # NOTE: unpickling executes arbitrary code; only use cache files you trust
+        with open(fn_pickle, "rb") as fp:
+            corpus = pickle.load(fp)
+    else:
+        print('Producing dataset {}...'.format(dataset))
+        kwargs = {}
+        if dataset in ['wt103', 'wt2']:
+            kwargs['special'] = ['<eos>']
+            kwargs['lower_case'] = False
+        elif dataset == 'ptb':
+            kwargs['special'] = ['<eos>']
+            kwargs['lower_case'] = True
+        elif dataset == 'lm1b':
+            kwargs['special'] = []
+            kwargs['lower_case'] = False
+            kwargs['vocab_file'] = os.path.join(datadir, '1b_word_vocab.txt')
+        elif dataset in ['enwik8', 'text8']:
+            # these datasets use the tokenizer defaults
+            pass
+
+        corpus = Corpus(datadir, dataset, **kwargs)
+        torch.save(corpus, fn)
+
+    return corpus

From 7d03c53718a190a03579a778fbd16fdb05d0f807 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 15 Jan 2019 16:07:25 +0100
Subject: [PATCH 09/82] conversion working

---
 .gitignore                                    |   5 +-
 pytorch_pretrained_bert/__main__.py           |  27 +-
 .../convert_openai_checkpoint_to_pytorch.py   |   2 -
 ...onvert_transfo_xl_checkpoint_to_pytorch.py | 141 ++++--
 .../modeling_transfo_xl.py                    | 402 +++---------------
 5 files changed, 193 insertions(+), 384 deletions(-)

diff --git a/.gitignore b/.gitignore
index 56a5f0d38a..d7489f88e8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -119,4 +119,7 @@ dmypy.json
 .vscode
 
 # TF code
-tensorflow_code
\ No newline at end of file
+tensorflow_code
+
+# models
+models
\ No newline at end of file
diff --git a/pytorch_pretrained_bert/__main__.py b/pytorch_pretrained_bert/__main__.py
index 1557adc63f..731d87f26a 100644
--- a/pytorch_pretrained_bert/__main__.py
+++ b/pytorch_pretrained_bert/__main__.py
@@ -3,9 +3,14 @@ def main():
     import sys
     if (len(sys.argv) != 4 and len(sys.argv) != 5) or sys.argv[1] not in [
         "convert_tf_checkpoint_to_pytorch",
-        "convert_openai_checkpoint"
+        "convert_openai_checkpoint",
+        "convert_transfo_xl_checkpoint"
     ]:
-        print("Should be used as `pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT` \n or `pytorch_pretrained_bert convert_openai_checkpoint OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`")
+        print(
+        "Should be used as"
+        "`pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`, \n"
+        "`pytorch_pretrained_bert convert_openai_checkpoint OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]` or \n"
+        "`pytorch_pretrained_bert convert_transfo_xl_checkpoint TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [TF_CONFIG]`")
     else:
         if sys.argv[1] == "convert_tf_checkpoint_to_pytorch":
             try:
@@ -24,7 +29,7 @@ def main():
                 TF_CONFIG = sys.argv.pop()
                 TF_CHECKPOINT = sys.argv.pop()
                 convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT)
-        else:
+        elif sys.argv[1] == "convert_openai_checkpoint":
             from .convert_openai_checkpoint_to_pytorch import convert_openai_checkpoint_to_pytorch
             OPENAI_GPT_CHECKPOINT_FOLDER_PATH = sys.argv[2]
             PYTORCH_DUMP_OUTPUT = sys.argv[3]
@@ -35,6 +40,22 @@ def main():
             convert_openai_checkpoint_to_pytorch(OPENAI_GPT_CHECKPOINT_FOLDER_PATH,
                                                  OPENAI_GPT_CONFIG,
                                                  PYTORCH_DUMP_OUTPUT)
+        else:
+            try:
+                from .convert_transfo_xl_checkpoint_to_pytorch import convert_transfo_xl_checkpoint_to_pytorch
+            except ModuleNotFoundError:
+                print("pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, "
+                    "In that case, it requires TensorFlow to be installed. Please see "
+                    "https://www.tensorflow.org/install/ for installation instructions.")
+                raise
+
+            TF_CHECKPOINT = sys.argv[2]
+            PYTORCH_DUMP_OUTPUT = sys.argv[3]
+            if len(sys.argv) == 5:
+                TF_CONFIG = sys.argv[4]
+            else:
+                TF_CONFIG = ""
+            convert_transfo_xl_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT)
 
 if __name__ == '__main__':
     main()
diff --git a/pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py b/pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py
index 2e25d16e61..0c41741d9a 100755
--- a/pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py
+++ b/pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py
@@ -18,11 +18,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os
 import re
 import json
 import argparse
-import tensorflow as tf
 import torch
 import numpy as np
 
diff --git a/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py b/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py
index 03f71defd6..861c26280d 100755
--- a/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py
+++ b/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py
@@ -25,7 +25,72 @@ import tensorflow as tf
 import torch
 import numpy as np
 
-from .modeling_transfo_xl import TransfoXLConfig, TransfoXLModel, CONFIG_NAME, WEIGHTS_NAME
+from pytorch_pretrained_bert.modeling_transfo_xl import TransfoXLConfig, TransfoXLModel, CONFIG_NAME, WEIGHTS_NAME
+
+def build_tf_to_pytorch_map(model, config):
+    """ A map of modules from TF to PyTorch """
+    tf_to_pt_map = {}
+    # Embeddings cutoffs
+    for i, (embed_l, proj_l) in enumerate(zip(model.word_emb.emb_layers, model.word_emb.emb_projs)):
+        layer_str = "transformer/adaptive_embed/cutoff_%d/" % i
+        tf_to_pt_map.update({
+            layer_str + 'lookup_table': embed_l.weight,
+            layer_str + 'proj_W': proj_l
+            })
+
+    # Transformer blocks
+    for i, b in enumerate(model.layers):
+        layer_str = "transformer/layer_%d/" % i
+        tf_to_pt_map.update({
+            layer_str + "rel_attn/LayerNorm/gamma": b.dec_attn.layer_norm.weight,
+            layer_str + "rel_attn/LayerNorm/beta": b.dec_attn.layer_norm.bias,
+            layer_str + "rel_attn/o/kernel": b.dec_attn.o_net.weight,
+            layer_str + "rel_attn/qkv/kernel": b.dec_attn.qkv_net.weight,
+            layer_str + "rel_attn/r/kernel": b.dec_attn.r_net.weight,
+            layer_str + "ff/LayerNorm/gamma": b.pos_ff.layer_norm.weight,
+            layer_str + "ff/LayerNorm/beta": b.pos_ff.layer_norm.bias,
+            layer_str + "ff/layer_1/kernel": b.pos_ff.CoreNet[0].weight,
+            layer_str + "ff/layer_1/bias": b.pos_ff.CoreNet[0].bias,
+            layer_str + "ff/layer_2/kernel": b.pos_ff.CoreNet[3].weight,
+            layer_str + "ff/layer_2/bias": b.pos_ff.CoreNet[3].bias,
+        })
+
+    # Softmax cutoffs
+    for i, (out_l, proj_l, tie_proj) in enumerate(zip(
+                            model.crit.out_layers,
+                            model.crit.out_projs,
+                            config.tie_projs)):
+        layer_str = "transformer/adaptive_softmax/cutoff_%d/" % i
+        if config.tie_weight:
+            tf_to_pt_map.update({
+                layer_str + 'b': out_l.bias})
+        else:
+            raise NotImplementedError
+            # I don't think this is implemented in the TF code
+            tf_to_pt_map.update({
+                layer_str + 'lookup_table': out_l.weight,
+                layer_str + 'b': out_l.bias})
+        if not tie_proj:
+            tf_to_pt_map.update({
+                layer_str + 'proj': proj_l
+                })
+
+    # Relative positioning biases
+    if config.untie_r:
+        layer_str = "transformer/r_r_bias"
+        layer_str_2 = "transformer/r_w_bias"
+        r_r_list = []
+        r_w_list = []
+        for b in model.layers:
+            r_r_list.append(b.dec_attn.r_r_bias)
+            r_w_list.append(b.dec_attn.r_w_bias)
+    else:
+        r_r_list = [model.r_r_bias]
+        r_w_list = [model.r_w_bias]
+    tf_to_pt_map.update({
+        'transformer/r_r_bias': r_r_list,
+        'transformer/r_w_bias': r_w_list})
+    return tf_to_pt_map
 
 
 def convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path,
@@ -35,16 +100,6 @@ def convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path,
     tf_path = os.path.abspath(tf_checkpoint_path)
 
     print("Converting Transformer XL checkpoint from {} with config at {}".format(tf_path, config_path))
-    # Load weights from TF model
-    init_vars = tf.train.list_variables(tf_path)
-    names = []
-    arrays = []
-    for name, shape in init_vars:
-        print("Loading TF weight {} with shape {}".format(name, shape))
-        array = tf.train.load_variable(tf_path, name)
-        names.append(name)
-        arrays.append(array)
-
     # Initialise PyTorch model
     # Construct model
     if transfo_xl_config_file == "":
@@ -54,34 +109,37 @@ def convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path,
     print("Building PyTorch model from configuration: {}".format(str(config)))
     model = TransfoXLModel(config)
 
-    for name, array in zip(names, arrays):
-        name = name.split('/')
+    # Build TF to PyTorch weights loading map
+    tf_to_pt_map = build_tf_to_pytorch_map(model.transformer, config)
+
+    # Load weights from TF model
+    init_vars = tf.train.list_variables(tf_path)
+    tf_weights = {}
+    for name, shape in init_vars:
+        print("Loading TF weight {} with shape {}".format(name, shape))
+        array = tf.train.load_variable(tf_path, name)
+        tf_weights[name] = array
+
+    for name, pointer in tf_to_pt_map.items():
+        assert name in tf_weights
+        array = tf_weights[name]
         # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
         # which are not required for using pretrained model
-        if any(n in ["adam_v", "adam_m"] for n in name):
-            print("Skipping {}".format("/".join(name)))
-            continue
-        pointer = model
-        for m_name in name:
-            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
-                l = re.split(r'_(\d+)', m_name)
-            else:
-                l = [m_name]
-            if l[0] == 'kernel' or l[0] == 'gamma':
-                pointer = getattr(pointer, 'weight')
-            elif l[0] == 'output_bias' or l[0] == 'beta':
-                pointer = getattr(pointer, 'bias')
-            elif l[0] == 'output_weights':
-                pointer = getattr(pointer, 'weight')
-            else:
-                pointer = getattr(pointer, l[0])
-            if len(l) >= 2:
-                num = int(l[1])
-                pointer = pointer[num]
-        if m_name[-11:] == '_embeddings':
-            pointer = getattr(pointer, 'weight')
-        elif m_name == 'kernel':
+        if 'kernel' in name or 'proj_W' in name:
             array = np.transpose(array)
+        if ('r_r_bias' in name or 'r_w_bias' in name) and len(pointer) > 1:
+            # Here we will split the TF weights
+            assert len(pointer) == array.shape[0]
+            for i, p_i in enumerate(pointer):
+                arr_i = array[i, ...]
+                try:
+                    assert p_i.shape == arr_i.shape
+                except AssertionError as e:
+                    e.args += (p_i.shape, arr_i.shape)
+                    raise
+                print("Initialize PyTorch weight {} for layer {}".format(name, i))
+                p_i.data = torch.from_numpy(arr_i)
+            continue
         try:
             assert pointer.shape == array.shape
         except AssertionError as e:
@@ -108,17 +166,16 @@ if __name__ == "__main__":
                         type = str,
                         required = True,
                         help = "Path the TensorFlow checkpoint path.")
-    parser.add_argument("--transfo_xl_config_file",
-                        default = None,
-                        type = str,
-                        required = True,
-                        help = "The config json file corresponding to the pre-trained BERT model. \n"
-                            "This specifies the model architecture.")
     parser.add_argument("--pytorch_dump_folder_path",
                         default = None,
                         type = str,
                         required = True,
                         help = "Path to the output PyTorch model.")
+    parser.add_argument("--transfo_xl_config_file",
+                        default = "",
+                        type = str,
+                        help = "The config json file corresponding to the pre-trained BERT model. \n"
+                            "This specifies the model architecture.")
     args = parser.parse_args()
     convert_transfo_xl_checkpoint_to_pytorch(args.tf_checkpoint_path,
                                      args.transfo_xl_config_file,
diff --git a/pytorch_pretrained_bert/modeling_transfo_xl.py b/pytorch_pretrained_bert/modeling_transfo_xl.py
index fccd4616e4..a7a9ca2e5b 100644
--- a/pytorch_pretrained_bert/modeling_transfo_xl.py
+++ b/pytorch_pretrained_bert/modeling_transfo_xl.py
@@ -34,6 +34,7 @@ from torch.nn import CrossEntropyLoss
 from torch.nn.parameter import Parameter
 
 from .modeling import BertLayerNorm as LayerNorm
+from .modeling_transfo_xl_utilities import ProjectedAdaptiveLogSoftmax, sample_logits
 from .file_utils import cached_path
 
 logger = logging.getLogger(__name__)
@@ -50,25 +51,26 @@ class TransfoXLConfig(object):
     def __init__(self,
                  vocab_size_or_config_json_file=267735,
                  cutoffs=[20000, 40000, 200000],
-                 d_model=410,
-                 d_embed=410,
-                 d_head=41,
-                 d_inner=2100,
-                 div_val=1.0,
+                 d_model=1024,
+                 d_embed=1024,
+                 n_head=16,
+                 d_head=64,
+                 d_inner=4096,
+                 div_val=4,
                  pre_lnorm=False,
-                 n_layer=16,
-                 n_head=10,
-                 tgt_len=150,
+                 n_layer=18,
+                 tgt_len=256,
                  ext_len=0,
-                 mem_len=150,
+                 mem_len=256,
                  same_length=False,
                  attn_type=0,
                  clamp_len=-1,
                  sample_softmax=-1,
                  adaptive=True,
-                 tied=True,
+                 tie_weight=True,
                  dropout=0.1,
                  dropatt=0.0,
+                 untie_r=True,
                  init="normal",
                  init_range=0.01,
                  proj_init_std=0.01,
@@ -95,10 +97,11 @@ class TransfoXLConfig(object):
             clamp_len: use the same pos embeddings after clamp_len
             sample_softmax: number of samples in sampled softmax
             adaptive: use adaptive softmax
-            tied: tie the word embedding and softmax weights
+            tie_weight: tie the word embedding and softmax weights
             dropout: The dropout probabilitiy for all fully connected
                 layers in the embeddings, encoder, and pooler.
             dropatt: The dropout ratio for the attention probabilities.
+            untie_r: untie relative position biases
             embd_pdrop: The dropout ratio for the embeddings.
             init: parameter initializer to use
             init_range: parameters initialized by U(-init_range, init_range).
@@ -111,9 +114,10 @@ class TransfoXLConfig(object):
             for key, value in json_config.items():
                 self.__dict__[key] = value
         elif isinstance(vocab_size_or_config_json_file, int):
-            self.vocab_size = vocab_size_or_config_json_file
+            self.n_token = vocab_size_or_config_json_file
             self.cutoffs = []
             self.cutoffs.extend(cutoffs)
+            self.tie_weight = tie_weight
             self.tie_projs = [False] + [True] * len(self.cutoffs)
             self.d_model = d_model
             self.d_embed = d_embed
@@ -131,9 +135,9 @@ class TransfoXLConfig(object):
             self.clamp_len = clamp_len
             self.sample_softmax = sample_softmax
             self.adaptive = adaptive
-            self.tied = tied
             self.dropout = dropout
             self.dropatt = dropatt
+            self.untie_r = untie_r
             self.init = init
             self.init_range = init_range
             self.proj_init_std = proj_init_std
@@ -142,10 +146,6 @@ class TransfoXLConfig(object):
             raise ValueError("First argument must be either a vocabulary size (int)"
                              "or the path to a pretrained model config file (str)")
 
-    @property
-    def total_num_embeddings(self):
-        return self.vocab_size + self.n_special + self.n_ctx
-
     @classmethod
     def from_dict(cls, json_object):
         """Constructs a `TransfoXLConfig` from a Python dictionary of parameters."""
@@ -230,7 +230,7 @@ class PositionwiseFF(nn.Module):
 
 class MultiHeadAttn(nn.Module):
     def __init__(self, n_head, d_model, d_head, dropout, dropatt=0, 
-                 pre_lnorm=False):
+                 pre_lnorm=False, r_r_bias=None, r_w_bias=None):
         super(MultiHeadAttn, self).__init__()
 
         self.n_head = n_head
@@ -251,6 +251,13 @@ class MultiHeadAttn(nn.Module):
 
         self.pre_lnorm = pre_lnorm
 
+        if r_r_bias is None or r_w_bias is None: # Biases are not shared
+            self.r_r_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head))
+            self.r_w_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head))
+        else:
+            self.r_r_bias = r_r_bias
+            self.r_w_bias = r_w_bias
+
     def forward(self, h, attn_mask=None, mems=None):
         ##### multihead attention
         # [hlen x bsz x n_head x d_head]
@@ -304,7 +311,8 @@ class MultiHeadAttn(nn.Module):
 
 class RelMultiHeadAttn(nn.Module):
     def __init__(self, n_head, d_model, d_head, dropout, dropatt=0,
-                 tgt_len=None, ext_len=None, mem_len=None, pre_lnorm=False):
+                 tgt_len=None, ext_len=None, mem_len=None, pre_lnorm=False,
+                 r_r_bias=None, r_w_bias=None):
         super(RelMultiHeadAttn, self).__init__()
 
         self.n_head = n_head
@@ -324,6 +332,13 @@ class RelMultiHeadAttn(nn.Module):
 
         self.pre_lnorm = pre_lnorm
 
+        if r_r_bias is None or r_w_bias is None: # Biases are not shared
+            self.r_r_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head))
+            self.r_w_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head))
+        else:
+            self.r_r_bias = r_r_bias
+            self.r_w_bias = r_w_bias
+
     def _parallelogram_mask(self, h, w, left=False):
         mask = torch.ones((h, w)).byte()
         m = min(h, w)
@@ -377,7 +392,7 @@ class RelPartialLearnableMultiHeadAttn(RelMultiHeadAttn):
 
         self.r_net = nn.Linear(self.d_model, self.n_head * self.d_head, bias=False)
 
-    def forward(self, w, r, r_w_bias, r_r_bias, attn_mask=None, mems=None):
+    def forward(self, w, r, attn_mask=None, mems=None):
         qlen, rlen, bsz = w.size(0), r.size(0), w.size(1)
 
         if mems is not None:
@@ -408,10 +423,10 @@ class RelPartialLearnableMultiHeadAttn(RelMultiHeadAttn):
         r_head_k = r_head_k.view(rlen, self.n_head, self.d_head)                # qlen x n_head x d_head
 
         #### compute attention score
-        rw_head_q = w_head_q + r_w_bias                                         # qlen x bsz x n_head x d_head
+        rw_head_q = w_head_q + self.r_w_bias                                         # qlen x bsz x n_head x d_head
         AC = torch.einsum('ibnd,jbnd->ijbn', (rw_head_q, w_head_k))             # qlen x klen x bsz x n_head
 
-        rr_head_q = w_head_q + r_r_bias
+        rr_head_q = w_head_q + self.r_r_bias
         BD = torch.einsum('ibnd,jnd->ijbn', (rr_head_q, r_head_k))              # qlen x klen x bsz x n_head
         BD = self._rel_shift(BD)
 
@@ -582,9 +597,9 @@ class RelPartialLearnableDecoderLayer(nn.Module):
         self.pos_ff = PositionwiseFF(d_model, d_inner, dropout, 
                                      pre_lnorm=kwargs.get('pre_lnorm'))
 
-    def forward(self, dec_inp, r, r_w_bias, r_r_bias, dec_attn_mask=None, mems=None):
+    def forward(self, dec_inp, r, dec_attn_mask=None, mems=None):
 
-        output = self.dec_attn(dec_inp, r, r_w_bias, r_r_bias,
+        output = self.dec_attn(dec_inp, r,
                                attn_mask=dec_attn_mask,
                                mems=mems)
         output = self.pos_ff(output)
@@ -659,9 +674,9 @@ class MemTransformerLM(nn.Module):
                  dropout, dropatt, tie_weight=True, d_embed=None, 
                  div_val=1, tie_projs=[False], pre_lnorm=False,
                  tgt_len=None, ext_len=None, mem_len=None, 
-                 cutoffs=[], adapt_inp=False,
+                 cutoffs=[], adapt_inp=False, untie_r=False,
                  same_length=False, attn_type=0, clamp_len=-1, 
-                 sample_softmax=-1):
+                 sample_softmax=-1, **kwargs):
         super(MemTransformerLM, self).__init__()
         self.n_token = n_token
 
@@ -685,6 +700,10 @@ class MemTransformerLM(nn.Module):
 
         self.attn_type = attn_type
 
+        if not untie_r:
+            self.r_w_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head))
+            self.r_r_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head))
+
         self.layers = nn.ModuleList()
         if attn_type == 0: # the default attention
             for i in range(n_layer):
@@ -692,7 +711,9 @@ class MemTransformerLM(nn.Module):
                     RelPartialLearnableDecoderLayer(
                         n_head, d_model, d_head, d_inner, dropout,
                         tgt_len=tgt_len, ext_len=ext_len, mem_len=mem_len,
-                        dropatt=dropatt, pre_lnorm=pre_lnorm)
+                        dropatt=dropatt, pre_lnorm=pre_lnorm,
+                        r_w_bias=None if untie_r else self.r_w_bias,
+                        r_r_bias=None if untie_r else self.r_r_bias)
                 )
         elif attn_type == 1: # learnable embeddings
             for i in range(n_layer):
@@ -700,14 +721,18 @@ class MemTransformerLM(nn.Module):
                     RelLearnableDecoderLayer(
                         n_head, d_model, d_head, d_inner, dropout,
                         tgt_len=tgt_len, ext_len=ext_len, mem_len=mem_len,
-                        dropatt=dropatt, pre_lnorm=pre_lnorm)
+                        dropatt=dropatt, pre_lnorm=pre_lnorm,
+                        r_w_bias=None if untie_r else self.r_w_bias,
+                        r_r_bias=None if untie_r else self.r_r_bias)
                 )
         elif attn_type in [2, 3]: # absolute embeddings
             for i in range(n_layer):
                 self.layers.append(
                     DecoderLayer(
                         n_head, d_model, d_head, d_inner, dropout,
-                        dropatt=dropatt, pre_lnorm=pre_lnorm)
+                        dropatt=dropatt, pre_lnorm=pre_lnorm,
+                        r_w_bias=None if untie_r else self.r_w_bias,
+                        r_r_bias=None if untie_r else self.r_r_bias)
                 )
 
         self.sample_softmax = sample_softmax
@@ -738,21 +763,11 @@ class MemTransformerLM(nn.Module):
         self.same_length = same_length
         self.clamp_len = clamp_len
 
-        self._create_params()
-
-    def backward_compatible(self):
-        self.sample_softmax = -1
-
-    def _create_params(self):
         if self.attn_type == 0: # default attention
             self.pos_emb = PositionalEmbedding(self.d_model)
-            self.r_w_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head))
-            self.r_r_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head))
         elif self.attn_type == 1: # learnable
             self.r_emb = nn.Parameter(torch.Tensor(
                     self.n_layer, self.max_klen, self.n_head, self.d_head))
-            self.r_w_bias = nn.Parameter(torch.Tensor(
-                    self.n_layer, self.n_head, self.d_head))
             self.r_bias = nn.Parameter(torch.Tensor(
                     self.n_layer, self.max_klen, self.n_head))
         elif self.attn_type == 2: # absolute standard
@@ -761,6 +776,10 @@ class MemTransformerLM(nn.Module):
             self.r_emb = nn.Parameter(torch.Tensor(
                     self.n_layer, self.max_klen, self.n_head, self.d_head))
 
+    def backward_compatible(self):
+        self.sample_softmax = -1
+
+
     def reset_length(self, tgt_len, ext_len, mem_len):
         self.tgt_len = tgt_len
         self.mem_len = mem_len
@@ -937,13 +956,13 @@ class TransfoXLPreTrainedModel(nn.Module):
                 ))
         self.config = config
 
-    def init_weight(weight):
+    def init_weight(self, weight):
         if self.config.init == 'uniform':
             nn.init.uniform_(weight, -self.config.init_range, self.config.init_range)
         elif self.config.init == 'normal':
             nn.init.normal_(weight, 0.0, self.config.init_std)
 
-    def init_bias(bias):
+    def init_bias(self, bias):
         nn.init.constant_(bias, 0.0)
 
     def init_weights(self, m):
@@ -1100,89 +1119,11 @@ class TransfoXLPreTrainedModel(nn.Module):
         return model
 
 
-
-
-
-
-###################
-
-
-
-
-class TransfoXLLMHead(nn.Module):
-    """ Language Model Head for the transformer """
-
-    def __init__(self, model_embeddings_weights, config):
-        super(TransfoXLLMHead, self).__init__()
-        self.n_embd = config.n_embd
-        self.set_embeddings_weights(model_embeddings_weights)
-
-    def set_embeddings_weights(self, model_embeddings_weights):
-        embed_shape = model_embeddings_weights.shape
-        self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False)
-        self.decoder.weight = model_embeddings_weights # Tied weights
-
-    def forward(self, hidden_state):
-        # Truncated Language modeling logits (we remove the last token)
-        # h_trunc = h[:, :-1].contiguous().view(-1, self.n_embd)
-        lm_logits = self.decoder(hidden_state)
-        return lm_logits
-
-
-class TransfoXLMultipleChoiceHead(nn.Module):
-    """ Classifier Head for the transformer """
-
-    def __init__(self, config):
-        super(TransfoXLMultipleChoiceHead, self).__init__()
-        self.n_embd = config.n_embd
-        # self.multiple_choice_token = multiple_choice_token
-        self.dropout = nn.Dropout2d(config.resid_pdrop)  # To reproduce the noise_shape parameter of TF implementation
-        self.linear = nn.Linear(config.n_embd, 1)
-
-        nn.init.normal_(self.linear.weight, std = 0.02)
-        nn.init.normal_(self.linear.bias, 0)
-
-    def forward(self, hidden_states, multiple_choice_token_mask):
-        # Classification logits
-        # hidden_states = hidden_states.view(-1, self.n_embd)
-        # multiple_choice_token_mask = multiple_choice_token_mask.view(-1, 1).expand_as(hidden_states)
-        multiple_choice_h = hidden_states * multiple_choice_token_mask.unsqueeze(-1)
-        multiple_choice_h = multiple_choice_h.sum(dim=-2)
-        # flat = x[..., 0].contiguous().view(-1)
-        # multiple_choice_h = multiple_choice_h[flat == self.multiple_choice_token, :]
-        # multiple_choice_h = multiple_choice_h.view(-1, x.size(1), self.n_embd, 1)
-        # # This double transposition is there to replicate the behavior
-        # # of the noise_shape argument in the tensorflow
-        # # implementation.  For more details, see
-        # # https://github.com/huggingface/pytorch-openai-transformer-lm/issues/11
-        # multiple_choice_h = self.dropout(multiple_choice_h.transpose(1, 2)).transpose(1, 2)
-        # multiple_choice_h = multiple_choice_h.contiguous().view(-1, self.n_embd)
-        multiple_choice_logits = self.linear(multiple_choice_h).squeeze(-1)
-        return multiple_choice_logits
-
-
 class TransfoXLModel(TransfoXLPreTrainedModel):
-    """OpenAI GPT model ("Improving Language Understanding by Generative Pre-Training").
-
-    The main implementation difference between BERT and the OpenAI is the use, in OpenAI GPT, of a single embedding matrix
-    to store the word, special ([SEP], [CLS]...) and position embeddings.
-    The embeddings are ordered as follow in the word embeddings matrice:
-        [0,                                                         ----------------------
-         ...                                                        -> word embeddings
-         config.vocab_size - 1,                                     ______________________
-         config.vocab_size,
-         ...                                                        -> special embeddings
-         config.vocab_size + config.n_special - 1,                  ______________________
-         config.vocab_size + config.n_special,
-         ...                                                        -> position embeddings
-         total_num_embeddings - 1]                                  ______________________
-
-    where total_num_embeddings can be obtained as config.total_num_embeddings and is:
-        total_num_embeddings = config.vocab_size + config.n_special + config.n_ctx
-    You should use the associate indices to index the embeddings.
-
-    The special embeddings ([SEP], [CLS]...) are not pre-trained and need to be trained during the fine-tuning if you use them.
-    The number of special embeddings can be controled using the `set_num_special_tokens(num_special_tokens)` function.
+    """ Transformer XL model
+        From "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context"
+        by Zihang Dai*, Zhilin Yang*, Yiming Yang, William W. Cohen, Jaime Carbonell,
+           Quoc V. Le, Ruslan Salakhutdinov (*: equal contribution)
 
     Params:
         config: a TransfoXLConfig class instance with the configuration to build a new model
@@ -1214,219 +1155,8 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
     """
     def __init__(self, config):
         super(TransfoXLModel, self).__init__(config)
-        total_embeddings_size = config.vocab_size + config.n_special + config.n_ctx
-        self.embed = nn.Embedding(total_embeddings_size, config.n_embd)
-        self.drop = nn.Dropout(config.embd_pdrop)
-        block = Block(config.n_ctx, config, scale=True)
-        self.h = nn.ModuleList([copy.deepcopy(block) for _ in range(config.n_layer)])
-
+        self.transformer = MemTransformerLM(**config.to_dict())
         self.apply(self.init_weights)
-        # nn.init.normal_(self.embed.weight, std=0.02)
-
-    def set_num_special_tokens(self, num_special_tokens):
-        " Update input embeddings with new embedding matrice "
-        # Update config
-        self.config.n_special = num_special_tokens
-        # # Build new embeddings and initialize
-        old_embed = self.embed
-        self.embed = nn.Embedding(self.config.total_num_embeddings, self.config.n_embd)
-        # Initialize all new embeddings (in particular the special tokens)
-        self.init_weights(self.embed)
-        # Copy word and positional embeddings from the previous weights
-        self.embed.weight.data[:self.config.vocab_size, :] = old_embed.weight.data[:self.config.vocab_size, :]
-        self.embed.weight.data[-self.config.n_ctx:, :] = old_embed.weight.data[-self.config.n_ctx:, :]
 
     def forward(self, input_ids, position_ids=None, token_type_ids=None):
-        if position_ids is None:
-            start = self.config.vocab_size + self.config.n_special
-            end = start + input_ids.size(-1)
-            position_ids = torch.arange(start, end, dtype=torch.long, device=input_ids.device)
-            position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
-
-        input_shape = input_ids.size()
-        input_ids = input_ids.view(-1, input_ids.size(-1))
-        position_ids = position_ids.view(-1, position_ids.size(-1))
-
-        inputs_embeds = self.embed(input_ids)
-        position_embeds = self.embed(position_ids)
-        if token_type_ids is not None:
-            token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1))
-            token_type_embeds = self.embed(token_type_ids)
-        else:
-            token_type_embeds = 0
-        # Add the position information to the input embeddings
-        # h = e.sum(dim=2)
-        hidden_states = inputs_embeds + position_embeds + token_type_embeds
-        for block in self.h:
-            hidden_states = block(hidden_states)
-        return hidden_states.view(*input_shape, hidden_states.size(-1))
-
-class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
-    """OpenAI GPT model with a Language Modeling head ("Improving Language Understanding by Generative Pre-Training").
-
-    There are two main implementation differences between BERT and the OpenAI GPT:
-        - the use of an LM loss in OpenAI GPT which means the Transformer is trained to predict the NEXT token for each input token
-            vs. predict the SAME token for BERT (i.e. you need to shift your labels to the right)
-        - the use, in OpenAI GPT, of a single embedding matrix to store the word, special ([SEP], [CLS]...) and position embeddings.
-    The embeddings are ordered as follow in the word embeddings matrice:
-        [0,                                                         ----------------------
-         ...                                                        -> word embeddings
-         config.vocab_size - 1,                                     ______________________
-         config.vocab_size,
-         ...                                                        -> special embeddings
-         config.vocab_size + config.n_special - 1,                  ______________________
-         config.vocab_size + config.n_special,
-         ...                                                        -> position embeddings
-         total_num_embeddings - 1]                                  ______________________
-
-    where total_num_embeddings can be obtained as config.total_num_embeddings and is:
-        total_num_embeddings = config.vocab_size + config.n_special + config.n_ctx
-    You should use these indices to index the word, special and position embeddings.
-
-    The special embeddings ([SEP], [CLS]...) are not pre-trained and need to be trained during the fine-tuning if you use them.
-    The number of special embeddings can be controled using the `set_num_special_tokens(num_special_tokens)` function.
-
-    Params:
-        config: a TransfoXLConfig class instance with the configuration to build a new model
-
-    Inputs:
-        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
-            were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, config.vocab_size[
-        `position_ids`: an optional torch.LongTensor with the same shape as input_ids
-            with the position indices (selected in the range [config.vocab_size + config.n_special, config.vocab_size + config.n_special + config.n_ctx - 1[.
-        `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
-            You can use it to add a third embedding (the previous two being the word and position embeddings)
-            to each token in the sentence.
-        `lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
-            with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
-            is only computed for the labels set in [0, ..., vocab_size]
-
-    Outputs:
-        if `lm_labels` is not `None`:
-            Outputs the language modeling loss.
-        else:
-            `lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, sequence_length, total_num_embeddings]
-                (or more generally [d_1, ..., d_n, total_num_embeddings] were d_1 ... d_n are the dimension of input_ids)
-
-    Example usage:
-    ```python
-    # Already been converted into BPE token ids
-    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-
-    config = modeling_transfo_xl.TransfoXLConfig()
-
-    model = modeling_transfo_xl.TransfoXLLMHeadModel(config)
-    lm_logits = model(input_ids)
-    ```
-    """
-    def __init__(self, config):
-        super(TransfoXLLMHeadModel, self).__init__(config)
-        self.transformer = TransfoXLModel(config)
-        self.lm_head = TransfoXLLMHead(self.transformer.embed.weight, config)
-        self.apply(self.init_weights)
-
-    def set_num_special_tokens(self, num_special_tokens):
-        " Update input and output embeddings with new embedding matrice "
-        self.transformer.set_num_special_tokens(num_special_tokens)
-        self.lm_head.set_embeddings_weights(self.transformer.embed.weight)
-
-    def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None):
-        hidden_states = self.transformer(input_ids, position_ids, token_type_ids)
-        lm_logits = self.lm_head(hidden_states)
-        if lm_labels is not None:
-            loss_fct = CrossEntropyLoss(ignore_index=-1)
-            loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1))
-            return loss
-        return lm_logits
-
-class TransfoXLDoubleHeadsModel(TransfoXLPreTrainedModel):
-    """OpenAI GPT model with a Language Modeling and a Multiple Choice heads ("Improving Language Understanding by Generative Pre-Training").
-
-    There are two main implementation differences between BERT and the OpenAI GPT:
-        - the use of an LM loss in OpenAI GPT which means the Transformer is trained to predict the NEXT token for each input token
-            vs. predict the SAME token for BERT (i.e. you need to shift your labels to the right)
-        - the use, in OpenAI GPT, of a single embedding matrix to store the word, special ([SEP], [CLS]...) and position embeddings.
-    The embeddings are ordered as follow in the word embeddings matrice:
-        [0,                                                         ----------------------
-         ...                                                        -> word embeddings
-         config.vocab_size - 1,                                     ______________________
-         config.vocab_size,
-         ...                                                        -> special embeddings
-         config.vocab_size + config.n_special - 1,                  ______________________
-         config.vocab_size + config.n_special,
-         ...                                                        -> position embeddings
-         total_num_embeddings - 1]                                  ______________________
-
-    where total_num_embeddings can be obtained as config.total_num_embeddings and is:
-        total_num_embeddings = config.vocab_size + config.n_special + config.n_ctx
-    You should use these indices to index the word, special and position embeddings.
-
-    The special embeddings ([SEP], [CLS]...) are not pre-trained and need to be trained during the fine-tuning if you use them.
-    The number of special embeddings can be controled using the `set_num_special_tokens(num_special_tokens)` function.
-
-    Params:
-        config: a TransfoXLConfig class instance with the configuration to build a new model
-
-    Inputs:
-        `input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length]
-            with the word BPE token indices selected in the range [0, config.vocab_size[
-        `multiple_choice_token_mask`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length]
-            with a value of 1 were the last hidden state is (usually the [CLS] token) and 0 otherwise.
-        `position_ids`: an optional torch.LongTensor with the same shape as input_ids
-            with the position indices (selected in the range [config.vocab_size + config.n_special,
-            config.vocab_size + config.n_special + config.n_ctx - 1[.
-        `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
-            You can use it to add a third embedding (the previous two being the word and position embeddings)
-            to each token in the sentence.
-        `lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, num_choices, sequence_length]
-            with indices selected in [-1, 0, ..., total_num_embeddings]. All labels set to -1 are ignored (masked), the loss
-            is only computed for the labels set in [0, ..., total_num_embeddings]
-        `multiple_choice_labels`: optional multiple choice labels: torch.LongTensor of shape [batch_size]
-            with indices selected in [0, ..., num_choices].
-
-    Outputs:
-        if `lm_labels` and `multiple_choice_labels` are not `None`:
-            Outputs a tuple of losses with the language modeling loss and the multiple choice loss.
-        else: a tuple with
-            `lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, num_choices, sequence_length, total_num_embeddings]
-            `multiple_choice_logits`: the multiple choice logits as a torch.FloatTensor of size [batch_size, num_choices]
-
-    Example usage:
-    ```python
-    # Already been converted into BPE token ids
-    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-    multiple_choice_token_mask = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
-
-    config = modeling_transfo_xl.TransfoXLConfig()
-
-    model = modeling_transfo_xl.TransfoXLLMHeadModel(config)
-    lm_logits, multiple_choice_logits = model(input_ids, multiple_choice_token_mask)
-    ```
-    """
-    def __init__(self, config):
-        super(TransfoXLDoubleHeadsModel, self).__init__(config)
-        self.transformer = TransfoXLModel(config)
-        self.lm_head = TransfoXLLMHead(self.transformer.embed.weight, config)
-        self.multiple_choice_head = TransfoXLMultipleChoiceHead(config)
-        self.apply(self.init_weights)
-
-    def set_num_special_tokens(self, num_special_tokens):
-        " Update input and output embeddings with new embedding matrice "
-        self.transformer.set_num_special_tokens(num_special_tokens)
-        self.lm_head.set_embeddings_weights(self.transformer.embed.weight)
-
-    def forward(self, input_ids, multiple_choice_token_mask, position_ids=None, token_type_ids=None,
-                lm_labels=None, multiple_choice_labels=None):
-        hidden_states = self.transformer(input_ids, position_ids, token_type_ids)
-        lm_logits = self.lm_head(hidden_states)
-        multiple_choice_logits = self.multiple_choice_head(hidden_states, multiple_choice_token_mask)
-        losses = []
-        if lm_labels is not None:
-            loss_fct = CrossEntropyLoss(ignore_index=-1)
-            losses.append(loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1)))
-        if multiple_choice_labels is not None:
-            loss_fct = CrossEntropyLoss()
-            losses.append(loss_fct(multiple_choice_logits, multiple_choice_labels.view(-1)))
-        if losses:
-            return losses
-        return lm_logits, multiple_choice_logits
+        return self.transformer(input_ids, position_ids, token_type_ids)

From a69ec2c7220382e568423e6fb88e43b588b4d73c Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 15 Jan 2019 23:17:46 +0100
Subject: [PATCH 10/82] improved corpus and tokenization conversion - added
 evaluation script

---
 examples/eval_transfo_xl.py                   | 151 ++++++++++++++++
 pytorch_pretrained_bert/__init__.py           |   2 +
 ...onvert_transfo_xl_checkpoint_to_pytorch.py | 167 +++++++++++-------
 .../modeling_transfo_xl.py                    |  40 +++--
 .../tokenization_transfo_xl.py                | 103 +++++++----
 5 files changed, 344 insertions(+), 119 deletions(-)
 create mode 100644 examples/eval_transfo_xl.py

diff --git a/examples/eval_transfo_xl.py b/examples/eval_transfo_xl.py
new file mode 100644
index 0000000000..886e826b2c
--- /dev/null
+++ b/examples/eval_transfo_xl.py
@@ -0,0 +1,151 @@
+# coding=utf-8
+# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch Transformer XL model evaluation script.
+    Adapted from https://github.com/kimiyoung/transformer-xl.
+    In particular https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/eval.py
+"""
+import os
+import sys
+import functools
+import argparse
+import time
+import math
+
+import torch
+
+from pytorch_pretrained_bert import TransfoXLModel, TransfoXLCorpus
+
+def logging(s, log_path, print_=True, log_=True):
+    if print_:
+        print(s)
+    if log_:
+        with open(log_path, 'a+') as f_log:
+            f_log.write(s + '\n')
+
+def get_logger(log_path, **kwargs):
+    return functools.partial(logging, log_path=log_path, **kwargs)
+
+parser = argparse.ArgumentParser(description='PyTorch Transformer Language Model')
+# parser.add_argument('--data', type=str, default='../data/wikitext-103',
+#                     help='location of the data corpus')
+parser.add_argument('--model_name', type=str, default='transfo-xl-wt103',
+                    choices=['transfo-xl-wt103'], #, 'lm1b', 'enwik8', 'text8'],
+                    help='pretrained model name')
+parser.add_argument('--split', type=str, default='all',
+                    choices=['all', 'valid', 'test'],
+                    help='which split to evaluate')
+parser.add_argument('--batch_size', type=int, default=10,
+                    help='batch size')
+parser.add_argument('--tgt_len', type=int, default=5,
+                    help='number of tokens to predict')
+parser.add_argument('--ext_len', type=int, default=0,
+                    help='length of the extended context')
+parser.add_argument('--mem_len', type=int, default=0,
+                    help='length of the retained previous heads')
+parser.add_argument('--clamp_len', type=int, default=-1,
+                    help='max positional embedding index')
+parser.add_argument('--cuda', action='store_true',
+                    help='use CUDA')
+parser.add_argument('--work_dir', type=str, required=True,
+                    help='path to the work_dir')
+parser.add_argument('--no_log', action='store_true',
+                    help='do not log the eval result')
+parser.add_argument('--same_length', action='store_true',
+                    help='set same length attention with masking')
+args = parser.parse_args()
+assert args.ext_len >= 0, 'extended context length must be non-negative'
+
+device = torch.device("cuda" if args.cuda else "cpu")
+
+# Get logger
+logging = get_logger(os.path.join(args.work_dir, 'log.txt'),
+                     log_=not args.no_log)
+
+# Load dataset
+corpus = TransfoXLCorpus.from_pretrained(args.model_name)
+ntokens = len(corpus.vocab)
+
+va_iter = corpus.get_iterator('valid', args.batch_size, args.tgt_len,
+    device=device, ext_len=args.ext_len)
+te_iter = corpus.get_iterator('test', args.batch_size, args.tgt_len,
+    device=device, ext_len=args.ext_len)
+
+# Load the best saved model.
+# with open(os.path.join(args.work_dir, 'model.pt'), 'rb') as f:
+#     model = torch.load(f)
+# model.backward_compatible()
+model = TransfoXLModel.from_pretrained(args.model_name)
+model = model.to(device)
+
+logging('Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}'.format(
+       args.batch_size, args.tgt_len, args.ext_len, args.mem_len, args.clamp_len))
+
+model.reset_length(args.tgt_len, args.ext_len, args.mem_len)
+if args.clamp_len > 0:
+    model.clamp_len = args.clamp_len
+if args.same_length:
+    model.same_length = True
+
+###############################################################################
+# Evaluation code
+###############################################################################
+def evaluate(eval_iter):
+    # Turn on evaluation mode which disables dropout.
+    model.eval()
+    total_len, total_loss = 0, 0.
+    start_time = time.time()
+    with torch.no_grad():
+        mems = tuple()
+        for idx, (data, target, seq_len) in enumerate(eval_iter):
+            ret = model(data, target, *mems)
+            loss, mems = ret[0], ret[1:]
+            loss = loss.mean()
+            total_loss += seq_len * loss.item()
+            total_len += seq_len
+        total_time = time.time() - start_time
+    logging('Time : {:.2f}s, {:.2f}ms/segment'.format(
+            total_time, 1000 * total_time / (idx+1)))
+    return total_loss / total_len
+
+# Run on test data.
+if args.split == 'all':
+    test_loss = evaluate(te_iter)
+    valid_loss = evaluate(va_iter)
+elif args.split == 'valid':
+    valid_loss = evaluate(va_iter)
+    test_loss = None
+elif args.split == 'test':
+    test_loss = evaluate(te_iter)
+    valid_loss = None
+
+def format_log(loss, split):
+    """Format `loss` for `split`: bpc for enwik8/text8 models, perplexity otherwise."""
+    # The parser defines no --dataset option, so `args.dataset` raised AttributeError;
+    # the corpus is identified from the pretrained model name instead.
+    if any(name in args.model_name for name in ('enwik8', 'text8')):
+        return '| {0} loss {1:5.2f} | {0} bpc {2:9.5f} '.format(split, loss, loss / math.log(2))
+    # Default metric: perplexity, i.e. exp of the average loss.
+    return '| {0} loss {1:5.2f} | {0} ppl {2:9.3f} '.format(split, loss, math.exp(loss))
+
+log_str = ''
+if valid_loss is not None:
+    log_str += format_log(valid_loss, 'valid')
+if test_loss is not None:
+    log_str += format_log(test_loss, 'test')
+
+logging('=' * 100)
+logging(log_str)
+logging('=' * 100)
diff --git a/pytorch_pretrained_bert/__init__.py b/pytorch_pretrained_bert/__init__.py
index 0a9e41266d..85f2422af6 100644
--- a/pytorch_pretrained_bert/__init__.py
+++ b/pytorch_pretrained_bert/__init__.py
@@ -1,12 +1,14 @@
 __version__ = "0.5.0"
 from .tokenization import BertTokenizer, BasicTokenizer, WordpieceTokenizer
 from .tokenization_openai import OpenAIGPTTokenizer
+from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus)
 from .modeling import (BertConfig, BertModel, BertForPreTraining,
                        BertForMaskedLM, BertForNextSentencePrediction,
                        BertForSequenceClassification, BertForMultipleChoice,
                        BertForTokenClassification, BertForQuestionAnswering)
 from .modeling_openai import (OpenAIGPTConfig, OpenAIGPTModel,
                               OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel)
+from .modeling_transfo_xl import (TransfoXLConfig, TransfoXLModel)
 from .optimization import BertAdam
 from .optimization_openai import OpenAIAdam
 from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE
diff --git a/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py b/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py
index 861c26280d..6962481adc 100755
--- a/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py
+++ b/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py
@@ -12,23 +12,36 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Convert OpenAI GPT checkpoint."""
+"""Convert Transformer XL checkpoint and datasets."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
 import os
-import re
+import sys
 import argparse
+import pickle
+
 import tensorflow as tf
 import torch
 import numpy as np
 
 from pytorch_pretrained_bert.modeling_transfo_xl import TransfoXLConfig, TransfoXLModel, CONFIG_NAME, WEIGHTS_NAME
+from pytorch_pretrained_bert.tokenization_transfo_xl import VOCAB_NAME, CORPUS_NAME
+
+# We do this to be able to load the python 2 datasets pickles
+# See e.g. https://stackoverflow.com/questions/2121874/python-pickling-after-changing-a-modules-directory/2121918#2121918
+import pytorch_pretrained_bert.tokenization_transfo_xl as data_utils
+data_utils.Vocab = data_utils.TransfoXLTokenizer
+data_utils.Corpus = data_utils.TransfoXLCorpus
+sys.modules['data_utils'] = data_utils
+sys.modules['vocabulary'] = data_utils
 
 def build_tf_to_pytorch_map(model, config):
-    """ A map of modules from TF to PyTorch """
+    """ A map of modules from TF to PyTorch.
+        A name-to-parameter map is used so the PyTorch model can stay as close as possible to the original PyTorch implementation.
+    """
     tf_to_pt_map = {}
     # Embeddings cutoffs
     for i, (embed_l, proj_l) in enumerate(zip(model.word_emb.emb_layers, model.word_emb.emb_projs)):
@@ -95,88 +108,108 @@ def build_tf_to_pytorch_map(model, config):
 
 def convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path,
                                              transfo_xl_config_file,
-                                             pytorch_dump_folder_path):
-    config_path = os.path.abspath(transfo_xl_config_file)
-    tf_path = os.path.abspath(tf_checkpoint_path)
+                                             pytorch_dump_folder_path,
+                                             transfo_xl_dataset_file):
+    if transfo_xl_dataset_file:
+        with open(transfo_xl_dataset_file, "rb") as fp:
+            corpus = pickle.load(fp, encoding="latin1")
+        # Save vocabulary and dataset cache as Dictionaries (should be better than pickles for the long-term)
+        pytorch_vocab_dump_path = pytorch_dump_folder_path + '/' + VOCAB_NAME
+        print("Save vocabulary to {}".format(pytorch_vocab_dump_path))
+        torch.save(corpus.vocab.__dict__, pytorch_vocab_dump_path)
 
-    print("Converting Transformer XL checkpoint from {} with config at {}".format(tf_path, config_path))
-    # Initialise PyTorch model
-    # Construct model
-    if transfo_xl_config_file == "":
-        config = TransfoXLConfig()
-    else:
-        config = TransfoXLConfig(transfo_xl_config_file)
-    print("Building PyTorch model from configuration: {}".format(str(config)))
-    model = TransfoXLModel(config)
+        corpus_dict_no_vocab = corpus.__dict__
+        corpus_dict_no_vocab.pop('vocab', None)
+        pytorch_dataset_dump_path = pytorch_dump_folder_path + '/' + CORPUS_NAME
+        print("Save dataset to {}".format(pytorch_dataset_dump_path))
+        torch.save(corpus_dict_no_vocab, pytorch_dataset_dump_path)
 
-    # Build TF to PyTorch weights loading map
-    tf_to_pt_map = build_tf_to_pytorch_map(model.transformer, config)
+    if tf_checkpoint_path:
+        config_path = os.path.abspath(transfo_xl_config_file)
+        tf_path = os.path.abspath(tf_checkpoint_path)
 
-    # Load weights from TF model
-    init_vars = tf.train.list_variables(tf_path)
-    tf_weights = {}
-    for name, shape in init_vars:
-        print("Loading TF weight {} with shape {}".format(name, shape))
-        array = tf.train.load_variable(tf_path, name)
-        tf_weights[name] = array
+        print("Converting Transformer XL checkpoint from {} with config at {}".format(tf_path, config_path))
+        # Initialise PyTorch model
+        # Construct model
+        if transfo_xl_config_file == "":
+            config = TransfoXLConfig()
+        else:
+            config = TransfoXLConfig(transfo_xl_config_file)
+        print("Building PyTorch model from configuration: {}".format(str(config)))
+        model = TransfoXLModel(config)
 
-    for name, pointer in tf_to_pt_map.items():
-        assert name in tf_weights
-        array = tf_weights[name]
-        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
-        # which are not required for using pretrained model
-        if 'kernel' in name or 'proj_W' in name:
-            array = np.transpose(array)
-        if ('r_r_bias' in name or 'r_w_bias' in name) and len(pointer) > 1:
-            # Here we will split the TF weigths
-            assert len(pointer) == array.shape[0]
-            for i, p_i in enumerate(pointer):
-                arr_i = array[i, ...]
-                try:
-                    assert p_i.shape == arr_i.shape
-                except AssertionError as e:
-                    e.args += (p_i.shape, arr_i.shape)
-                    raise
-                print("Initialize PyTorch weight {} for layer {}".format(name, i))
-                p_i.data = torch.from_numpy(arr_i)
-            continue
-        try:
-            assert pointer.shape == array.shape
-        except AssertionError as e:
-            e.args += (pointer.shape, array.shape)
-            raise
-        print("Initialize PyTorch weight {}".format(name))
-        pointer.data = torch.from_numpy(array)
+        # Build TF to PyTorch weights loading map
+        tf_to_pt_map = build_tf_to_pytorch_map(model.transformer, config)
 
-    # Save pytorch-model
-    pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME
-    pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME
-    print("Save PyTorch model to {}".format(pytorch_weights_dump_path))
-    torch.save(model.state_dict(), pytorch_weights_dump_path)
-    print("Save configuration file to {}".format(pytorch_config_dump_path))
-    with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
-        f.write(config.to_json_string())
+        # Load weights from TF model
+        init_vars = tf.train.list_variables(tf_path)
+        tf_weights = {}
+        for name, shape in init_vars:
+            print("Loading TF weight {} with shape {}".format(name, shape))
+            array = tf.train.load_variable(tf_path, name)
+            tf_weights[name] = array
+
+        for name, pointer in tf_to_pt_map.items():
+            assert name in tf_weights
+            array = tf_weights[name]
+            # Optimizer slots (adam_m / adam_v) are not part of tf_to_pt_map, so only model weights are visited here.
+            # Kernels and projection matrices are transposed so their shape matches the PyTorch parameter.
+            if 'kernel' in name or 'proj_W' in name:
+                array = np.transpose(array)
+            if ('r_r_bias' in name or 'r_w_bias' in name) and len(pointer) > 1:
+                # Here we will split the TF weights
+                assert len(pointer) == array.shape[0]
+                for i, p_i in enumerate(pointer):
+                    arr_i = array[i, ...]
+                    try:
+                        assert p_i.shape == arr_i.shape
+                    except AssertionError as e:
+                        e.args += (p_i.shape, arr_i.shape)
+                        raise
+                    print("Initialize PyTorch weight {} for layer {}".format(name, i))
+                    p_i.data = torch.from_numpy(arr_i)
+                continue
+            try:
+                assert pointer.shape == array.shape
+            except AssertionError as e:
+                e.args += (pointer.shape, array.shape)
+                raise
+            print("Initialize PyTorch weight {}".format(name))
+            pointer.data = torch.from_numpy(array)
+
+        # Save pytorch-model
+        pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME
+        pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME
+        print("Save PyTorch model to {}".format(pytorch_weights_dump_path))
+        torch.save(model.state_dict(), pytorch_weights_dump_path)
+        print("Save configuration file to {}".format(pytorch_config_dump_path))
+        with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
+            f.write(config.to_json_string())
 
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     ## Required parameters
-    parser.add_argument("--tf_checkpoint_path",
-                        default = None,
-                        type = str,
-                        required = True,
-                        help = "Path the TensorFlow checkpoint path.")
     parser.add_argument("--pytorch_dump_folder_path",
                         default = None,
                         type = str,
                         required = True,
-                        help = "Path to the output PyTorch model.")
+                        help = "Path to the folder to store the PyTorch model or dataset/vocab.")
+    parser.add_argument("--tf_checkpoint_path",
+                        default = "",
+                        type = str,
+                        help = "An optional path to a TensorFlow checkpoint path to be converted.")
     parser.add_argument("--transfo_xl_config_file",
                         default = "",
                         type = str,
-                        help = "The config json file corresponding to the pre-trained BERT model. \n"
+                        help = "An optional config json file corresponding to the pre-trained BERT model. \n"
                             "This specifies the model architecture.")
+    parser.add_argument("--transfo_xl_dataset_file",
+                        default = "",
+                        type = str,
+                        help = "An optional dataset file to be converted in a vocabulary.")
     args = parser.parse_args()
     convert_transfo_xl_checkpoint_to_pytorch(args.tf_checkpoint_path,
                                      args.transfo_xl_config_file,
-                                     args.pytorch_dump_folder_path)
+                                     args.pytorch_dump_folder_path,
+                                     args.transfo_xl_dataset_file)
diff --git a/pytorch_pretrained_bert/modeling_transfo_xl.py b/pytorch_pretrained_bert/modeling_transfo_xl.py
index a7a9ca2e5b..5b80f045a4 100644
--- a/pytorch_pretrained_bert/modeling_transfo_xl.py
+++ b/pytorch_pretrained_bert/modeling_transfo_xl.py
@@ -14,7 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """ PyTorch Transformer XL model.
-    Directly adapted from https://github.com/kimiyoung/transformer-xl.
+    Adapted from https://github.com/kimiyoung/transformer-xl.
     In particular https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/mem_transformer.py
 """
 
@@ -40,7 +40,7 @@ from .file_utils import cached_path
 logger = logging.getLogger(__name__)
 
 PRETRAINED_MODEL_ARCHIVE_MAP = {
-    'transfo-xl': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl.tar.gz",
+    'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103.tar.gz",
 }
 CONFIG_NAME = 'transfo_xl_config.json'
 WEIGHTS_NAME = 'pytorch_model.bin'
@@ -59,12 +59,13 @@ class TransfoXLConfig(object):
                  div_val=4,
                  pre_lnorm=False,
                  n_layer=18,
-                 tgt_len=256,
+                 tgt_len=128,
                  ext_len=0,
-                 mem_len=256,
-                 same_length=False,
+                 mem_len=1600,
+                 clamp_len=1000,
+                 same_length=True,
+                 proj_share_all_but_first=True,
                  attn_type=0,
-                 clamp_len=-1,
                  sample_softmax=-1,
                  adaptive=True,
                  tie_weight=True,
@@ -93,6 +94,7 @@ class TransfoXLConfig(object):
             ext_len: length of the extended context
             mem_len: length of the retained previous heads
             same_length: use the same attn length for all tokens
+            proj_share_all_but_first: True to share all but first projs, False not to share.
             attn_type: attention type. 0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al.
             clamp_len: use the same pos embeddings after clamp_len
             sample_softmax: number of samples in sampled softmax
@@ -118,7 +120,10 @@ class TransfoXLConfig(object):
             self.cutoffs = []
             self.cutoffs.extend(cutoffs)
             self.tie_weight = tie_weight
-            self.tie_projs = [False] + [True] * len(self.cutoffs)
+            if proj_share_all_but_first:
+                self.tie_projs = [False] + [True] * len(self.cutoffs)
+            else:
+                self.tie_projs = [False] + [False] * len(self.cutoffs)
             self.d_model = d_model
             self.d_embed = d_embed
             self.d_head = d_head
@@ -423,7 +428,7 @@ class RelPartialLearnableMultiHeadAttn(RelMultiHeadAttn):
         r_head_k = r_head_k.view(rlen, self.n_head, self.d_head)                # qlen x n_head x d_head
 
         #### compute attention score
-        rw_head_q = w_head_q + self.r_w_bias                                         # qlen x bsz x n_head x d_head
+        rw_head_q = w_head_q + self.r_w_bias                                    # qlen x bsz x n_head x d_head
         AC = torch.einsum('ibnd,jbnd->ijbn', (rw_head_q, w_head_k))             # qlen x klen x bsz x n_head
 
         rr_head_q = w_head_q + self.r_r_bias
@@ -915,21 +920,25 @@ class MemTransformerLM(nn.Module):
 
         return core_out, new_mems
 
-    def forward(self, data, target, *mems):
+    def forward(self, data, target=None, *mems):
         # nn.DataParallel does not allow size(0) tensors to be broadcasted.
         # So, have to initialize size(0) mems inside the model forward.
         # Moreover, have to return new_mems to allow nn.DataParallel to piece
         # them together.
         if not mems: mems = self.init_mems()
 
-        tgt_len = target.size(0)
         hidden, new_mems = self._forward(data, mems=mems)
+        if target is None:
+            if new_mems is None:
+                return [hidden]
+            else:
+                return [hidden] + new_mems
 
+        tgt_len = target.size(0)
         pred_hid = hidden[-tgt_len:]
         if self.sample_softmax > 0 and self.training:
             assert self.tie_weight
-            logit = sample_logits(self.word_emb,
-                self.out_layer.bias, target, pred_hid, self.sampler)
+            logit = sample_logits(self.word_emb, self.out_layer.bias, target, pred_hid, self.sampler)
             loss = -F.log_softmax(logit, -1)[:, :, 0]
         else:
             loss = self.crit(pred_hid.view(-1, pred_hid.size(-1)), target.view(-1))
@@ -1010,7 +1019,7 @@ class TransfoXLPreTrainedModel(nn.Module):
         pass
 
     @classmethod
-    def from_pretrained(cls, pretrained_model_name, num_special_tokens=0, state_dict=None, cache_dir=None,
+    def from_pretrained(cls, pretrained_model_name, state_dict=None, cache_dir=None,
                         *inputs, **kwargs):
         """
         Instantiate a TransfoXLPreTrainedModel from a pre-trained model file or a pytorch state dict.
@@ -1100,7 +1109,7 @@ class TransfoXLPreTrainedModel(nn.Module):
             for name, child in module._modules.items():
                 if child is not None:
                     load(child, prefix + name + '.')
-        load(model.transformer if hasattr(model, 'transformer') else model, prefix='')
+        # load(model.transformer if hasattr(model, 'transformer') else model, prefix='')
         if len(missing_keys) > 0:
             logger.info("Weights of {} not initialized from pretrained model: {}".format(
                 model.__class__.__name__, missing_keys))
@@ -1110,9 +1119,6 @@ class TransfoXLPreTrainedModel(nn.Module):
         if len(error_msgs) > 0:
             raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
                                model.__class__.__name__, "\n\t".join(error_msgs)))
-        # Add additional embeddings for special tokens if needed
-        if num_special_tokens != config.n_special:
-            model.set_num_special_tokens(num_special_tokens)
         if tempdir:
             # Clean up temp dir
             shutil.rmtree(tempdir)
diff --git a/pytorch_pretrained_bert/tokenization_transfo_xl.py b/pytorch_pretrained_bert/tokenization_transfo_xl.py
index 1d278abcb2..a411c267b9 100644
--- a/pytorch_pretrained_bert/tokenization_transfo_xl.py
+++ b/pytorch_pretrained_bert/tokenization_transfo_xl.py
@@ -14,15 +14,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """ Tokenization classes for Transformer XL model.
-    Directly adapted from https://github.com/kimiyoung/transformer-xl.
+    Adapted from https://github.com/kimiyoung/transformer-xl.
 """
 
 import os
-import re
-import json
-from tqdm import tqdm
+import glob
 import logging
 import pickle
+import torch
 from collections import Counter, OrderedDict
 
 from .file_utils import cached_path
@@ -30,16 +29,14 @@ from .file_utils import cached_path
 logger = logging.getLogger(__name__)
 
 PRETRAINED_VOCAB_ARCHIVE_MAP = {
-    'transfo-xl': "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json",
+    'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-vocab.bin",
 }
-PRETRAINED_MERGES_ARCHIVE_MAP = {
-    'openai-gpt': "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt",
+VOCAB_NAME = 'vocab.bin'
+
+PRETRAINED_CORPUS_ARCHIVE_MAP = {
+    'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-corpus.bin",
 }
-PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
-    'openai-gpt': 512,
-}
-VOCAB_NAME = 'vocab.json'
-MERGES_NAME = 'merges.txt'
+CORPUS_NAME = 'corpus.bin'
 
 class TransfoXLTokenizer(object):
     """
@@ -49,43 +46,36 @@ class TransfoXLTokenizer(object):
     def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
         """
         Instantiate a TransfoXLTokenizer.
-        Download and cache the vocabulary if needed.
+        Load the vocabulary from a saved vocabulary file, downloading and caching it if needed.
         """
         if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
             vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
-            merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name_or_path]
         else:
             vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)
-            merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME)
         # redirect to the cache, if necessary
         try:
             resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
-            resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir)
         except FileNotFoundError:
             logger.error(
                 "Model name '{}' was not found in model name list ({}). "
-                "We assumed '{}' was a path or url but couldn't find files {} and {} "
+                "We assumed '{}' was a path or url but couldn't find files {} "
                 "at this path or url.".format(
                     pretrained_model_name_or_path,
                     ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
                     pretrained_model_name_or_path,
-                    vocab_file, merges_file))
+                    vocab_file))
             return None
-        if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file:
+        if resolved_vocab_file == vocab_file:
             logger.info("loading vocabulary file {}".format(vocab_file))
-            logger.info("loading merges file {}".format(merges_file))
         else:
             logger.info("loading vocabulary file {} from cache at {}".format(
                 vocab_file, resolved_vocab_file))
-            logger.info("loading merges file {} from cache at {}".format(
-                merges_file, resolved_merges_file))
-        if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
-            # if we're using a pretrained model, ensure the tokenizer wont index sequences longer
-            # than the number of positional embeddings
-            max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
-            kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
+
         # Instantiate tokenizer.
-        tokenizer = cls(resolved_vocab_file, resolved_merges_file, *inputs, **kwargs)
+        tokenizer = cls(*inputs, **kwargs)
+        vocab_dict = torch.load(resolved_vocab_file)
+        for key, value in vocab_dict.items():
+            tokenizer.__dict__[key] = value
         return tokenizer
 
     def __init__(self, special=[], min_freq=0, max_size=None, lower_case=True,
@@ -418,10 +408,53 @@ class LMMultiFileIterator(LMShuffledIterator):
                 yield batch
 
 
-class Corpus(object):
-    def __init__(self, path, dataset, *args, **kwargs):
+class TransfoXLCorpus(object):
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
+        """
+        Instantiate a pre-processed corpus together with its tokenizer; returns None if the corpus file is not found.
+        """
+        vocab = TransfoXLTokenizer.from_pretrained(pretrained_model_name_or_path, cache_dir, *inputs, **kwargs)
+        if pretrained_model_name_or_path in PRETRAINED_CORPUS_ARCHIVE_MAP:
+            corpus_file = PRETRAINED_CORPUS_ARCHIVE_MAP[pretrained_model_name_or_path]
+        else:
+            corpus_file = os.path.join(pretrained_model_name_or_path, CORPUS_NAME)
+        # redirect to the cache, if necessary
+        try:
+            resolved_corpus_file = cached_path(corpus_file, cache_dir=cache_dir)
+        except FileNotFoundError:
+            logger.error(
+                "Model name '{}' was not found in model name list ({}). "
+                "We assumed '{}' was a path or url but couldn't find files {} "
+                "at this path or url.".format(
+                    pretrained_model_name_or_path,
+                    ', '.join(PRETRAINED_CORPUS_ARCHIVE_MAP.keys()),
+                    pretrained_model_name_or_path,
+                    corpus_file))
+            return None
+        if resolved_corpus_file == corpus_file:
+            logger.info("loading corpus file {}".format(corpus_file))
+        else:
+            logger.info("loading corpus file {} from cache at {}".format(
+                corpus_file, resolved_corpus_file))
+
+        # Instantiate the corpus, then restore its saved attributes from the dumped dict.
+        corpus = cls(*inputs, **kwargs)
+        corpus_dict = torch.load(resolved_corpus_file)
+        for key, value in corpus_dict.items():
+            corpus.__dict__[key] = value
+        corpus.vocab = vocab
+        return corpus
+
+    def __init__(self, *args, **kwargs):
+        self.vocab = TransfoXLTokenizer(*args, **kwargs)
+        self.dataset = None
+        self.train = None
+        self.valid = None
+        self.test = None
+
+    def build_corpus(self, path, dataset):
         self.dataset = dataset
-        self.vocab = Vocab(*args, **kwargs)
 
         if self.dataset in ['ptb', 'wt2', 'enwik8', 'text8']:
             self.vocab.count_file(os.path.join(path, 'train.txt'))
@@ -443,20 +476,20 @@ class Corpus(object):
                 os.path.join(path, 'train.txt'), ordered=True)
             self.valid = self.vocab.encode_file(
                 os.path.join(path, 'valid.txt'), ordered=True)
-            self.test  = self.vocab.encode_file(
+            self.test = self.vocab.encode_file(
                 os.path.join(path, 'test.txt'), ordered=True)
         elif self.dataset in ['enwik8', 'text8']:
             self.train = self.vocab.encode_file(
                 os.path.join(path, 'train.txt'), ordered=True, add_eos=False)
             self.valid = self.vocab.encode_file(
                 os.path.join(path, 'valid.txt'), ordered=True, add_eos=False)
-            self.test  = self.vocab.encode_file(
+            self.test = self.vocab.encode_file(
                 os.path.join(path, 'test.txt'), ordered=True, add_eos=False)
         elif self.dataset == 'lm1b':
             self.train = train_paths
             self.valid = self.vocab.encode_file(
                 os.path.join(path, 'valid.txt'), ordered=False, add_double_eos=True)
-            self.test  = self.vocab.encode_file(
+            self.test = self.vocab.encode_file(
                 os.path.join(path, 'test.txt'), ordered=False, add_double_eos=True)
 
     def get_iterator(self, split, *args, **kwargs):
@@ -502,7 +535,7 @@ def get_lm_corpus(datadir, dataset):
         elif dataset in ['enwik8', 'text8']:
             pass
 
-        corpus = Corpus(datadir, dataset, **kwargs)
+        corpus = TransfoXLCorpus(datadir, dataset, **kwargs)
         torch.save(corpus, fn)
 
     return corpus

From bcd4aa8fe03b4391c55a19e0c46fdaa08e179b64 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 15 Jan 2019 23:32:34 +0100
Subject: [PATCH 11/82] update evaluation example

---
 examples/eval_transfo_xl.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/eval_transfo_xl.py b/examples/eval_transfo_xl.py
index 886e826b2c..15c2665782 100644
--- a/examples/eval_transfo_xl.py
+++ b/examples/eval_transfo_xl.py
@@ -44,18 +44,18 @@ parser = argparse.ArgumentParser(description='PyTorch Transformer Language Model
 parser.add_argument('--model_name', type=str, default='transfo-xl-wt103',
                     choices=['transfo-xl-wt103'], #, 'lm1b', 'enwik8', 'text8'],
                     help='pretrained model name')
-parser.add_argument('--split', type=str, default='all',
+parser.add_argument('--split', type=str, default='test',
                     choices=['all', 'valid', 'test'],
                     help='which split to evaluate')
 parser.add_argument('--batch_size', type=int, default=10,
                     help='batch size')
-parser.add_argument('--tgt_len', type=int, default=5,
+parser.add_argument('--tgt_len', type=int, default=128,
                     help='number of tokens to predict')
 parser.add_argument('--ext_len', type=int, default=0,
                     help='length of the extended context')
-parser.add_argument('--mem_len', type=int, default=0,
+parser.add_argument('--mem_len', type=int, default=1600,
                     help='length of the retained previous heads')
-parser.add_argument('--clamp_len', type=int, default=-1,
+parser.add_argument('--clamp_len', type=int, default=1000,
                     help='max positional embedding index')
 parser.add_argument('--cuda', action='store_true',
                     help='use CUDA')

From 8831c6880390e84494b34fc14f938c8a1c9654eb Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Wed, 16 Jan 2019 10:31:16 +0100
Subject: [PATCH 12/82] fixing various parts of model conversion, loading and
 weights sharing

---
 examples/eval_transfo_xl.py                   |   2 +-
 ...onvert_transfo_xl_checkpoint_to_pytorch.py |   5 +-
 .../modeling_transfo_xl.py                    | 513 ++++++++----------
 .../tokenization_transfo_xl.py                |   6 +
 4 files changed, 243 insertions(+), 283 deletions(-)

diff --git a/examples/eval_transfo_xl.py b/examples/eval_transfo_xl.py
index 15c2665782..92979d1e4a 100644
--- a/examples/eval_transfo_xl.py
+++ b/examples/eval_transfo_xl.py
@@ -42,7 +42,7 @@ parser = argparse.ArgumentParser(description='PyTorch Transformer Language Model
 # parser.add_argument('--data', type=str, default='../data/wikitext-103',
 #                     help='location of the data corpus')
 parser.add_argument('--model_name', type=str, default='transfo-xl-wt103',
-                    choices=['transfo-xl-wt103'], #, 'lm1b', 'enwik8', 'text8'],
+                    # choices=['transfo-xl-wt103'], #, 'lm1b', 'enwik8', 'text8'],
                     help='pretrained model name')
 parser.add_argument('--split', type=str, default='test',
                     choices=['all', 'valid', 'test'],
diff --git a/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py b/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py
index 6962481adc..b2f8432d3a 100755
--- a/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py
+++ b/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py
@@ -116,7 +116,8 @@ def convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path,
         # Save vocabulary and dataset cache as Dictionaries (should be better than pickles for the long-term)
         pytorch_vocab_dump_path = pytorch_dump_folder_path + '/' + VOCAB_NAME
         print("Save vocabulary to {}".format(pytorch_vocab_dump_path))
-        torch.save(corpus.vocab.__dict__, pytorch_vocab_dump_path)
+        corpus_vocab_dict = corpus.vocab.__dict__
+        torch.save(corpus_vocab_dict, pytorch_vocab_dump_path)
 
         corpus_dict_no_vocab = corpus.__dict__
         corpus_dict_no_vocab.pop('vocab', None)
@@ -139,7 +140,7 @@ def convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path,
         model = TransfoXLModel(config)
 
         # Build TF to PyTorch weights loading map
-        tf_to_pt_map = build_tf_to_pytorch_map(model.transformer, config)
+        tf_to_pt_map = build_tf_to_pytorch_map(model, config)
 
         # Load weights from TF model
         init_vars = tf.train.list_variables(tf_path)
diff --git a/pytorch_pretrained_bert/modeling_transfo_xl.py b/pytorch_pretrained_bert/modeling_transfo_xl.py
index 5b80f045a4..0e1f3f8240 100644
--- a/pytorch_pretrained_bert/modeling_transfo_xl.py
+++ b/pytorch_pretrained_bert/modeling_transfo_xl.py
@@ -30,6 +30,7 @@ import collections
 
 import torch
 import torch.nn as nn
+import torch.nn.functional as F
 from torch.nn import CrossEntropyLoss
 from torch.nn.parameter import Parameter
 
@@ -40,7 +41,10 @@ from .file_utils import cached_path
 logger = logging.getLogger(__name__)
 
 PRETRAINED_MODEL_ARCHIVE_MAP = {
-    'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103.tar.gz",
+    'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-pytorch_model.bin",
+}
+PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-transfo_xl_config.json",
 }
 CONFIG_NAME = 'transfo_xl_config.json'
 WEIGHTS_NAME = 'pytorch_model.bin'
@@ -674,99 +678,266 @@ class AdaptiveEmbedding(nn.Module):
 
         return embed
 
-class MemTransformerLM(nn.Module):
-    def __init__(self, n_token, n_layer, n_head, d_model, d_head, d_inner,
-                 dropout, dropatt, tie_weight=True, d_embed=None, 
-                 div_val=1, tie_projs=[False], pre_lnorm=False,
-                 tgt_len=None, ext_len=None, mem_len=None, 
-                 cutoffs=[], adapt_inp=False, untie_r=False,
-                 same_length=False, attn_type=0, clamp_len=-1, 
-                 sample_softmax=-1, **kwargs):
-        super(MemTransformerLM, self).__init__()
-        self.n_token = n_token
 
-        d_embed = d_model if d_embed is None else d_embed
-        self.d_embed = d_embed
-        self.d_model = d_model
-        self.n_head = n_head
-        self.d_head = d_head
+class TransfoXLPreTrainedModel(nn.Module):
+    """ An abstract class to handle weights initialization and
+        a simple interface for downloading and loading pretrained models.
+    """
+    def __init__(self, config, *inputs, **kwargs):
+        super(TransfoXLPreTrainedModel, self).__init__()
+        if not isinstance(config, TransfoXLConfig):
+            raise ValueError(
+                "Parameter config in `{}(config)` should be an instance of class `TransfoXLConfig`. "
+                "To create a model from a pretrained model use "
+                "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
+                    self.__class__.__name__, self.__class__.__name__
+                ))
+        self.config = config
 
-        self.word_emb = AdaptiveEmbedding(n_token, d_embed, d_model, cutoffs, 
-                                          div_val=div_val)
+    def init_weight(self, weight):
+        if self.config.init == 'uniform':
+            nn.init.uniform_(weight, -self.config.init_range, self.config.init_range)
+        elif self.config.init == 'normal':
+            nn.init.normal_(weight, 0.0, self.config.init_std)
 
-        self.drop = nn.Dropout(dropout)
+    def init_bias(self, bias):
+        nn.init.constant_(bias, 0.0)
 
-        self.n_layer = n_layer
+    def init_weights(self, m):
+        """ Initialize the weights.
+        """
+        classname = m.__class__.__name__
+        if classname.find('Linear') != -1:
+            if hasattr(m, 'weight') and m.weight is not None:
+                self.init_weight(m.weight)
+            if hasattr(m, 'bias') and m.bias is not None:
+                self.init_bias(m.bias)
+        elif classname.find('AdaptiveEmbedding') != -1:
+            if hasattr(m, 'emb_projs'):
+                for i in range(len(m.emb_projs)):
+                    if m.emb_projs[i] is not None:
+                        nn.init.normal_(m.emb_projs[i], 0.0, self.config.proj_init_std)
+        elif classname.find('Embedding') != -1:
+            if hasattr(m, 'weight'):
+                self.init_weight(m.weight)
+        elif classname.find('ProjectedAdaptiveLogSoftmax') != -1:
+            if hasattr(m, 'cluster_weight') and m.cluster_weight is not None:
+                self.init_weight(m.cluster_weight)
+            if hasattr(m, 'cluster_bias') and m.cluster_bias is not None:
+                self.init_bias(m.cluster_bias)
+            if hasattr(m, 'out_projs'):
+                for i in range(len(m.out_projs)):
+                    if m.out_projs[i] is not None:
+                        nn.init.normal_(m.out_projs[i], 0.0, self.config.proj_init_std)
+        elif classname.find('LayerNorm') != -1:
+            if hasattr(m, 'weight'):
+                nn.init.normal_(m.weight, 1.0, self.config.init_std)
+            if hasattr(m, 'bias') and m.bias is not None:
+                self.init_bias(m.bias)
+        elif classname.find('TransformerLM') != -1:
+            if hasattr(m, 'r_emb'):
+                self.init_weight(m.r_emb)
+            if hasattr(m, 'r_w_bias'):
+                self.init_weight(m.r_w_bias)
+            if hasattr(m, 'r_r_bias'):
+                self.init_weight(m.r_r_bias)
+            if hasattr(m, 'r_bias'):
+                self.init_bias(m.r_bias)
 
-        self.tgt_len = tgt_len
-        self.mem_len = mem_len
-        self.ext_len = ext_len
-        self.max_klen = tgt_len + ext_len + mem_len
+    def set_num_special_tokens(self, num_special_tokens):
+        pass
 
-        self.attn_type = attn_type
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, state_dict=None, cache_dir=None,
+                        *inputs, **kwargs):
+        """
+        Instantiate a TransfoXLPreTrainedModel from a pre-trained model file or a pytorch state dict.
+        Download and cache the pre-trained model file if needed.
 
-        if not untie_r:
+        Params:
+            pretrained_model_name_or_path: either:
+                - a str with the name of a pre-trained model to load selected in the list of:
+                    . `transfo-xl-wt103`
+                - a path or url to a pretrained model folder containing:
+                    . `transfo_xl_config.json` a configuration file for the model
+                    . `pytorch_model.bin` a PyTorch dump of a TransfoXLModel instance
+            cache_dir: an optional path to a folder in which the pre-trained models will be cached.
+            state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models
+            *inputs, **kwargs: additional inputs forwarded to the model class constructor
+                (the model is built as `cls(config, *inputs, **kwargs)`)
+        """
+        if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:
+            archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path]
+            config_file = PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path]
+        else:
+            archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)
+            config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
+        # redirect to the cache, if necessary
+        try:
+            resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
+            resolved_config_file = cached_path(config_file, cache_dir=cache_dir)
+        except FileNotFoundError:
+            logger.error(
+                "Model name '{}' was not found in model name list ({}). "
+                "We assumed '{}' was a path or url but couldn't find files {} and {} "
+                "at this path or url.".format(
+                    pretrained_model_name_or_path,
+                    ', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()),
+                    pretrained_model_name_or_path,
+                    archive_file, config_file))
+            return None
+        if resolved_archive_file == archive_file and resolved_config_file == config_file:
+            logger.info("loading weights file {}".format(archive_file))
+            logger.info("loading configuration file {}".format(config_file))
+        else:
+            logger.info("loading weights file {} from cache at {}".format(
+                archive_file, resolved_archive_file))
+            logger.info("loading configuration file {} from cache at {}".format(
+                config_file, resolved_config_file))
+        # Load config
+        config = TransfoXLConfig.from_json_file(resolved_config_file)
+        logger.info("Model config {}".format(config))
+        # Instantiate model.
+        model = cls(config, *inputs, **kwargs)
+        if state_dict is None:
+            state_dict = torch.load(resolved_archive_file)
+
+        old_keys = []
+        new_keys = []
+        for key in state_dict.keys():
+            new_key = None
+            if 'gamma' in key:
+                new_key = key.replace('gamma', 'weight')
+            if 'beta' in key:
+                new_key = key.replace('beta', 'bias')
+            if new_key:
+                old_keys.append(key)
+                new_keys.append(new_key)
+        for old_key, new_key in zip(old_keys, new_keys):
+            state_dict[new_key] = state_dict.pop(old_key)
+
+        missing_keys = []
+        unexpected_keys = []
+        error_msgs = []
+        # copy state_dict so _load_from_state_dict can modify it
+        metadata = getattr(state_dict, '_metadata', None)
+        state_dict = state_dict.copy()
+        if metadata is not None:
+            state_dict._metadata = metadata
+
+        def load(module, prefix=''):
+            local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
+            module._load_from_state_dict(
+                state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)
+            for name, child in module._modules.items():
+                if child is not None:
+                    load(child, prefix + name + '.')
+        load(model.transformer if hasattr(model, 'transformer') else model, prefix='')  # copy the weights into the model
+        if len(missing_keys) > 0:
+            logger.info("Weights of {} not initialized from pretrained model: {}".format(
+                model.__class__.__name__, missing_keys))
+        if len(unexpected_keys) > 0:
+            logger.info("Weights from pretrained model not used in {}: {}".format(
+                model.__class__.__name__, unexpected_keys))
+        if len(error_msgs) > 0:
+            raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
+                               model.__class__.__name__, "\n\t".join(error_msgs)))
+        return model
+
+
+class TransfoXLModel(TransfoXLPreTrainedModel):
+    def __init__(self, config):
+    # n_token, n_layer, n_head, d_model, d_head, d_inner,
+    #              dropout, dropatt, tie_weight=True, d_embed=None, 
+    #              div_val=1, tie_projs=[False], pre_lnorm=False,
+    #              tgt_len=None, ext_len=None, mem_len=None, 
+    #              cutoffs=[], adapt_inp=False, untie_r=False,
+    #              same_length=False, attn_type=0, clamp_len=-1, 
+    #              sample_softmax=-1, **kwargs):
+        super(TransfoXLModel, self).__init__(config)
+        self.n_token = config.n_token
+
+        self.d_embed = config.d_embed
+        self.d_model = config.d_model
+        self.n_head = config.n_head
+        self.d_head = config.d_head
+
+        self.word_emb = AdaptiveEmbedding(config.n_token, config.d_embed, config.d_model, config.cutoffs, 
+                                          div_val=config.div_val)
+
+        self.drop = nn.Dropout(config.dropout)
+
+        self.n_layer = config.n_layer
+
+        self.tgt_len = config.tgt_len
+        self.mem_len = config.mem_len
+        self.ext_len = config.ext_len
+        self.max_klen = config.tgt_len + config.ext_len + config.mem_len
+
+        self.attn_type = config.attn_type
+
+        if not config.untie_r:
             self.r_w_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head))
             self.r_r_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head))
 
         self.layers = nn.ModuleList()
-        if attn_type == 0: # the default attention
-            for i in range(n_layer):
+        if config.attn_type == 0: # the default attention
+            for i in range(config.n_layer):
                 self.layers.append(
                     RelPartialLearnableDecoderLayer(
-                        n_head, d_model, d_head, d_inner, dropout,
-                        tgt_len=tgt_len, ext_len=ext_len, mem_len=mem_len,
-                        dropatt=dropatt, pre_lnorm=pre_lnorm,
-                        r_w_bias=None if untie_r else self.r_w_bias,
-                        r_r_bias=None if untie_r else self.r_r_bias)
+                        config.n_head, config.d_model, config.d_head, config.d_inner, config.dropout,
+                        tgt_len=config.tgt_len, ext_len=config.ext_len, mem_len=config.mem_len,
+                        dropatt=config.dropatt, pre_lnorm=config.pre_lnorm,
+                        r_w_bias=None if config.untie_r else self.r_w_bias,
+                        r_r_bias=None if config.untie_r else self.r_r_bias)
                 )
-        elif attn_type == 1: # learnable embeddings
-            for i in range(n_layer):
+        elif config.attn_type == 1: # learnable embeddings
+            for i in range(config.n_layer):
                 self.layers.append(
                     RelLearnableDecoderLayer(
-                        n_head, d_model, d_head, d_inner, dropout,
-                        tgt_len=tgt_len, ext_len=ext_len, mem_len=mem_len,
-                        dropatt=dropatt, pre_lnorm=pre_lnorm,
-                        r_w_bias=None if untie_r else self.r_w_bias,
-                        r_r_bias=None if untie_r else self.r_r_bias)
+                        config.n_head, config.d_model, config.d_head, config.d_inner, config.dropout,
+                        tgt_len=config.tgt_len, ext_len=config.ext_len, mem_len=config.mem_len,
+                        dropatt=config.dropatt, pre_lnorm=config.pre_lnorm,
+                        r_w_bias=None if config.untie_r else self.r_w_bias,
+                        r_r_bias=None if config.untie_r else self.r_r_bias)
                 )
-        elif attn_type in [2, 3]: # absolute embeddings
-            for i in range(n_layer):
+        elif config.attn_type in [2, 3]: # absolute embeddings
+            for i in range(config.n_layer):
                 self.layers.append(
                     DecoderLayer(
-                        n_head, d_model, d_head, d_inner, dropout,
-                        dropatt=dropatt, pre_lnorm=pre_lnorm,
-                        r_w_bias=None if untie_r else self.r_w_bias,
-                        r_r_bias=None if untie_r else self.r_r_bias)
+                        config.n_head, config.d_model, config.d_head, config.d_inner, config.dropout,
+                        dropatt=config.dropatt, pre_lnorm=config.pre_lnorm,
+                        r_w_bias=None if config.untie_r else self.r_w_bias,
+                        r_r_bias=None if config.untie_r else self.r_r_bias)
                 )
 
-        self.sample_softmax = sample_softmax
+        self.sample_softmax = config.sample_softmax
         # use sampled softmax
-        if sample_softmax > 0:
-            self.out_layer = nn.Linear(d_model, n_token)
-            if tie_weight:
+        if config.sample_softmax > 0:
+            self.out_layer = nn.Linear(config.d_model, config.n_token)
+            if config.tie_weight:
                 self.out_layer.weight = self.word_emb.weight
-            self.tie_weight = tie_weight
-            self.sampler = LogUniformSampler(n_token, sample_softmax)
+            self.tie_weight = config.tie_weight
+            self.sampler = LogUniformSampler(config.n_token, config.sample_softmax)
 
         # use adaptive softmax (including standard softmax)
         else:
-            self.crit = ProjectedAdaptiveLogSoftmax(n_token, d_embed, d_model, 
-                                                    cutoffs, div_val=div_val)
+            self.crit = ProjectedAdaptiveLogSoftmax(config.n_token, config.d_embed, config.d_model, 
+                                                    config.cutoffs, div_val=config.div_val)
 
-            if tie_weight:
+            if config.tie_weight:
                 for i in range(len(self.crit.out_layers)):
                     self.crit.out_layers[i].weight = self.word_emb.emb_layers[i].weight
 
-            if tie_projs:
-                for i, tie_proj in enumerate(tie_projs):
-                    if tie_proj and div_val == 1 and d_model != d_embed:
+            if config.tie_projs:
+                for i, tie_proj in enumerate(config.tie_projs):
+                    if tie_proj and config.div_val == 1 and config.d_model != config.d_embed:
                         self.crit.out_projs[i] = self.word_emb.emb_projs[0]
-                    elif tie_proj and div_val != 1:
+                    elif tie_proj and config.div_val != 1:
                         self.crit.out_projs[i] = self.word_emb.emb_projs[i]
 
-        self.same_length = same_length
-        self.clamp_len = clamp_len
+        self.same_length = config.same_length
+        self.clamp_len = config.clamp_len
 
         if self.attn_type == 0: # default attention
             self.pos_emb = PositionalEmbedding(self.d_model)
@@ -859,8 +1030,7 @@ class MemTransformerLM(nn.Module):
             hids.append(core_out)
             for i, layer in enumerate(self.layers):
                 mems_i = None if mems is None else mems[i]
-                core_out = layer(core_out, pos_emb, self.r_w_bias,
-                        self.r_r_bias, dec_attn_mask=dec_attn_mask, mems=mems_i)
+                core_out = layer(core_out, pos_emb, dec_attn_mask=dec_attn_mask, mems=mems_i)
                 hids.append(core_out)
         elif self.attn_type == 1: # learnable
             core_out = self.drop(word_emb)
@@ -949,220 +1119,3 @@ class MemTransformerLM(nn.Module):
         else:
             return [loss] + new_mems
 
-
-class TransfoXLPreTrainedModel(nn.Module):
-    """ An abstract class to handle weights initialization and
-        a simple interface for dowloading and loading pretrained models.
-    """
-    def __init__(self, config, *inputs, **kwargs):
-        super(TransfoXLPreTrainedModel, self).__init__()
-        if not isinstance(config, TransfoXLConfig):
-            raise ValueError(
-                "Parameter config in `{}(config)` should be an instance of class `TransfoXLConfig`. "
-                "To create a model from a pretrained model use "
-                "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
-                    self.__class__.__name__, self.__class__.__name__
-                ))
-        self.config = config
-
-    def init_weight(self, weight):
-        if self.config.init == 'uniform':
-            nn.init.uniform_(weight, -self.config.init_range, self.config.init_range)
-        elif self.config.init == 'normal':
-            nn.init.normal_(weight, 0.0, self.config.init_std)
-
-    def init_bias(self, bias):
-        nn.init.constant_(bias, 0.0)
-
-    def init_weights(self, m):
-        """ Initialize the weights.
-        """
-        classname = m.__class__.__name__
-        if classname.find('Linear') != -1:
-            if hasattr(m, 'weight') and m.weight is not None:
-                self.init_weight(m.weight)
-            if hasattr(m, 'bias') and m.bias is not None:
-                self.init_bias(m.bias)
-        elif classname.find('AdaptiveEmbedding') != -1:
-            if hasattr(m, 'emb_projs'):
-                for i in range(len(m.emb_projs)):
-                    if m.emb_projs[i] is not None:
-                        nn.init.normal_(m.emb_projs[i], 0.0, self.config.proj_init_std)
-        elif classname.find('Embedding') != -1:
-            if hasattr(m, 'weight'):
-                self.init_weight(m.weight)
-        elif classname.find('ProjectedAdaptiveLogSoftmax') != -1:
-            if hasattr(m, 'cluster_weight') and m.cluster_weight is not None:
-                self.init_weight(m.cluster_weight)
-            if hasattr(m, 'cluster_bias') and m.cluster_bias is not None:
-                self.init_bias(m.cluster_bias)
-            if hasattr(m, 'out_projs'):
-                for i in range(len(m.out_projs)):
-                    if m.out_projs[i] is not None:
-                        nn.init.normal_(m.out_projs[i], 0.0, self.config.proj_init_std)
-        elif classname.find('LayerNorm') != -1:
-            if hasattr(m, 'weight'):
-                nn.init.normal_(m.weight, 1.0, self.config.init_std)
-            if hasattr(m, 'bias') and m.bias is not None:
-                self.init_bias(m.bias)
-        elif classname.find('TransformerLM') != -1:
-            if hasattr(m, 'r_emb'):
-                self.init_weight(m.r_emb)
-            if hasattr(m, 'r_w_bias'):
-                self.init_weight(m.r_w_bias)
-            if hasattr(m, 'r_r_bias'):
-                self.init_weight(m.r_r_bias)
-            if hasattr(m, 'r_bias'):
-                self.init_bias(m.r_bias)
-
-    def set_num_special_tokens(self, num_special_tokens):
-        pass
-
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name, state_dict=None, cache_dir=None,
-                        *inputs, **kwargs):
-        """
-        Instantiate a TransfoXLPreTrainedModel from a pre-trained model file or a pytorch state dict.
-        Download and cache the pre-trained model file if needed.
-
-        Params:
-            pretrained_model_name: either:
-                - a str with the name of a pre-trained model to load selected in the list of:
-                    . `transfo-xl`
-                - a path or url to a pretrained model archive containing:
-                    . `transfo_xl_config.json` a configuration file for the model
-                    . `pytorch_model.bin` a PyTorch dump of a TransfoXLModel instance
-            cache_dir: an optional path to a folder in which the pre-trained models will be cached.
-            state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of pre-trained models
-            *inputs, **kwargs: additional input for the specific Bert class
-                (ex: num_labels for BertForSequenceClassification)
-        """
-        if pretrained_model_name in PRETRAINED_MODEL_ARCHIVE_MAP:
-            archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name]
-        else:
-            archive_file = pretrained_model_name
-        # redirect to the cache, if necessary
-        try:
-            resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
-        except FileNotFoundError:
-            logger.error(
-                "Model name '{}' was not found in model name list ({}). "
-                "We assumed '{}' was a path or url but couldn't find any file "
-                "associated to this path or url.".format(
-                    pretrained_model_name,
-                    ', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()),
-                    archive_file))
-            return None
-        if resolved_archive_file == archive_file:
-            logger.info("loading archive file {}".format(archive_file))
-        else:
-            logger.info("loading archive file {} from cache at {}".format(
-                archive_file, resolved_archive_file))
-        tempdir = None
-        if os.path.isdir(resolved_archive_file):
-            serialization_dir = resolved_archive_file
-        else:
-            # Extract archive to temp dir
-            tempdir = tempfile.mkdtemp()
-            logger.info("extracting archive file {} to temp dir {}".format(
-                resolved_archive_file, tempdir))
-            with tarfile.open(resolved_archive_file, 'r:gz') as archive:
-                archive.extractall(tempdir)
-            serialization_dir = tempdir
-        # Load config
-        config_file = os.path.join(serialization_dir, CONFIG_NAME)
-        config = TransfoXLConfig.from_json_file(config_file)
-        logger.info("Model config {}".format(config))
-        # Instantiate model.
-        model = cls(config, *inputs, **kwargs)
-        if state_dict is None:
-            weights_path = os.path.join(serialization_dir, WEIGHTS_NAME)
-            state_dict = torch.load(weights_path)
-
-        old_keys = []
-        new_keys = []
-        for key in state_dict.keys():
-            new_key = None
-            if 'gamma' in key:
-                new_key = key.replace('gamma', 'weight')
-            if 'beta' in key:
-                new_key = key.replace('beta', 'bias')
-            if new_key:
-                old_keys.append(key)
-                new_keys.append(new_key)
-        for old_key, new_key in zip(old_keys, new_keys):
-            state_dict[new_key] = state_dict.pop(old_key)
-
-        missing_keys = []
-        unexpected_keys = []
-        error_msgs = []
-        # copy state_dict so _load_from_state_dict can modify it
-        metadata = getattr(state_dict, '_metadata', None)
-        state_dict = state_dict.copy()
-        if metadata is not None:
-            state_dict._metadata = metadata
-
-        def load(module, prefix=''):
-            local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
-            module._load_from_state_dict(
-                state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)
-            for name, child in module._modules.items():
-                if child is not None:
-                    load(child, prefix + name + '.')
-        # load(model.transformer if hasattr(model, 'transformer') else model, prefix='')
-        if len(missing_keys) > 0:
-            logger.info("Weights of {} not initialized from pretrained model: {}".format(
-                model.__class__.__name__, missing_keys))
-        if len(unexpected_keys) > 0:
-            logger.info("Weights from pretrained model not used in {}: {}".format(
-                model.__class__.__name__, unexpected_keys))
-        if len(error_msgs) > 0:
-            raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
-                               model.__class__.__name__, "\n\t".join(error_msgs)))
-        if tempdir:
-            # Clean up temp dir
-            shutil.rmtree(tempdir)
-        return model
-
-
-class TransfoXLModel(TransfoXLPreTrainedModel):
-    """ Transformer XL model
-        From "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context"
-        by Zihang Dai*, Zhilin Yang*, Yiming Yang, William W. Cohen, Jaime Carbonell,
-           Quoc V. Le, Ruslan Salakhutdinov (*: equal contribution)
-
-    Params:
-        config: a TransfoXLConfig class instance with the configuration to build a new model
-
-    Inputs:
-        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
-            were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, config.vocab_size[
-        `position_ids`: an optional torch.LongTensor with the same shape as input_ids
-            with the position indices (selected in the range [config.vocab_size + config.n_special, config.vocab_size + config.n_special + config.n_ctx - 1[.
-        `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
-            You can use it to add a third embedding (the previous two being the word and position embeddings)
-            to each token in the sentence.
-
-    Outputs:
-        `hidden_states`: the encoded-hidden-states at the top of the model
-            as a torch.FloatTensor of size [batch_size, sequence_length, hidden_size]
-            (or more generally [d_1, ..., d_n, hidden_size] were d_1 ... d_n are the dimension of input_ids)
-
-    Example usage:
-    ```python
-    # Already been converted into BPE token ids
-    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-
-    config = modeling_transfo_xl.TransfoXLConfig()
-
-    model = modeling_transfo_xl.TransfoXLModel(config)
-    hidden_states = model(input_ids)
-    ```
-    """
-    def __init__(self, config):
-        super(TransfoXLModel, self).__init__(config)
-        self.transformer = MemTransformerLM(**config.to_dict())
-        self.apply(self.init_weights)
-
-    def forward(self, input_ids, position_ids=None, token_type_ids=None):
-        return self.transformer(input_ids, position_ids, token_type_ids)
diff --git a/pytorch_pretrained_bert/tokenization_transfo_xl.py b/pytorch_pretrained_bert/tokenization_transfo_xl.py
index a411c267b9..db626f7755 100644
--- a/pytorch_pretrained_bert/tokenization_transfo_xl.py
+++ b/pytorch_pretrained_bert/tokenization_transfo_xl.py
@@ -444,6 +444,12 @@ class TransfoXLCorpus(object):
         for key, value in corpus_dict.items():
             corpus.__dict__[key] = value
         corpus.vocab = vocab
+        if corpus.train is not None:
+            corpus.train = torch.tensor(corpus.train, dtype=torch.long)
+        if corpus.valid is not None:
+            corpus.valid = torch.tensor(corpus.valid, dtype=torch.long)
+        if corpus.test is not None:
+            corpus.test = torch.tensor(corpus.test, dtype=torch.long)
         return corpus
 
     def __init__(self, *args, **kwargs):

From c03c12687fc400c485420e18f0c59fea176a4a9e Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Wed, 16 Jan 2019 10:55:22 +0100
Subject: [PATCH 13/82] fix __main__ entry script

---
 pytorch_pretrained_bert/__main__.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/pytorch_pretrained_bert/__main__.py b/pytorch_pretrained_bert/__main__.py
index 731d87f26a..1869ff2ee2 100644
--- a/pytorch_pretrained_bert/__main__.py
+++ b/pytorch_pretrained_bert/__main__.py
@@ -10,7 +10,7 @@ def main():
         "Should be used as"
         "`pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`, \n"
         "`pytorch_pretrained_bert convert_openai_checkpoint OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]` or \n"
-        "`pytorch_pretrained_bert convert_transfo_xl_checkpoint TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [TF_CONFIG]`")
+        "`pytorch_pretrained_bert convert_transfo_xl_checkpoint TF_CHECKPOINT_OR_DATASET PYTORCH_DUMP_OUTPUT [TF_CONFIG]`")
     else:
         if sys.argv[1] == "convert_tf_checkpoint_to_pytorch":
             try:
@@ -49,13 +49,17 @@ def main():
                     "https://www.tensorflow.org/install/ for installation instructions.")
                 raise
 
-            TF_CHECKPOINT = sys.argv[2]
+            if 'ckpt' in sys.argv[2].lower():
+                TF_CHECKPOINT = sys.argv[2]
+                TF_DATASET_FILE = ""
+            else:
+                TF_DATASET_FILE = sys.argv[2]
+                TF_CHECKPOINT = ""
             PYTORCH_DUMP_OUTPUT = sys.argv[3]
             if len(sys.argv) == 5:
                 TF_CONFIG = sys.argv[4]
             else:
                 TF_CONFIG = ""
-            convert_transfo_xl_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT)
-
+            convert_transfo_xl_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT, TF_DATASET_FILE)
 if __name__ == '__main__':
     main()

From a28dfc86599b975df845a8ca9331c87af301c6b1 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Wed, 16 Jan 2019 11:18:19 +0100
Subject: [PATCH 14/82] fix eval for wt103

---
 examples/eval_transfo_xl.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/examples/eval_transfo_xl.py b/examples/eval_transfo_xl.py
index 92979d1e4a..e67efd3a68 100644
--- a/examples/eval_transfo_xl.py
+++ b/examples/eval_transfo_xl.py
@@ -132,12 +132,12 @@ elif args.split == 'test':
     valid_loss = None
 
 def format_log(loss, split):
-    if args.dataset in ['enwik8', 'text8']:
-        log_str = '| {0} loss {1:5.2f} | {0} bpc {2:9.5f} '.format(
-            split, loss, loss / math.log(2))
-    else:
-        log_str = '| {0} loss {1:5.2f} | {0} ppl {2:9.3f} '.format(
-            split, loss, math.exp(loss))
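+    # NOTE: the bits-per-character (bpc) report previously used when
+    # args.dataset was 'enwik8' or 'text8' (bpc = loss / math.log(2)) is
+    # disabled here; every split is now logged as perplexity (exp(loss)).
+    # TODO(review): restore the bpc branch if those datasets are evaluated.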
+    log_str = '| {0} loss {1:5.2f} | {0} ppl {2:9.3f} '.format(
+        split, loss, math.exp(loss))
     return log_str
 
 log_str = ''

From fea15cc9f5939bbd1cb162921ae273da9de49c14 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Wed, 16 Jan 2019 11:54:54 +0100
Subject: [PATCH 15/82] update model conversion

---
 ...onvert_transfo_xl_checkpoint_to_pytorch.py | 24 ++++++++++++-------
 .../modeling_transfo_xl.py                    | 14 -----------
 2 files changed, 15 insertions(+), 23 deletions(-)

diff --git a/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py b/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py
index b2f8432d3a..5b8ba99678 100755
--- a/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py
+++ b/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py
@@ -68,7 +68,10 @@ def build_tf_to_pytorch_map(model, config):
             layer_str + "ff/layer_2/bias": b.pos_ff.CoreNet[3].bias,
         })
 
-    # Softmax cutoffs
+    # Adaptive Softmax
+    tf_to_pt_map.update({
+        "transformer/adaptive_softmax/cutoff_0/cluster_W": model.crit.cluster_weight,
+        "transformer/adaptive_softmax/cutoff_0/cluster_b": model.crit.cluster_bias})
     for i, (out_l, proj_l, tie_proj) in enumerate(zip(
                             model.crit.out_layers,
                             model.crit.out_projs,
@@ -169,14 +172,17 @@ def convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path,
                         raise
                     print("Initialize PyTorch weight {} for layer {}".format(name, i))
                     p_i.data = torch.from_numpy(arr_i)
-                continue
-            try:
-                assert pointer.shape == array.shape
-            except AssertionError as e:
-                e.args += (pointer.shape, array.shape)
-                raise
-            print("Initialize PyTorch weight {}".format(name))
-            pointer.data = torch.from_numpy(array)
+            else:
+                try:
+                    assert pointer.shape == array.shape
+                except AssertionError as e:
+                    e.args += (pointer.shape, array.shape)
+                    raise
+                print("Initialize PyTorch weight {}".format(name))
+                pointer.data = torch.from_numpy(array)
+            del tf_weights[name]
+
+        print("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys())))
 
         # Save pytorch-model
         pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME
diff --git a/pytorch_pretrained_bert/modeling_transfo_xl.py b/pytorch_pretrained_bert/modeling_transfo_xl.py
index 0e1f3f8240..de0430e964 100644
--- a/pytorch_pretrained_bert/modeling_transfo_xl.py
+++ b/pytorch_pretrained_bert/modeling_transfo_xl.py
@@ -802,20 +802,6 @@ class TransfoXLPreTrainedModel(nn.Module):
         if state_dict is None:
             state_dict = torch.load(resolved_archive_file)
 
-        old_keys = []
-        new_keys = []
-        for key in state_dict.keys():
-            new_key = None
-            if 'gamma' in key:
-                new_key = key.replace('gamma', 'weight')
-            if 'beta' in key:
-                new_key = key.replace('beta', 'bias')
-            if new_key:
-                old_keys.append(key)
-                new_keys.append(new_key)
-        for old_key, new_key in zip(old_keys, new_keys):
-            state_dict[new_key] = state_dict.pop(old_key)
-
         missing_keys = []
         unexpected_keys = []
         error_msgs = []

From 009101de12d5cb26e8cc0f1f6af9216b11c6a50f Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Wed, 16 Jan 2019 12:16:20 +0100
Subject: [PATCH 16/82] fix loading bug and check full conversion of model

---
 .../convert_transfo_xl_checkpoint_to_pytorch.py      | 12 +++++++-----
 pytorch_pretrained_bert/modeling_transfo_xl.py       |  2 +-
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py b/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py
index 5b8ba99678..223bbec963 100755
--- a/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py
+++ b/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py
@@ -180,16 +180,18 @@ def convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path,
                     raise
                 print("Initialize PyTorch weight {}".format(name))
                 pointer.data = torch.from_numpy(array)
-            del tf_weights[name]
+            tf_weights.pop(name, None)
+            tf_weights.pop(name + '/Adam', None)
+            tf_weights.pop(name + '/Adam_1', None)
 
         print("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys())))
 
         # Save pytorch-model
-        pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME
-        pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME
-        print("Save PyTorch model to {}".format(pytorch_weights_dump_path))
+        pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME)
+        pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME)
+        print("Save PyTorch model to {}".format(os.path.abspath(pytorch_weights_dump_path)))
         torch.save(model.state_dict(), pytorch_weights_dump_path)
-        print("Save configuration file to {}".format(pytorch_config_dump_path))
+        print("Save configuration file to {}".format(os.path.abspath(pytorch_config_dump_path)))
         with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
             f.write(config.to_json_string())
 
diff --git a/pytorch_pretrained_bert/modeling_transfo_xl.py b/pytorch_pretrained_bert/modeling_transfo_xl.py
index de0430e964..452f2e03ea 100644
--- a/pytorch_pretrained_bert/modeling_transfo_xl.py
+++ b/pytorch_pretrained_bert/modeling_transfo_xl.py
@@ -818,7 +818,7 @@ class TransfoXLPreTrainedModel(nn.Module):
             for name, child in module._modules.items():
                 if child is not None:
                     load(child, prefix + name + '.')
-        # load(model.transformer if hasattr(model, 'transformer') else model, prefix='')
+        load(model, prefix='')
         if len(missing_keys) > 0:
             logger.info("Weights of {} not initialized from pretrained model: {}".format(
                 model.__class__.__name__, missing_keys))

From b9c77b98d553119afcf46eb073a895979f1c26de Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Thu, 17 Jan 2019 00:33:21 +0100
Subject: [PATCH 17/82] fix transposition in model conversion and memory
 initialization

---
 .../convert_transfo_xl_checkpoint_to_pytorch.py       |  4 +---
 pytorch_pretrained_bert/modeling_transfo_xl.py        | 11 ++++++-----
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py b/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py
index 223bbec963..eb6b8183ef 100755
--- a/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py
+++ b/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py
@@ -93,8 +93,6 @@ def build_tf_to_pytorch_map(model, config):
 
     # Relative positioning biases
     if config.untie_r:
-        layer_str = "transformer/r_r_bias"
-        layer_str_2 = "transformer/r_w_bias"
         r_r_list = []
         r_w_list = []
         for b in model.layers:
@@ -158,7 +156,7 @@ def convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path,
             array = tf_weights[name]
             # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
             # which are not required for using pretrained model
-            if 'kernel' in name or 'proj_W' in name:
+            if 'kernel' in name or 'proj' in name:
                 array = np.transpose(array)
             if ('r_r_bias' in name or 'r_w_bias' in name) and len(pointer) > 1:
                 # Here we will split the TF weigths
diff --git a/pytorch_pretrained_bert/modeling_transfo_xl.py b/pytorch_pretrained_bert/modeling_transfo_xl.py
index 452f2e03ea..f80afffbc4 100644
--- a/pytorch_pretrained_bert/modeling_transfo_xl.py
+++ b/pytorch_pretrained_bert/modeling_transfo_xl.py
@@ -447,10 +447,10 @@ class RelPartialLearnableMultiHeadAttn(RelMultiHeadAttn):
         if attn_mask is not None and attn_mask.any().item():
             if attn_mask.dim() == 2:
                 attn_score = attn_score.float().masked_fill(
-                    attn_mask[None,:,:,None], -float('inf')).type_as(attn_score)
+                    attn_mask[None,:,:,None], -1e30).type_as(attn_score)
             elif attn_mask.dim() == 3:
                 attn_score = attn_score.float().masked_fill(
-                    attn_mask[:,:,:,None], -float('inf')).type_as(attn_score)
+                    attn_mask[:,:,:,None], -1e30).type_as(attn_score)
 
         # [qlen x klen x bsz x n_head]
         attn_prob = F.softmax(attn_score, dim=1)
@@ -947,12 +947,13 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
         self.mem_len = mem_len
         self.ext_len = ext_len
 
-    def init_mems(self):
+    def init_mems(self, data):
         if self.mem_len > 0:
             mems = []
             param = next(self.parameters())
             for i in range(self.n_layer+1):
-                empty = torch.empty(0, dtype=param.dtype, device=param.device)
+                empty = torch.zeros(self.mem_len, data.size(1), self.config.d_model,
+                                    dtype=param.dtype, device=param.device)
                 mems.append(empty)
 
             return mems
@@ -1081,7 +1082,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
         # So, have to initialize size(0) mems inside the model forward.
         # Moreover, have to return new_mems to allow nn.DataParallel to piece
         # them together.
-        if not mems: mems = self.init_mems()
+        if not mems: mems = self.init_mems(data)
 
         hidden, new_mems = self._forward(data, mems=mems)
         if target is None:

From 9c35c132fa3d207bed720555b2334ceeb1d19b25 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Thu, 17 Jan 2019 09:19:19 +0100
Subject: [PATCH 18/82] apex LayerNorm

---
 pytorch_pretrained_bert/modeling_transfo_xl.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pytorch_pretrained_bert/modeling_transfo_xl.py b/pytorch_pretrained_bert/modeling_transfo_xl.py
index f80afffbc4..ba8994fd8a 100644
--- a/pytorch_pretrained_bert/modeling_transfo_xl.py
+++ b/pytorch_pretrained_bert/modeling_transfo_xl.py
@@ -217,7 +217,7 @@ class PositionwiseFF(nn.Module):
             nn.Dropout(dropout),
         )
 
-        self.layer_norm = nn.LayerNorm(d_model)
+        self.layer_norm = LayerNorm(d_model)
 
         self.pre_lnorm = pre_lnorm
 
@@ -254,7 +254,7 @@ class MultiHeadAttn(nn.Module):
         self.dropatt = nn.Dropout(dropatt)
         self.o_net = nn.Linear(n_head * d_head, d_model, bias=False)
 
-        self.layer_norm = nn.LayerNorm(d_model)
+        self.layer_norm = LayerNorm(d_model)
 
         self.scale = 1 / (d_head ** 0.5)
 
@@ -335,7 +335,7 @@ class RelMultiHeadAttn(nn.Module):
         self.dropatt = nn.Dropout(dropatt)
         self.o_net = nn.Linear(n_head * d_head, d_model, bias=False)
 
-        self.layer_norm = nn.LayerNorm(d_model)
+        self.layer_norm = LayerNorm(d_model)
 
         self.scale = 1 / (d_head ** 0.5)
 

From d77dd62ff823d788b7e635e57b6572f204e83264 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 28 Jan 2019 16:50:23 +0100
Subject: [PATCH 19/82] directly load from TF checkpoints + code cleanup

---
 pytorch_pretrained_bert/__init__.py           |   6 +
 .../convert_openai_checkpoint_to_pytorch.py   |  58 +++---
 .../convert_tf_checkpoint_to_pytorch.py       |  29 +--
 ...onvert_transfo_xl_checkpoint_to_pytorch.py |  94 +++++-----
 pytorch_pretrained_bert/modeling.py           |  25 ++-
 pytorch_pretrained_bert/modeling_openai.py    | 173 ++++++++++--------
 .../modeling_transfo_xl.py                    |  15 +-
 .../tokenization_openai.py                    |   3 +
 8 files changed, 225 insertions(+), 178 deletions(-)

diff --git a/pytorch_pretrained_bert/__init__.py b/pytorch_pretrained_bert/__init__.py
index 85f2422af6..249607bded 100644
--- a/pytorch_pretrained_bert/__init__.py
+++ b/pytorch_pretrained_bert/__init__.py
@@ -2,6 +2,7 @@ __version__ = "0.5.0"
 from .tokenization import BertTokenizer, BasicTokenizer, WordpieceTokenizer
 from .tokenization_openai import OpenAIGPTTokenizer
 from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus)
+
 from .modeling import (BertConfig, BertModel, BertForPreTraining,
                        BertForMaskedLM, BertForNextSentencePrediction,
                        BertForSequenceClassification, BertForMultipleChoice,
@@ -9,6 +10,11 @@ from .modeling import (BertConfig, BertModel, BertForPreTraining,
 from .modeling_openai import (OpenAIGPTConfig, OpenAIGPTModel,
                               OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel)
 from .modeling_transfo_xl import (TransfoXLConfig, TransfoXLModel)
+
 from .optimization import BertAdam
 from .optimization_openai import OpenAIAdam
+
+from .convert_openai_checkpoint_to_pytorch import load_tf_weights_in_openai_gpt
+from .convert_tf_checkpoint_to_pytorch import load_tf_weights_in_bert
+from .convert_transfo_xl_checkpoint_to_pytorch import load_tf_weights_in_transfo_xl
 from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE
diff --git a/pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py b/pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py
index 0c41741d9a..40740083d0 100755
--- a/pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py
+++ b/pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py
@@ -26,9 +26,29 @@ import numpy as np
 
 from .modeling_openai import OpenAIGPTConfig, OpenAIGPTModel, CONFIG_NAME, WEIGHTS_NAME
 
-
 def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_config_file, pytorch_dump_folder_path):
-    # Load weights from TF model
+    # Construct model
+    if openai_config_file == "":
+        config = OpenAIGPTConfig()
+    else:
+        config = OpenAIGPTConfig(openai_config_file)
+    model = OpenAIGPTModel(config)
+
+    # Load weights from numpy
+    load_tf_weights_in_openai_gpt(model, openai_checkpoint_folder_path)
+
+    # Save pytorch-model
+    pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME
+    pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME
+    print("Save PyTorch model to {}".format(pytorch_weights_dump_path))
+    torch.save(model.state_dict(), pytorch_weights_dump_path)
+    print("Save configuration file to {}".format(pytorch_config_dump_path))
+    with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
+        f.write(config.to_json_string())
+
+def load_tf_weights_in_openai_gpt(model, openai_checkpoint_folder_path):
+    """ Load tf pre-trained weights in a pytorch model (from NumPy arrays here)
+    """
     print("Loading weights...")
     names = json.load(open(openai_checkpoint_folder_path + '/parameters_names.json', "r", encoding='utf-8'))
     shapes = json.load(open(openai_checkpoint_folder_path + '/params_shapes.json', "r", encoding='utf-8'))
@@ -36,35 +56,11 @@ def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_c
     init_params = [np.load(openai_checkpoint_folder_path + '/params_{}.npy'.format(n)) for n in range(10)]
     init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1]
     init_params = [param.reshape(shape) for param, shape in zip(init_params, shapes)]
-    # if n_ctx > 0:
-    #     init_params[0] = init_params[0][:n_ctx]
-    # if n_special > 0:
-    #     init_params[0] = np.concatenate(
-    #         [init_params[1],
-    #          (np.random.randn(n_special, n_embd) * 0.02).astype(np.float32),
-    #          init_params[0]
-    #          ], 0)
-    # else:
-    #     init_params[0] = np.concatenate(
-    #         [init_params[1],
-    #          init_params[0]
-    #          ], 0)
-    # del init_params[1]
-    # if n_transfer == -1:
-    #     n_transfer = 0
-    # else:
-    #     n_transfer = 1 + n_transfer * 12
 
     init_params[0] = np.concatenate([init_params[1], init_params[0]], 0)
     del init_params[1]
     init_params = [arr.squeeze() for arr in init_params]
 
-    # Construct model
-    if openai_config_file == "":
-        config = OpenAIGPTConfig()
-    else:
-        config = OpenAIGPTConfig(openai_config_file)
-    model = OpenAIGPTModel(config)
     try:
         assert model.embed.weight.shape == init_params[0].shape
     except AssertionError as e:
@@ -109,15 +105,7 @@ def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_c
             raise
         print("Initialize PyTorch weight {}".format(name))
         pointer.data = torch.from_numpy(array)
-
-    # Save pytorch-model
-    pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME
-    pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME
-    print("Save PyTorch model to {}".format(pytorch_weights_dump_path))
-    torch.save(model.state_dict(), pytorch_weights_dump_path)
-    print("Save configuration file to {}".format(pytorch_config_dump_path))
-    with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
-        f.write(config.to_json_string())
+    return model
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
diff --git a/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py b/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py
index 120624bc1b..74622bbb70 100755
--- a/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py
+++ b/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py
@@ -28,9 +28,23 @@ import numpy as np
 from .modeling import BertConfig, BertForPreTraining
 
 def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):
-    config_path = os.path.abspath(bert_config_file)
+    # Initialise PyTorch model
+    config = BertConfig.from_json_file(bert_config_file)
+    print("Building PyTorch model from configuration: {}".format(str(config)))
+    model = BertForPreTraining(config)
+
+    # Load weights from tf checkpoint
+    load_tf_weights_in_bert(model, tf_checkpoint_path)
+
+    # Save pytorch-model
+    print("Save PyTorch model to {}".format(pytorch_dump_path))
+    torch.save(model.state_dict(), pytorch_dump_path)
+
+def load_tf_weights_in_bert(model, tf_checkpoint_path):
+    """ Load tf checkpoints in a pytorch model
+    """
     tf_path = os.path.abspath(tf_checkpoint_path)
-    print("Converting TensorFlow checkpoint from {} with config at {}".format(tf_path, config_path))
+    print("Converting TensorFlow checkpoint from {}".format(tf_path))
     # Load weights from TF model
     init_vars = tf.train.list_variables(tf_path)
     names = []
@@ -41,11 +55,6 @@ def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytor
         names.append(name)
         arrays.append(array)
 
-    # Initialise PyTorch model
-    config = BertConfig.from_json_file(bert_config_file)
-    print("Building PyTorch model from configuration: {}".format(str(config)))
-    model = BertForPreTraining(config)
-
     for name, array in zip(names, arrays):
         name = name.split('/')
         # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
@@ -81,11 +90,7 @@ def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytor
             raise
         print("Initialize PyTorch weight {}".format(name))
         pointer.data = torch.from_numpy(array)
-
-    # Save pytorch-model
-    print("Save PyTorch model to {}".format(pytorch_dump_path))
-    torch.save(model.state_dict(), pytorch_dump_path)
-
+    return model
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
diff --git a/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py b/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py
index eb6b8183ef..4dbb2067d6 100755
--- a/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py
+++ b/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py
@@ -106,7 +106,6 @@ def build_tf_to_pytorch_map(model, config):
         'transformer/r_w_bias': r_w_list})
     return tf_to_pt_map
 
-
 def convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path,
                                              transfo_xl_config_file,
                                              pytorch_dump_folder_path,
@@ -140,50 +139,7 @@ def convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path,
         print("Building PyTorch model from configuration: {}".format(str(config)))
         model = TransfoXLModel(config)
 
-        # Build TF to PyTorch weights loading map
-        tf_to_pt_map = build_tf_to_pytorch_map(model, config)
-
-        # Load weights from TF model
-        init_vars = tf.train.list_variables(tf_path)
-        tf_weights = {}
-        for name, shape in init_vars:
-            print("Loading TF weight {} with shape {}".format(name, shape))
-            array = tf.train.load_variable(tf_path, name)
-            tf_weights[name] = array
-
-        for name, pointer in tf_to_pt_map.items():
-            assert name in tf_weights
-            array = tf_weights[name]
-            # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
-            # which are not required for using pretrained model
-            if 'kernel' in name or 'proj' in name:
-                array = np.transpose(array)
-            if ('r_r_bias' in name or 'r_w_bias' in name) and len(pointer) > 1:
-                # Here we will split the TF weigths
-                assert len(pointer) == array.shape[0]
-                for i, p_i in enumerate(pointer):
-                    arr_i = array[i, ...]
-                    try:
-                        assert p_i.shape == arr_i.shape
-                    except AssertionError as e:
-                        e.args += (p_i.shape, arr_i.shape)
-                        raise
-                    print("Initialize PyTorch weight {} for layer {}".format(name, i))
-                    p_i.data = torch.from_numpy(arr_i)
-            else:
-                try:
-                    assert pointer.shape == array.shape
-                except AssertionError as e:
-                    e.args += (pointer.shape, array.shape)
-                    raise
-                print("Initialize PyTorch weight {}".format(name))
-                pointer.data = torch.from_numpy(array)
-            tf_weights.pop(name, None)
-            tf_weights.pop(name + '/Adam', None)
-            tf_weights.pop(name + '/Adam_1', None)
-
-        print("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys())))
-
+        model = load_tf_weights_in_transfo_xl(model, config, tf_path)
         # Save pytorch-model
         pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME)
         pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME)
@@ -194,6 +150,54 @@ def convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path,
             f.write(config.to_json_string())
 
 
+def load_tf_weights_in_transfo_xl(model, config, tf_path):
+    """ Load tf checkpoints in a pytorch model
+    """
+    # Build TF to PyTorch weights loading map
+    tf_to_pt_map = build_tf_to_pytorch_map(model, config)
+
+    # Load weights from TF model
+    init_vars = tf.train.list_variables(tf_path)
+    tf_weights = {}
+    for name, shape in init_vars:
+        print("Loading TF weight {} with shape {}".format(name, shape))
+        array = tf.train.load_variable(tf_path, name)
+        tf_weights[name] = array
+
+    for name, pointer in tf_to_pt_map.items():
+        assert name in tf_weights
+        array = tf_weights[name]
+        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
+        # which are not required for using pretrained model
+        if 'kernel' in name or 'proj' in name:
+            array = np.transpose(array)
+        if ('r_r_bias' in name or 'r_w_bias' in name) and len(pointer) > 1:
+            # Here we will split the TF weights
+            assert len(pointer) == array.shape[0]
+            for i, p_i in enumerate(pointer):
+                arr_i = array[i, ...]
+                try:
+                    assert p_i.shape == arr_i.shape
+                except AssertionError as e:
+                    e.args += (p_i.shape, arr_i.shape)
+                    raise
+                print("Initialize PyTorch weight {} for layer {}".format(name, i))
+                p_i.data = torch.from_numpy(arr_i)
+        else:
+            try:
+                assert pointer.shape == array.shape
+            except AssertionError as e:
+                e.args += (pointer.shape, array.shape)
+                raise
+            print("Initialize PyTorch weight {}".format(name))
+            pointer.data = torch.from_numpy(array)
+        tf_weights.pop(name, None)
+        tf_weights.pop(name + '/Adam', None)
+        tf_weights.pop(name + '/Adam_1', None)
+
+    print("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys())))
+    return model
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     ## Required parameters
diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py
index 591082f7ce..1e6966757e 100644
--- a/pytorch_pretrained_bert/modeling.py
+++ b/pytorch_pretrained_bert/modeling.py
@@ -33,6 +33,7 @@ from torch import nn
 from torch.nn import CrossEntropyLoss
 
 from .file_utils import cached_path
+from .convert_tf_checkpoint_to_pytorch import load_tf_weights_in_bert
 
 logger = logging.getLogger(__name__)
 
@@ -47,6 +48,7 @@ PRETRAINED_MODEL_ARCHIVE_MAP = {
 }
 CONFIG_NAME = 'bert_config.json'
 WEIGHTS_NAME = 'pytorch_model.bin'
+TF_WEIGHTS_NAME = 'model.ckpt'
 
 def gelu(x):
     """Implementation of the gelu activation function.
@@ -445,7 +447,8 @@ class BertPreTrainedModel(nn.Module):
             module.bias.data.zero_()
 
     @classmethod
-    def from_pretrained(cls, pretrained_model_name, state_dict=None, cache_dir=None, *inputs, **kwargs):
+    def from_pretrained(cls, pretrained_model_name, state_dict=None, cache_dir=None,
+                        from_tf=False, *inputs, **kwargs):
         """
         Instantiate a BertPreTrainedModel from a pre-trained model file or a pytorch state dict.
         Download and cache the pre-trained model file if needed.
@@ -463,6 +466,10 @@ class BertPreTrainedModel(nn.Module):
                 - a path or url to a pretrained model archive containing:
                     . `bert_config.json` a configuration file for the model
                     . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance
+                - a path or url to a pretrained model archive containing:
+                    . `bert_config.json` a configuration file for the model
+                    . `model.ckpt` a TensorFlow checkpoint
+            from_tf: should we load the weights from a locally saved TensorFlow checkpoint
             cache_dir: an optional path to a folder in which the pre-trained models will be cached.
             state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models
             *inputs, **kwargs: additional input for the specific Bert class
@@ -490,7 +497,7 @@ class BertPreTrainedModel(nn.Module):
             logger.info("loading archive file {} from cache at {}".format(
                 archive_file, resolved_archive_file))
         tempdir = None
-        if os.path.isdir(resolved_archive_file):
+        if os.path.isdir(resolved_archive_file) or from_tf:
             serialization_dir = resolved_archive_file
         else:
             # Extract archive to temp dir
@@ -506,10 +513,17 @@ class BertPreTrainedModel(nn.Module):
         logger.info("Model config {}".format(config))
         # Instantiate model.
         model = cls(config, *inputs, **kwargs)
-        if state_dict is None:
+        if state_dict is None and not from_tf:
             weights_path = os.path.join(serialization_dir, WEIGHTS_NAME)
             state_dict = torch.load(weights_path)
-
+        if tempdir:
+            # Clean up temp dir
+            shutil.rmtree(tempdir)
+        if from_tf:
+            # Directly load from a TensorFlow checkpoint
+            weights_path = os.path.join(serialization_dir, TF_WEIGHTS_NAME)
+            return load_tf_weights_in_bert(model, weights_path)
+        # Load from a PyTorch state_dict
         old_keys = []
         new_keys = []
         for key in state_dict.keys():
@@ -550,9 +564,6 @@ class BertPreTrainedModel(nn.Module):
         if len(error_msgs) > 0:
             raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
                                model.__class__.__name__, "\n\t".join(error_msgs)))
-        if tempdir:
-            # Clean up temp dir
-            shutil.rmtree(tempdir)
         return model
 
 
diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py
index c3cd165e68..cd72beba66 100644
--- a/pytorch_pretrained_bert/modeling_openai.py
+++ b/pytorch_pretrained_bert/modeling_openai.py
@@ -32,14 +32,14 @@ from torch.nn.parameter import Parameter
 
 from .modeling import BertLayerNorm as LayerNorm
 from .file_utils import cached_path
+from .convert_openai_checkpoint_to_pytorch import load_tf_weights_in_openai_gpt
 
 logger = logging.getLogger(__name__)
 
-PRETRAINED_MODEL_ARCHIVE_MAP = {
-    'openai-gpt': "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt.tar.gz",
-}
-CONFIG_NAME = 'openai_gpt_config.json'
-WEIGHTS_NAME = 'pytorch_model.bin'
+PRETRAINED_MODEL_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt.tar.gz"}
+CONFIG_NAME = "openai_gpt_config.json"
+WEIGHTS_NAME = "pytorch_model.bin"
+
 
 def gelu(x):
     return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
@@ -49,27 +49,27 @@ def swish(x):
     return x * torch.sigmoid(x)
 
 
-ACT_FNS = {
-    'relu': nn.ReLU,
-    'swish': swish,
-    'gelu': gelu
-}
+ACT_FNS = {"relu": nn.ReLU, "swish": swish, "gelu": gelu}
+
 
 class OpenAIGPTConfig(object):
     """Configuration class to store the configuration of a `OpenAIGPTModel`.
     """
-    def __init__(self,
-                 vocab_size_or_config_json_file=40478,
-                 n_special=0,
-                 n_ctx=512,
-                 n_embd=768,
-                 n_layer=12,
-                 n_head=12,
-                 afn="gelu",
-                 resid_pdrop=0.1,
-                 embd_pdrop=0.1,
-                 attn_pdrop=0.1,
-                 initializer_range=0.02):
+
+    def __init__(
+        self,
+        vocab_size_or_config_json_file=40478,
+        n_special=0,
+        n_ctx=512,
+        n_embd=768,
+        n_layer=12,
+        n_head=12,
+        afn="gelu",
+        resid_pdrop=0.1,
+        embd_pdrop=0.1,
+        attn_pdrop=0.1,
+        initializer_range=0.02,
+    ):
         """Constructs OpenAIGPTConfig.
 
         Args:
@@ -91,7 +91,7 @@ class OpenAIGPTConfig(object):
                 initializing all weight matrices.
         """
         if isinstance(vocab_size_or_config_json_file, str):
-            with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
+            with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
                 json_config = json.loads(reader.read())
             for key, value in json_config.items():
                 self.__dict__[key] = value
@@ -108,8 +108,10 @@ class OpenAIGPTConfig(object):
             self.attn_pdrop = attn_pdrop
             self.initializer_range = initializer_range
         else:
-            raise ValueError("First argument must be either a vocabulary size (int)"
-                             "or the path to a pretrained model config file (str)")
+            raise ValueError(
+                "First argument must be either a vocabulary size (int)"
+                "or the path to a pretrained model config file (str)"
+            )
 
     @property
     def total_num_embeddings(self):
@@ -126,7 +128,7 @@ class OpenAIGPTConfig(object):
     @classmethod
     def from_json_file(cls, json_file):
         """Constructs a `OpenAIGPTConfig` from a json file of parameters."""
-        with open(json_file, "r", encoding='utf-8') as reader:
+        with open(json_file, "r", encoding="utf-8") as reader:
             text = reader.read()
         return cls.from_dict(json.loads(text))
 
@@ -142,6 +144,7 @@ class OpenAIGPTConfig(object):
         """Serializes this instance to a JSON string."""
         return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
 
+
 class Conv1D(nn.Module):
     def __init__(self, nf, rf, nx):
         super(Conv1D, self).__init__()
@@ -171,7 +174,7 @@ class Attention(nn.Module):
         n_state = nx  # in Attention: n_state=768 (nx=n_embd)
         # [switch nx => n_state from Block to Attention to keep identical to TF implem]
         assert n_state % config.n_head == 0
-        self.register_buffer('b', torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx))
+        self.register_buffer("b", torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx))
         self.n_head = config.n_head
         self.split_size = n_state
         self.scale = scale
@@ -186,7 +189,7 @@ class Attention(nn.Module):
             w = w / math.sqrt(v.size(-1))
         # w = w * self.b + -1e9 * (1 - self.b)  # TF implem method: mask_attn_weights
         # XD: self.b may be larger than w, so we need to crop it
-        b = self.b[:, :, :w.size(-2), :w.size(-1)]
+        b = self.b[:, :, : w.size(-2), : w.size(-1)]
         w = w * b + -1e9 * (1 - b)
 
         w = nn.Softmax(dim=-1)(w)
@@ -262,7 +265,7 @@ class OpenAIGPTLMHead(nn.Module):
     def set_embeddings_weights(self, model_embeddings_weights):
         embed_shape = model_embeddings_weights.shape
         self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False)
-        self.decoder.weight = model_embeddings_weights # Tied weights
+        self.decoder.weight = model_embeddings_weights  # Tied weights
 
     def forward(self, hidden_state):
         # Truncated Language modeling logits (we remove the last token)
@@ -281,14 +284,15 @@ class OpenAIGPTMultipleChoiceHead(nn.Module):
         self.dropout = nn.Dropout2d(config.resid_pdrop)  # To reproduce the noise_shape parameter of TF implementation
         self.linear = nn.Linear(config.n_embd, 1)
 
-        nn.init.normal_(self.linear.weight, std = 0.02)
+        nn.init.normal_(self.linear.weight, std=0.02)
         nn.init.normal_(self.linear.bias, 0)
 
-    def forward(self, hidden_states, multiple_choice_token_mask):
+    def forward(self, hidden_states, mc_token_mask):
         # Classification logits
         # hidden_states = hidden_states.view(-1, self.n_embd)
-        # multiple_choice_token_mask = multiple_choice_token_mask.view(-1, 1).expand_as(hidden_states)
-        multiple_choice_h = hidden_states * multiple_choice_token_mask.unsqueeze(-1)
+        # mc_token_mask = mc_token_mask.view(-1, 1).expand_as(hidden_states)
+        mc_token_mask = mc_token_mask.float()
+        multiple_choice_h = hidden_states * mc_token_mask.unsqueeze(-1)
         multiple_choice_h = multiple_choice_h.sum(dim=-2)
         # flat = x[..., 0].contiguous().view(-1)
         # multiple_choice_h = multiple_choice_h[flat == self.multiple_choice_token, :]
@@ -307,6 +311,7 @@ class OpenAIGPTPreTrainedModel(nn.Module):
     """ An abstract class to handle weights initialization and
         a simple interface for dowloading and loading pretrained models.
     """
+
     def __init__(self, config, *inputs, **kwargs):
         super(OpenAIGPTPreTrainedModel, self).__init__()
         if not isinstance(config, OpenAIGPTConfig):
@@ -315,7 +320,8 @@ class OpenAIGPTPreTrainedModel(nn.Module):
                 "To create a model from a pretrained model use "
                 "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
                     self.__class__.__name__, self.__class__.__name__
-                ))
+                )
+            )
         self.config = config
 
     def init_weights(self, module):
@@ -335,8 +341,9 @@ class OpenAIGPTPreTrainedModel(nn.Module):
         pass
 
     @classmethod
-    def from_pretrained(cls, pretrained_model_name, num_special_tokens=0, state_dict=None, cache_dir=None,
-                        *inputs, **kwargs):
+    def from_pretrained(
+        cls, pretrained_model_name, num_special_tokens=None, state_dict=None, cache_dir=None, from_tf=False, *inputs, **kwargs
+    ):
         """
         Instantiate a OpenAIGPTPreTrainedModel from a pre-trained model file or a pytorch state dict.
         Download and cache the pre-trained model file if needed.
@@ -348,6 +355,10 @@ class OpenAIGPTPreTrainedModel(nn.Module):
                 - a path or url to a pretrained model archive containing:
                     . `openai_gpt_config.json` a configuration file for the model
                     . `pytorch_model.bin` a PyTorch dump of a OpenAIGPTModel instance
+                - a path or url to a pretrained model archive containing:
+                    . `openai_gpt_config.json` a configuration file for the model
+                    . a series of NumPy files containing OpenAI TensorFlow trained weights
+            from_tf: should we load the weights from a locally saved TensorFlow checkpoint
             cache_dir: an optional path to a folder in which the pre-trained models will be cached.
             state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of pre-trained models
             *inputs, **kwargs: additional input for the specific Bert class
@@ -365,24 +376,22 @@ class OpenAIGPTPreTrainedModel(nn.Module):
                 "Model name '{}' was not found in model name list ({}). "
                 "We assumed '{}' was a path or url but couldn't find any file "
                 "associated to this path or url.".format(
-                    pretrained_model_name,
-                    ', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()),
-                    archive_file))
+                    pretrained_model_name, ", ".join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), archive_file
+                )
+            )
             return None
         if resolved_archive_file == archive_file:
             logger.info("loading archive file {}".format(archive_file))
         else:
-            logger.info("loading archive file {} from cache at {}".format(
-                archive_file, resolved_archive_file))
+            logger.info("loading archive file {} from cache at {}".format(archive_file, resolved_archive_file))
         tempdir = None
         if os.path.isdir(resolved_archive_file):
             serialization_dir = resolved_archive_file
         else:
             # Extract archive to temp dir
             tempdir = tempfile.mkdtemp()
-            logger.info("extracting archive file {} to temp dir {}".format(
-                resolved_archive_file, tempdir))
-            with tarfile.open(resolved_archive_file, 'r:gz') as archive:
+            logger.info("extracting archive file {} to temp dir {}".format(resolved_archive_file, tempdir))
+            with tarfile.open(resolved_archive_file, "r:gz") as archive:
                 archive.extractall(tempdir)
             serialization_dir = tempdir
         # Load config
@@ -391,18 +400,24 @@ class OpenAIGPTPreTrainedModel(nn.Module):
         logger.info("Model config {}".format(config))
         # Instantiate model.
         model = cls(config, *inputs, **kwargs)
-        if state_dict is None:
+        if state_dict is None and not from_tf:
             weights_path = os.path.join(serialization_dir, WEIGHTS_NAME)
-            state_dict = torch.load(weights_path)
+            state_dict = torch.load(weights_path, map_location='cpu' if not torch.cuda.is_available() else None)
+        if tempdir:
+            # Clean up temp dir
+            shutil.rmtree(tempdir)
+        if from_tf:
+            # Directly load from a TensorFlow checkpoint (stored as NumPy array)
+            return load_tf_weights_in_openai_gpt(model, serialization_dir)
 
         old_keys = []
         new_keys = []
         for key in state_dict.keys():
             new_key = None
-            if 'gamma' in key:
-                new_key = key.replace('gamma', 'weight')
-            if 'beta' in key:
-                new_key = key.replace('beta', 'bias')
+            if "gamma" in key:
+                new_key = key.replace("gamma", "weight")
+            if "beta" in key:
+                new_key = key.replace("beta", "bias")
             if new_key:
                 old_keys.append(key)
                 new_keys.append(new_key)
@@ -413,34 +428,36 @@ class OpenAIGPTPreTrainedModel(nn.Module):
         unexpected_keys = []
         error_msgs = []
         # copy state_dict so _load_from_state_dict can modify it
-        metadata = getattr(state_dict, '_metadata', None)
+        metadata = getattr(state_dict, "_metadata", None)
         state_dict = state_dict.copy()
         if metadata is not None:
             state_dict._metadata = metadata
 
-        def load(module, prefix=''):
+        def load(module, prefix=""):
             local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
             module._load_from_state_dict(
-                state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)
+                state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs
+            )
             for name, child in module._modules.items():
                 if child is not None:
-                    load(child, prefix + name + '.')
-        load(model.transformer if hasattr(model, 'transformer') else model, prefix='')
+                    load(child, prefix + name + ".")
+
+        load(model.transformer if hasattr(model, "transformer") else model, prefix="")
         if len(missing_keys) > 0:
-            logger.info("Weights of {} not initialized from pretrained model: {}".format(
-                model.__class__.__name__, missing_keys))
+            logger.info(
+                "Weights of {} not initialized from pretrained model: {}".format(model.__class__.__name__, missing_keys)
+            )
         if len(unexpected_keys) > 0:
-            logger.info("Weights from pretrained model not used in {}: {}".format(
-                model.__class__.__name__, unexpected_keys))
+            logger.info(
+                "Weights from pretrained model not used in {}: {}".format(model.__class__.__name__, unexpected_keys)
+            )
         if len(error_msgs) > 0:
-            raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
-                               model.__class__.__name__, "\n\t".join(error_msgs)))
+            raise RuntimeError(
+                "Error(s) in loading state_dict for {}:\n\t{}".format(model.__class__.__name__, "\n\t".join(error_msgs))
+            )
         # Add additional embeddings for special tokens if needed
-        if num_special_tokens != config.n_special:
+        if num_special_tokens is not None and num_special_tokens != config.n_special:
             model.set_num_special_tokens(num_special_tokens)
-        if tempdir:
-            # Clean up temp dir
-            shutil.rmtree(tempdir)
         return model
 
 
@@ -495,6 +512,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
     hidden_states = model(input_ids)
     ```
     """
+
     def __init__(self, config):
         super(OpenAIGPTModel, self).__init__(config)
         total_embeddings_size = config.vocab_size + config.n_special + config.n_ctx
@@ -516,8 +534,8 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
         # Initialize all new embeddings (in particular the special tokens)
         self.init_weights(self.embed)
         # Copy word and positional embeddings from the previous weights
-        self.embed.weight.data[:self.config.vocab_size, :] = old_embed.weight.data[:self.config.vocab_size, :]
-        self.embed.weight.data[-self.config.n_ctx:, :] = old_embed.weight.data[-self.config.n_ctx:, :]
+        self.embed.weight.data[: self.config.vocab_size, :] = old_embed.weight.data[: self.config.vocab_size, :]
+        self.embed.weight.data[-self.config.n_ctx :, :] = old_embed.weight.data[-self.config.n_ctx :, :]
 
     def forward(self, input_ids, position_ids=None, token_type_ids=None):
         if position_ids is None:
@@ -544,6 +562,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
             hidden_states = block(hidden_states)
         return hidden_states.view(*input_shape, hidden_states.size(-1))
 
+
 class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
     """OpenAI GPT model with a Language Modeling head ("Improving Language Understanding by Generative Pre-Training").
 
@@ -602,6 +621,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
     lm_logits = model(input_ids)
     ```
     """
+
     def __init__(self, config):
         super(OpenAIGPTLMHeadModel, self).__init__(config)
         self.transformer = OpenAIGPTModel(config)
@@ -622,6 +642,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
             return loss
         return lm_logits
 
+
 class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
     """OpenAI GPT model with a Language Modeling and a Multiple Choice heads ("Improving Language Understanding by Generative Pre-Training").
 
@@ -653,7 +674,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
     Inputs:
         `input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length]
             with the word BPE token indices selected in the range [0, config.vocab_size[
-        `multiple_choice_token_mask`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length]
+        `mc_token_mask`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length]
             with a value of 1 were the last hidden state is (usually the [CLS] token) and 0 otherwise.
         `position_ids`: an optional torch.LongTensor with the same shape as input_ids
             with the position indices (selected in the range [config.vocab_size + config.n_special,
@@ -678,14 +699,15 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
     ```python
     # Already been converted into BPE token ids
     input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-    multiple_choice_token_mask = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+    mc_token_mask = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
 
     config = modeling_openai.OpenAIGPTConfig()
 
     model = modeling_openai.OpenAIGPTLMHeadModel(config)
-    lm_logits, multiple_choice_logits = model(input_ids, multiple_choice_token_mask)
+    lm_logits, multiple_choice_logits = model(input_ids, mc_token_mask)
     ```
     """
+
     def __init__(self, config):
         super(OpenAIGPTDoubleHeadsModel, self).__init__(config)
         self.transformer = OpenAIGPTModel(config)
@@ -698,18 +720,17 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
         self.transformer.set_num_special_tokens(num_special_tokens)
         self.lm_head.set_embeddings_weights(self.transformer.embed.weight)
 
-    def forward(self, input_ids, multiple_choice_token_mask, position_ids=None, token_type_ids=None,
-                lm_labels=None, multiple_choice_labels=None):
+    def forward(self, input_ids, mc_token_mask, lm_labels=None, mc_labels=None, token_type_ids=None, position_ids=None):
         hidden_states = self.transformer(input_ids, position_ids, token_type_ids)
         lm_logits = self.lm_head(hidden_states)
-        multiple_choice_logits = self.multiple_choice_head(hidden_states, multiple_choice_token_mask)
+        mc_logits = self.multiple_choice_head(hidden_states, mc_token_mask)
         losses = []
         if lm_labels is not None:
             loss_fct = CrossEntropyLoss(ignore_index=-1)
             losses.append(loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1)))
-        if multiple_choice_labels is not None:
+        if mc_labels is not None:
             loss_fct = CrossEntropyLoss()
-            losses.append(loss_fct(multiple_choice_logits, multiple_choice_labels.view(-1)))
+            losses.append(loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1)))
         if losses:
             return losses
-        return lm_logits, multiple_choice_logits
+        return lm_logits, mc_logits
diff --git a/pytorch_pretrained_bert/modeling_transfo_xl.py b/pytorch_pretrained_bert/modeling_transfo_xl.py
index ba8994fd8a..54c387c34b 100644
--- a/pytorch_pretrained_bert/modeling_transfo_xl.py
+++ b/pytorch_pretrained_bert/modeling_transfo_xl.py
@@ -37,6 +37,7 @@ from torch.nn.parameter import Parameter
 from .modeling import BertLayerNorm as LayerNorm
 from .modeling_transfo_xl_utilities import ProjectedAdaptiveLogSoftmax, sample_logits
 from .file_utils import cached_path
+from .convert_transfo_xl_checkpoint_to_pytorch import load_tf_weights_in_transfo_xl
 
 logger = logging.getLogger(__name__)
 
@@ -48,6 +49,7 @@ PRETRAINED_CONFIG_ARCHIVE_MAP = {
 }
 CONFIG_NAME = 'transfo_xl_config.json'
 WEIGHTS_NAME = 'pytorch_model.bin'
+TF_WEIGHTS_NAME = 'model.ckpt'
 
 class TransfoXLConfig(object):
     """Configuration class to store the configuration of a `TransfoXLModel`.
@@ -749,7 +751,7 @@ class TransfoXLPreTrainedModel(nn.Module):
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, state_dict=None, cache_dir=None,
-                        *inputs, **kwargs):
+                        from_tf=False, *inputs, **kwargs):
         """
         Instantiate a TransfoXLPreTrainedModel from a pre-trained model file or a pytorch state dict.
         Download and cache the pre-trained model file if needed.
@@ -761,6 +763,10 @@ class TransfoXLPreTrainedModel(nn.Module):
                 - a path or url to a pretrained model archive containing:
                     . `transfo_xl_config.json` a configuration file for the model
                     . `pytorch_model.bin` a PyTorch dump of a TransfoXLModel instance
+                - a path or url to a pretrained model archive containing:
+                    . `transfo_xl_config.json` a configuration file for the model
+                    . `model.ckpt` a TensorFlow checkpoint
+            from_tf: should we load the weights from a locally saved TensorFlow checkpoint
             cache_dir: an optional path to a folder in which the pre-trained models will be cached.
             state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of pre-trained models
             *inputs, **kwargs: additional input for the specific Bert class
@@ -799,9 +805,12 @@ class TransfoXLPreTrainedModel(nn.Module):
         logger.info("Model config {}".format(config))
         # Instantiate model.
         model = cls(config, *inputs, **kwargs)
-        if state_dict is None:
+        if state_dict is None and not from_tf:
             state_dict = torch.load(resolved_archive_file)
-
+        if from_tf:
+            # Directly load from a TensorFlow checkpoint
+            weights_path = os.path.join(serialization_dir, TF_WEIGHTS_NAME)
+            return load_tf_weights_in_transfo_xl(model, weights_path)
         missing_keys = []
         unexpected_keys = []
         error_msgs = []
diff --git a/pytorch_pretrained_bert/tokenization_openai.py b/pytorch_pretrained_bert/tokenization_openai.py
index 1492075817..e5e4dbda39 100644
--- a/pytorch_pretrained_bert/tokenization_openai.py
+++ b/pytorch_pretrained_bert/tokenization_openai.py
@@ -130,6 +130,9 @@ class OpenAIGPTTokenizer(object):
         else:
             self.special_tokens = dict((tok, len(self.encoder) + i) for i, tok in enumerate(special_tokens))
 
+    def __len__(self):
+        return len(self.encoder) + len(self.special_tokens)
+
     def set_special_tokens(self, special_tokens):
         self.special_tokens = dict((tok, len(self.encoder) + i) for i, tok in enumerate(special_tokens))
 

From b12616fd8ed3bae53733bf131b987e2a94835aa2 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 28 Jan 2019 17:03:39 +0100
Subject: [PATCH 20/82] updating code organization to fix imports

---
 .../convert_openai_checkpoint_to_pytorch.py   |  62 +--------
 .../convert_tf_checkpoint_to_pytorch.py       |  53 +-------
 ...onvert_transfo_xl_checkpoint_to_pytorch.py | 118 +-----------------
 pytorch_pretrained_bert/modeling.py           |  54 +++++++-
 pytorch_pretrained_bert/modeling_openai.py    |  62 ++++++++-
 .../modeling_transfo_xl.py                    | 118 +++++++++++++++++-
 .../modeling_transfo_xl_utilities.py          |   2 +-
 7 files changed, 235 insertions(+), 234 deletions(-)

diff --git a/pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py b/pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py
index 40740083d0..83eed843bf 100755
--- a/pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py
+++ b/pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py
@@ -24,7 +24,7 @@ import argparse
 import torch
 import numpy as np
 
-from .modeling_openai import OpenAIGPTConfig, OpenAIGPTModel, CONFIG_NAME, WEIGHTS_NAME
+from .modeling_openai import load_tf_weights_in_openai_gpt, OpenAIGPTConfig, OpenAIGPTModel, CONFIG_NAME, WEIGHTS_NAME
 
 def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_config_file, pytorch_dump_folder_path):
     # Construct model
@@ -46,66 +46,6 @@ def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_c
     with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
         f.write(config.to_json_string())
 
-def load_tf_weights_in_openai_gpt(model, openai_checkpoint_folder_path):
-    """ Load tf pre-trained weights in a pytorch model (from NumPy arrays here)
-    """
-    print("Loading weights...")
-    names = json.load(open(openai_checkpoint_folder_path + '/parameters_names.json', "r", encoding='utf-8'))
-    shapes = json.load(open(openai_checkpoint_folder_path + '/params_shapes.json', "r", encoding='utf-8'))
-    offsets = np.cumsum([np.prod(shape) for shape in shapes])
-    init_params = [np.load(openai_checkpoint_folder_path + '/params_{}.npy'.format(n)) for n in range(10)]
-    init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1]
-    init_params = [param.reshape(shape) for param, shape in zip(init_params, shapes)]
-
-    init_params[0] = np.concatenate([init_params[1], init_params[0]], 0)
-    del init_params[1]
-    init_params = [arr.squeeze() for arr in init_params]
-
-    try:
-        assert model.embed.weight.shape == init_params[0].shape
-    except AssertionError as e:
-        e.args += (model.embed.weight.shape, init_params[0].shape)
-        raise
-
-    model.embed.weight.data = torch.from_numpy(init_params[0])
-    names.pop(0)
-    init_params.pop(0)
-
-    for name, array in zip(names, init_params): # names[1:n_transfer], init_params[1:n_transfer]):
-        name = name[6:]  # skip "model/"
-        assert name[-2:] == ":0"
-        name = name[:-2]
-        name = name.split('/')
-        pointer = model
-        for m_name in name:
-            if re.fullmatch(r'[A-Za-z]+\d+', m_name):
-                l = re.split(r'(\d+)', m_name)
-            else:
-                l = [m_name]
-            if l[0] == 'g':
-                pointer = getattr(pointer, 'weight')
-            elif l[0] == 'b':
-                pointer = getattr(pointer, 'bias')
-            elif l[0] == 'w':
-                pointer = getattr(pointer, 'weight')
-            else:
-                pointer = getattr(pointer, l[0])
-            if len(l) >= 2:
-                num = int(l[1])
-                pointer = pointer[num]
-        try:
-            assert pointer.shape == array.shape
-        except AssertionError as e:
-            e.args += (pointer.shape, array.shape)
-            raise
-        try:
-            assert pointer.shape == array.shape
-        except AssertionError as e:
-            e.args += (pointer.shape, array.shape)
-            raise
-        print("Initialize PyTorch weight {}".format(name))
-        pointer.data = torch.from_numpy(array)
-    return model
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
diff --git a/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py b/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py
index 74622bbb70..c5e3090c8d 100755
--- a/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py
+++ b/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py
@@ -25,7 +25,7 @@ import tensorflow as tf
 import torch
 import numpy as np
 
-from .modeling import BertConfig, BertForPreTraining
+from .modeling import BertConfig, BertForPreTraining, load_tf_weights_in_bert
 
 def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):
     # Initialise PyTorch model
@@ -40,57 +40,6 @@ def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytor
     print("Save PyTorch model to {}".format(pytorch_dump_path))
     torch.save(model.state_dict(), pytorch_dump_path)
 
-def load_tf_weights_in_bert(model, tf_checkpoint_path):
-    """ Load tf checkpoints in a pytorch model
-    """
-    tf_path = os.path.abspath(tf_checkpoint_path)
-    print("Converting TensorFlow checkpoint from {}".format(tf_path))
-    # Load weights from TF model
-    init_vars = tf.train.list_variables(tf_path)
-    names = []
-    arrays = []
-    for name, shape in init_vars:
-        print("Loading TF weight {} with shape {}".format(name, shape))
-        array = tf.train.load_variable(tf_path, name)
-        names.append(name)
-        arrays.append(array)
-
-    for name, array in zip(names, arrays):
-        name = name.split('/')
-        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
-        # which are not required for using pretrained model
-        if any(n in ["adam_v", "adam_m"] for n in name):
-            print("Skipping {}".format("/".join(name)))
-            continue
-        pointer = model
-        for m_name in name:
-            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
-                l = re.split(r'_(\d+)', m_name)
-            else:
-                l = [m_name]
-            if l[0] == 'kernel' or l[0] == 'gamma':
-                pointer = getattr(pointer, 'weight')
-            elif l[0] == 'output_bias' or l[0] == 'beta':
-                pointer = getattr(pointer, 'bias')
-            elif l[0] == 'output_weights':
-                pointer = getattr(pointer, 'weight')
-            else:
-                pointer = getattr(pointer, l[0])
-            if len(l) >= 2:
-                num = int(l[1])
-                pointer = pointer[num]
-        if m_name[-11:] == '_embeddings':
-            pointer = getattr(pointer, 'weight')
-        elif m_name == 'kernel':
-            array = np.transpose(array)
-        try:
-            assert pointer.shape == array.shape
-        except AssertionError as e:
-            e.args += (pointer.shape, array.shape)
-            raise
-        print("Initialize PyTorch weight {}".format(name))
-        pointer.data = torch.from_numpy(array)
-    return model
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
diff --git a/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py b/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py
index 4dbb2067d6..594d01bfa3 100755
--- a/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py
+++ b/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py
@@ -27,7 +27,7 @@ import tensorflow as tf
 import torch
 import numpy as np
 
-from pytorch_pretrained_bert.modeling_transfo_xl import TransfoXLConfig, TransfoXLModel, CONFIG_NAME, WEIGHTS_NAME
+from pytorch_pretrained_bert.modeling_transfo_xl import TransfoXLConfig, TransfoXLModel, CONFIG_NAME, WEIGHTS_NAME, load_tf_weights_in_transfo_xl
 from pytorch_pretrained_bert.tokenization_transfo_xl import VOCAB_NAME, CORPUS_NAME
 
 # We do this to be able to load the python 2 datasets pickles
@@ -38,74 +38,6 @@ data_utils.Corpus = data_utils.TransfoXLCorpus
 sys.modules['data_utils'] = data_utils
 sys.modules['vocabulary'] = data_utils
 
-def build_tf_to_pytorch_map(model, config):
-    """ A map of modules from TF to PyTorch.
-        This time I use a map to keep the PyTorch model as identical to the original PyTorch model as possible.
-    """
-    tf_to_pt_map = {}
-    # Embeddings cutoffs
-    for i, (embed_l, proj_l) in enumerate(zip(model.word_emb.emb_layers, model.word_emb.emb_projs)):
-        layer_str = "transformer/adaptive_embed/cutoff_%d/" % i
-        tf_to_pt_map.update({
-            layer_str + 'lookup_table': embed_l.weight,
-            layer_str + 'proj_W': proj_l
-            })
-
-    # Transformer blocks
-    for i, b in enumerate(model.layers):
-        layer_str = "transformer/layer_%d/" % i
-        tf_to_pt_map.update({
-            layer_str + "rel_attn/LayerNorm/gamma": b.dec_attn.layer_norm.weight,
-            layer_str + "rel_attn/LayerNorm/beta": b.dec_attn.layer_norm.bias,
-            layer_str + "rel_attn/o/kernel": b.dec_attn.o_net.weight,
-            layer_str + "rel_attn/qkv/kernel": b.dec_attn.qkv_net.weight,
-            layer_str + "rel_attn/r/kernel": b.dec_attn.r_net.weight,
-            layer_str + "ff/LayerNorm/gamma": b.pos_ff.layer_norm.weight,
-            layer_str + "ff/LayerNorm/beta": b.pos_ff.layer_norm.bias,
-            layer_str + "ff/layer_1/kernel": b.pos_ff.CoreNet[0].weight,
-            layer_str + "ff/layer_1/bias": b.pos_ff.CoreNet[0].bias,
-            layer_str + "ff/layer_2/kernel": b.pos_ff.CoreNet[3].weight,
-            layer_str + "ff/layer_2/bias": b.pos_ff.CoreNet[3].bias,
-        })
-
-    # Adaptive Softmax
-    tf_to_pt_map.update({
-        "transformer/adaptive_softmax/cutoff_0/cluster_W": model.crit.cluster_weight,
-        "transformer/adaptive_softmax/cutoff_0/cluster_b": model.crit.cluster_bias})
-    for i, (out_l, proj_l, tie_proj) in enumerate(zip(
-                            model.crit.out_layers,
-                            model.crit.out_projs,
-                            config.tie_projs)):
-        layer_str = "transformer/adaptive_softmax/cutoff_%d/" % i
-        if config.tie_weight:
-            tf_to_pt_map.update({
-                layer_str + 'b': out_l.bias})
-        else:
-            raise NotImplementedError
-            # I don't think this is implemented in the TF code
-            tf_to_pt_map.update({
-                layer_str + 'lookup_table': out_l.weight,
-                layer_str + 'b': out_l.bias})
-        if not tie_proj:
-            tf_to_pt_map.update({
-                layer_str + 'proj': proj_l
-                })
-
-    # Relative positioning biases
-    if config.untie_r:
-        r_r_list = []
-        r_w_list = []
-        for b in model.layers:
-            r_r_list.append(b.dec_attn.r_r_bias)
-            r_w_list.append(b.dec_attn.r_w_bias)
-    else:
-        r_r_list = [model.r_r_bias]
-        r_w_list = [model.r_w_bias]
-    tf_to_pt_map.update({
-        'transformer/r_r_bias': r_r_list,
-        'transformer/r_w_bias': r_w_list})
-    return tf_to_pt_map
-
 def convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path,
                                              transfo_xl_config_file,
                                              pytorch_dump_folder_path,
@@ -150,54 +82,6 @@ def convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path,
             f.write(config.to_json_string())
 
 
-def load_tf_weights_in_transfo_xl(model, config, tf_path):
-    """ Load tf checkpoints in a pytorch model
-    """
-    # Build TF to PyTorch weights loading map
-    tf_to_pt_map = build_tf_to_pytorch_map(model, config)
-
-    # Load weights from TF model
-    init_vars = tf.train.list_variables(tf_path)
-    tf_weights = {}
-    for name, shape in init_vars:
-        print("Loading TF weight {} with shape {}".format(name, shape))
-        array = tf.train.load_variable(tf_path, name)
-        tf_weights[name] = array
-
-    for name, pointer in tf_to_pt_map.items():
-        assert name in tf_weights
-        array = tf_weights[name]
-        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
-        # which are not required for using pretrained model
-        if 'kernel' in name or 'proj' in name:
-            array = np.transpose(array)
-        if ('r_r_bias' in name or 'r_w_bias' in name) and len(pointer) > 1:
-            # Here we will split the TF weigths
-            assert len(pointer) == array.shape[0]
-            for i, p_i in enumerate(pointer):
-                arr_i = array[i, ...]
-                try:
-                    assert p_i.shape == arr_i.shape
-                except AssertionError as e:
-                    e.args += (p_i.shape, arr_i.shape)
-                    raise
-                print("Initialize PyTorch weight {} for layer {}".format(name, i))
-                p_i.data = torch.from_numpy(arr_i)
-        else:
-            try:
-                assert pointer.shape == array.shape
-            except AssertionError as e:
-                e.args += (pointer.shape, array.shape)
-                raise
-            print("Initialize PyTorch weight {}".format(name))
-            pointer.data = torch.from_numpy(array)
-        tf_weights.pop(name, None)
-        tf_weights.pop(name + '/Adam', None)
-        tf_weights.pop(name + '/Adam_1', None)
-
-    print("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys())))
-    return model
-
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     ## Required parameters
diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py
index 1e6966757e..00e1d44870 100644
--- a/pytorch_pretrained_bert/modeling.py
+++ b/pytorch_pretrained_bert/modeling.py
@@ -33,7 +33,6 @@ from torch import nn
 from torch.nn import CrossEntropyLoss
 
 from .file_utils import cached_path
-from .convert_tf_checkpoint_to_pytorch import load_tf_weights_in_bert
 
 logger = logging.getLogger(__name__)
 
@@ -50,6 +49,59 @@ CONFIG_NAME = 'bert_config.json'
 WEIGHTS_NAME = 'pytorch_model.bin'
 TF_WEIGHTS_NAME = 'model.ckpt'
 
+def load_tf_weights_in_bert(model, tf_checkpoint_path):
+    """ Load tf checkpoints in a pytorch model
+    """
+    tf_path = os.path.abspath(tf_checkpoint_path)
+    print("Converting TensorFlow checkpoint from {}".format(tf_path))
+    # Load weights from TF model
+    init_vars = tf.train.list_variables(tf_path)
+    names = []
+    arrays = []
+    for name, shape in init_vars:
+        print("Loading TF weight {} with shape {}".format(name, shape))
+        array = tf.train.load_variable(tf_path, name)
+        names.append(name)
+        arrays.append(array)
+
+    for name, array in zip(names, arrays):
+        name = name.split('/')
+        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v
+        # which are not required for using pretrained model
+        if any(n in ["adam_v", "adam_m"] for n in name):
+            print("Skipping {}".format("/".join(name)))
+            continue
+        pointer = model
+        for m_name in name:
+            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
+                l = re.split(r'_(\d+)', m_name)
+            else:
+                l = [m_name]
+            if l[0] == 'kernel' or l[0] == 'gamma':
+                pointer = getattr(pointer, 'weight')
+            elif l[0] == 'output_bias' or l[0] == 'beta':
+                pointer = getattr(pointer, 'bias')
+            elif l[0] == 'output_weights':
+                pointer = getattr(pointer, 'weight')
+            else:
+                pointer = getattr(pointer, l[0])
+            if len(l) >= 2:
+                num = int(l[1])
+                pointer = pointer[num]
+        if m_name[-11:] == '_embeddings':
+            pointer = getattr(pointer, 'weight')
+        elif m_name == 'kernel':
+            array = np.transpose(array)
+        try:
+            assert pointer.shape == array.shape
+        except AssertionError as e:
+            e.args += (pointer.shape, array.shape)
+            raise
+        print("Initialize PyTorch weight {}".format(name))
+        pointer.data = torch.from_numpy(array)
+    return model
+
+
 def gelu(x):
     """Implementation of the gelu activation function.
         For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py
index cd72beba66..030e8912ae 100644
--- a/pytorch_pretrained_bert/modeling_openai.py
+++ b/pytorch_pretrained_bert/modeling_openai.py
@@ -32,7 +32,6 @@ from torch.nn.parameter import Parameter
 
 from .modeling import BertLayerNorm as LayerNorm
 from .file_utils import cached_path
-from .convert_openai_checkpoint_to_pytorch import load_tf_weights_in_openai_gpt
 
 logger = logging.getLogger(__name__)
 
@@ -40,6 +39,67 @@ PRETRAINED_MODEL_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.h
 CONFIG_NAME = "openai_gpt_config.json"
 WEIGHTS_NAME = "pytorch_model.bin"
 
+def load_tf_weights_in_openai_gpt(model, openai_checkpoint_folder_path):
+    """ Load tf pre-trained weights in a pytorch model (from NumPy arrays here)
+    """
+    print("Loading weights...")
+    names = json.load(open(openai_checkpoint_folder_path + '/parameters_names.json', "r", encoding='utf-8'))
+    shapes = json.load(open(openai_checkpoint_folder_path + '/params_shapes.json', "r", encoding='utf-8'))
+    offsets = np.cumsum([np.prod(shape) for shape in shapes])
+    init_params = [np.load(openai_checkpoint_folder_path + '/params_{}.npy'.format(n)) for n in range(10)]
+    init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1]
+    init_params = [param.reshape(shape) for param, shape in zip(init_params, shapes)]
+
+    init_params[0] = np.concatenate([init_params[1], init_params[0]], 0)
+    del init_params[1]
+    init_params = [arr.squeeze() for arr in init_params]
+
+    try:
+        assert model.embed.weight.shape == init_params[0].shape
+    except AssertionError as e:
+        e.args += (model.embed.weight.shape, init_params[0].shape)
+        raise
+
+    model.embed.weight.data = torch.from_numpy(init_params[0])
+    names.pop(0)
+    init_params.pop(0)
+
+    for name, array in zip(names, init_params): # names[1:n_transfer], init_params[1:n_transfer]):
+        name = name[6:]  # skip "model/"
+        assert name[-2:] == ":0"
+        name = name[:-2]
+        name = name.split('/')
+        pointer = model
+        for m_name in name:
+            if re.fullmatch(r'[A-Za-z]+\d+', m_name):
+                l = re.split(r'(\d+)', m_name)
+            else:
+                l = [m_name]
+            if l[0] == 'g':
+                pointer = getattr(pointer, 'weight')
+            elif l[0] == 'b':
+                pointer = getattr(pointer, 'bias')
+            elif l[0] == 'w':
+                pointer = getattr(pointer, 'weight')
+            else:
+                pointer = getattr(pointer, l[0])
+            if len(l) >= 2:
+                num = int(l[1])
+                pointer = pointer[num]
+        try:
+            assert pointer.shape == array.shape
+        except AssertionError as e:
+            e.args += (pointer.shape, array.shape)
+            raise
+        try:
+            assert pointer.shape == array.shape
+        except AssertionError as e:
+            e.args += (pointer.shape, array.shape)
+            raise
+        print("Initialize PyTorch weight {}".format(name))
+        pointer.data = torch.from_numpy(array)
+    return model
+
 
 def gelu(x):
     return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
diff --git a/pytorch_pretrained_bert/modeling_transfo_xl.py b/pytorch_pretrained_bert/modeling_transfo_xl.py
index 54c387c34b..6abc68abc2 100644
--- a/pytorch_pretrained_bert/modeling_transfo_xl.py
+++ b/pytorch_pretrained_bert/modeling_transfo_xl.py
@@ -37,7 +37,6 @@ from torch.nn.parameter import Parameter
 from .modeling import BertLayerNorm as LayerNorm
 from .modeling_transfo_xl_utilities import ProjectedAdaptiveLogSoftmax, sample_logits
 from .file_utils import cached_path
-from .convert_transfo_xl_checkpoint_to_pytorch import load_tf_weights_in_transfo_xl
 
 logger = logging.getLogger(__name__)
 
@@ -51,6 +50,123 @@ CONFIG_NAME = 'transfo_xl_config.json'
 WEIGHTS_NAME = 'pytorch_model.bin'
 TF_WEIGHTS_NAME = 'model.ckpt'
 
+def build_tf_to_pytorch_map(model, config):
+    """ A map of modules from TF to PyTorch.
+        This time I use a map to keep the PyTorch model as identical to the original PyTorch model as possible.
+    """
+    tf_to_pt_map = {}
+    # Embeddings cutoffs
+    for i, (embed_l, proj_l) in enumerate(zip(model.word_emb.emb_layers, model.word_emb.emb_projs)):
+        layer_str = "transformer/adaptive_embed/cutoff_%d/" % i
+        tf_to_pt_map.update({
+            layer_str + 'lookup_table': embed_l.weight,
+            layer_str + 'proj_W': proj_l
+            })
+
+    # Transformer blocks
+    for i, b in enumerate(model.layers):
+        layer_str = "transformer/layer_%d/" % i
+        tf_to_pt_map.update({
+            layer_str + "rel_attn/LayerNorm/gamma": b.dec_attn.layer_norm.weight,
+            layer_str + "rel_attn/LayerNorm/beta": b.dec_attn.layer_norm.bias,
+            layer_str + "rel_attn/o/kernel": b.dec_attn.o_net.weight,
+            layer_str + "rel_attn/qkv/kernel": b.dec_attn.qkv_net.weight,
+            layer_str + "rel_attn/r/kernel": b.dec_attn.r_net.weight,
+            layer_str + "ff/LayerNorm/gamma": b.pos_ff.layer_norm.weight,
+            layer_str + "ff/LayerNorm/beta": b.pos_ff.layer_norm.bias,
+            layer_str + "ff/layer_1/kernel": b.pos_ff.CoreNet[0].weight,
+            layer_str + "ff/layer_1/bias": b.pos_ff.CoreNet[0].bias,
+            layer_str + "ff/layer_2/kernel": b.pos_ff.CoreNet[3].weight,
+            layer_str + "ff/layer_2/bias": b.pos_ff.CoreNet[3].bias,
+        })
+
+    # Adaptive Softmax
+    tf_to_pt_map.update({
+        "transformer/adaptive_softmax/cutoff_0/cluster_W": model.crit.cluster_weight,
+        "transformer/adaptive_softmax/cutoff_0/cluster_b": model.crit.cluster_bias})
+    for i, (out_l, proj_l, tie_proj) in enumerate(zip(
+                            model.crit.out_layers,
+                            model.crit.out_projs,
+                            config.tie_projs)):
+        layer_str = "transformer/adaptive_softmax/cutoff_%d/" % i
+        if config.tie_weight:
+            tf_to_pt_map.update({
+                layer_str + 'b': out_l.bias})
+        else:
+            raise NotImplementedError
+            # I don't think this is implemented in the TF code
+            tf_to_pt_map.update({
+                layer_str + 'lookup_table': out_l.weight,
+                layer_str + 'b': out_l.bias})
+        if not tie_proj:
+            tf_to_pt_map.update({
+                layer_str + 'proj': proj_l
+                })
+
+    # Relative positioning biases
+    if config.untie_r:
+        r_r_list = []
+        r_w_list = []
+        for b in model.layers:
+            r_r_list.append(b.dec_attn.r_r_bias)
+            r_w_list.append(b.dec_attn.r_w_bias)
+    else:
+        r_r_list = [model.r_r_bias]
+        r_w_list = [model.r_w_bias]
+    tf_to_pt_map.update({
+        'transformer/r_r_bias': r_r_list,
+        'transformer/r_w_bias': r_w_list})
+    return tf_to_pt_map
+
+def load_tf_weights_in_transfo_xl(model, config, tf_path):
+    """ Load tf checkpoints in a pytorch model
+    """
+    # Build TF to PyTorch weights loading map
+    tf_to_pt_map = build_tf_to_pytorch_map(model, config)
+
+    # Load weights from TF model
+    init_vars = tf.train.list_variables(tf_path)
+    tf_weights = {}
+    for name, shape in init_vars:
+        print("Loading TF weight {} with shape {}".format(name, shape))
+        array = tf.train.load_variable(tf_path, name)
+        tf_weights[name] = array
+
+    for name, pointer in tf_to_pt_map.items():
+        assert name in tf_weights
+        array = tf_weights[name]
+        # Kernel and projection arrays are transposed here, before the shape
+        # checks below compare them with the matching PyTorch parameters
+        if 'kernel' in name or 'proj' in name:
+            array = np.transpose(array)
+        if ('r_r_bias' in name or 'r_w_bias' in name) and len(pointer) > 1:
+            # Here we will split the TF weights
+            assert len(pointer) == array.shape[0]
+            for i, p_i in enumerate(pointer):
+                arr_i = array[i, ...]
+                try:
+                    assert p_i.shape == arr_i.shape
+                except AssertionError as e:
+                    e.args += (p_i.shape, arr_i.shape)
+                    raise
+                print("Initialize PyTorch weight {} for layer {}".format(name, i))
+                p_i.data = torch.from_numpy(arr_i)
+        else:
+            try:
+                assert pointer.shape == array.shape
+            except AssertionError as e:
+                e.args += (pointer.shape, array.shape)
+                raise
+            print("Initialize PyTorch weight {}".format(name))
+            pointer.data = torch.from_numpy(array)
+        tf_weights.pop(name, None)
+        tf_weights.pop(name + '/Adam', None)
+        tf_weights.pop(name + '/Adam_1', None)
+
+    print("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys())))
+    return model
+
+
 class TransfoXLConfig(object):
     """Configuration class to store the configuration of a `TransfoXLModel`.
     """
diff --git a/pytorch_pretrained_bert/modeling_transfo_xl_utilities.py b/pytorch_pretrained_bert/modeling_transfo_xl_utilities.py
index a9ead38faf..4cd04b67a7 100644
--- a/pytorch_pretrained_bert/modeling_transfo_xl_utilities.py
+++ b/pytorch_pretrained_bert/modeling_transfo_xl_utilities.py
@@ -291,7 +291,7 @@ if __name__ == '__main__':
     # sampler = LogUniformSampler(n_vocab, unique=False)
     # new_labels, sample, sample_prob = sampler.sample(n_sample, labels)
 
-    sampler = LogUniformSampler(n_vocab, unique=True)
+    sampler = LogUniformSampler(n_vocab, n_sample)#, unique=True)
     # true_probs, samp_probs, neg_samples = sampler.sample(n_sample, labels)
 
     # print('true_probs', true_probs.numpy().tolist())

From a45a9cc0e146a7375a21b45f882e21f1704460d2 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 28 Jan 2019 17:16:02 +0100
Subject: [PATCH 21/82] update tests

---
 tests/modeling_openai_test.py | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/tests/modeling_openai_test.py b/tests/modeling_openai_test.py
index 0a71166443..014554934c 100644
--- a/tests/modeling_openai_test.py
+++ b/tests/modeling_openai_test.py
@@ -88,13 +88,13 @@ class OpenAIGPTModelTest(unittest.TestCase):
                 total_voc = self.n_ctx + self.n_special + self.vocab_size
                 token_type_ids = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_voc)
 
-            multiple_choice_labels = None
+            mc_labels = None
             lm_labels = None
-            multiple_choice_token_mask = None
+            mc_token_mask = None
             if self.use_labels:
-                multiple_choice_labels = OpenAIGPTModelTest.ids_tensor([self.batch_size], self.type_sequence_label_size)
+                mc_labels = OpenAIGPTModelTest.ids_tensor([self.batch_size], self.type_sequence_label_size)
                 lm_labels = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.num_labels)
-                multiple_choice_token_mask = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], 2).float()
+                mc_token_mask = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], 2).float()
 
             config = OpenAIGPTConfig(
                 vocab_size_or_config_json_file=self.vocab_size,
@@ -110,10 +110,10 @@ class OpenAIGPTModelTest(unittest.TestCase):
                 initializer_range=self.initializer_range)
 
             return (config, input_ids, token_type_ids, position_ids,
-                    multiple_choice_labels, lm_labels, multiple_choice_token_mask)
+                    mc_labels, lm_labels, mc_token_mask)
 
         def create_openai_model(self, config, input_ids, token_type_ids, position_ids,
-                                multiple_choice_labels, lm_labels, multiple_choice_token_mask):
+                                mc_labels, lm_labels, mc_token_mask):
             model = OpenAIGPTModel(config)
             hidden_states = model(input_ids, position_ids, token_type_ids)
             outputs = {
@@ -128,7 +128,7 @@ class OpenAIGPTModelTest(unittest.TestCase):
 
 
         def create_openai_lm_head(self, config, input_ids, token_type_ids, position_ids,
-                                       multiple_choice_labels, lm_labels, multiple_choice_token_mask):
+                                       mc_labels, lm_labels, mc_token_mask):
             model = OpenAIGPTLMHeadModel(config)
             loss = model(input_ids, position_ids, token_type_ids, lm_labels)
             lm_logits = model(input_ids, position_ids, token_type_ids)
@@ -150,15 +150,16 @@ class OpenAIGPTModelTest(unittest.TestCase):
                 [])
 
         def create_openai_double_heads(self, config, input_ids, token_type_ids, position_ids,
-                                       multiple_choice_labels, lm_labels, multiple_choice_token_mask):
+                                       mc_labels, lm_labels, mc_token_mask):
             model = OpenAIGPTDoubleHeadsModel(config)
-            loss = model(input_ids, multiple_choice_token_mask, position_ids,
-                         token_type_ids, lm_labels, multiple_choice_labels)
-            lm_logits, multiple_choice_logits = model(input_ids, multiple_choice_token_mask, position_ids, token_type_ids)
+            loss = model(input_ids, mc_token_mask,
+                         lm_labels=lm_labels, mc_labels=mc_labels,
+                         token_type_ids=token_type_ids, position_ids=position_ids)
+            lm_logits, mc_logits = model(input_ids, mc_token_mask, position_ids=position_ids, token_type_ids=token_type_ids)
             outputs = {
                 "loss": loss,
                 "lm_logits": lm_logits,
-                "multiple_choice_logits": multiple_choice_logits,
+                "mc_logits": mc_logits,
             }
             return outputs
 
@@ -168,7 +169,7 @@ class OpenAIGPTModelTest(unittest.TestCase):
                 list(result["lm_logits"].size()),
                 [self.batch_size, self.n_choices, self.seq_length, total_voc])
             self.parent.assertListEqual(
-                list(result["multiple_choice_logits"].size()),
+                list(result["mc_logits"].size()),
                 [self.batch_size, self.n_choices])
 
         def check_openai_double_heads_loss_output(self, result):

From bd3b3aee9ccd7f4ec7f0398420350180722cadc7 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 28 Jan 2019 17:47:29 +0100
Subject: [PATCH 22/82] update

---
 pytorch_pretrained_bert/modeling.py        | 3 ++-
 pytorch_pretrained_bert/modeling_openai.py | 5 ++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py
index 00e1d44870..dc14eadd82 100644
--- a/pytorch_pretrained_bert/modeling.py
+++ b/pytorch_pretrained_bert/modeling.py
@@ -606,7 +606,8 @@ class BertPreTrainedModel(nn.Module):
             for name, child in module._modules.items():
                 if child is not None:
                     load(child, prefix + name + '.')
-        load(model, prefix='' if hasattr(model, 'bert') else 'bert.')
+        start_prefix = 'bert.' if not hasattr(model, 'bert') and any(s.startwith('bert.') for s in state_dict.keys()) else ''
+        load(model, prefix=start_prefix)
         if len(missing_keys) > 0:
             logger.info("Weights of {} not initialized from pretrained model: {}".format(
                 model.__class__.__name__, missing_keys))
diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py
index 030e8912ae..88e5690e9b 100644
--- a/pytorch_pretrained_bert/modeling_openai.py
+++ b/pytorch_pretrained_bert/modeling_openai.py
@@ -502,7 +502,10 @@ class OpenAIGPTPreTrainedModel(nn.Module):
                 if child is not None:
                     load(child, prefix + name + ".")
 
-        load(model.transformer if hasattr(model, "transformer") else model, prefix="")
+        if hasattr(model, "transformer") and all(not s.startwith('transformer.') for s in state_dict.keys()):
+            start_model = model.transformer
+        load(start_model, prefix="")
+
         if len(missing_keys) > 0:
             logger.info(
                 "Weights of {} not initialized from pretrained model: {}".format(model.__class__.__name__, missing_keys)

From 9b2540b5a762436a0ac2b603f1fce93451535156 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 29 Jan 2019 09:54:08 +0100
Subject: [PATCH 23/82] update __init__

---
 pytorch_pretrained_bert/__init__.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/pytorch_pretrained_bert/__init__.py b/pytorch_pretrained_bert/__init__.py
index 249607bded..e4b9c1a116 100644
--- a/pytorch_pretrained_bert/__init__.py
+++ b/pytorch_pretrained_bert/__init__.py
@@ -6,15 +6,15 @@ from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus)
 from .modeling import (BertConfig, BertModel, BertForPreTraining,
                        BertForMaskedLM, BertForNextSentencePrediction,
                        BertForSequenceClassification, BertForMultipleChoice,
-                       BertForTokenClassification, BertForQuestionAnswering)
+                       BertForTokenClassification, BertForQuestionAnswering,
+                       load_tf_weights_in_bert)
 from .modeling_openai import (OpenAIGPTConfig, OpenAIGPTModel,
-                              OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel)
-from .modeling_transfo_xl import (TransfoXLConfig, TransfoXLModel)
+                              OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel,
+                              load_tf_weights_in_openai_gpt)
+from .modeling_transfo_xl import (TransfoXLConfig, TransfoXLModel,
+                                  load_tf_weights_in_transfo_xl)
 
 from .optimization import BertAdam
 from .optimization_openai import OpenAIAdam
 
-from .convert_openai_checkpoint_to_pytorch import load_tf_weights_in_openai_gpt
-from .convert_tf_checkpoint_to_pytorch import load_tf_weights_in_bert
-from .convert_transfo_xl_checkpoint_to_pytorch import load_tf_weights_in_transfo_xl
 from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE

From 5456d82311d0f0896741709df72e9ba9434f6082 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 29 Jan 2019 09:54:18 +0100
Subject: [PATCH 24/82] more versatile model loading

---
 pytorch_pretrained_bert/modeling.py        |  4 ++-
 pytorch_pretrained_bert/modeling_openai.py | 42 ++++++++++++----------
 2 files changed, 27 insertions(+), 19 deletions(-)

diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py
index dc14eadd82..8d71b8e955 100644
--- a/pytorch_pretrained_bert/modeling.py
+++ b/pytorch_pretrained_bert/modeling.py
@@ -606,7 +606,9 @@ class BertPreTrainedModel(nn.Module):
             for name, child in module._modules.items():
                 if child is not None:
                     load(child, prefix + name + '.')
-        start_prefix = 'bert.' if not hasattr(model, 'bert') and any(s.startwith('bert.') for s in state_dict.keys()) else ''
+        start_prefix = ''
+        if not hasattr(model, 'bert') and any(s.startswith('bert.') for s in state_dict.keys()):
+            start_prefix = 'bert.'
         load(model, prefix=start_prefix)
         if len(missing_keys) > 0:
             logger.info("Weights of {} not initialized from pretrained model: {}".format(
diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py
index 88e5690e9b..e71a3910f8 100644
--- a/pytorch_pretrained_bert/modeling_openai.py
+++ b/pytorch_pretrained_bert/modeling_openai.py
@@ -120,6 +120,7 @@ class OpenAIGPTConfig(object):
         self,
         vocab_size_or_config_json_file=40478,
         n_special=0,
+        n_positions=512,
         n_ctx=512,
         n_embd=768,
         n_layer=12,
@@ -135,7 +136,8 @@ class OpenAIGPTConfig(object):
         Args:
             vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file.
             n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLF]', ...)
-            n_ctx: Number of positional embeddings.
+            n_positions: Number of positional embeddings.
+            n_ctx: Size of the causal mask (usually same as n_positions).
             n_embd: Dimensionality of the embeddings and hidden states.
             n_layer: Number of hidden layers in the Transformer encoder.
             n_head: Number of attention heads for each attention layer in
@@ -159,6 +161,7 @@ class OpenAIGPTConfig(object):
             self.vocab_size = vocab_size_or_config_json_file
             self.n_special = n_special
             self.n_ctx = n_ctx
+            self.n_positions = n_positions
             self.n_embd = n_embd
             self.n_layer = n_layer
             self.n_head = n_head
@@ -175,7 +178,7 @@ class OpenAIGPTConfig(object):
 
     @property
     def total_num_embeddings(self):
-        return self.vocab_size + self.n_special + self.n_ctx
+        return self.vocab_size + self.n_special + self.n_positions
 
     @classmethod
     def from_dict(cls, json_object):
@@ -234,7 +237,7 @@ class Attention(nn.Module):
         n_state = nx  # in Attention: n_state=768 (nx=n_embd)
         # [switch nx => n_state from Block to Attention to keep identical to TF implem]
         assert n_state % config.n_head == 0
-        self.register_buffer("b", torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx))
+        self.register_buffer("bias", torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx))
         self.n_head = config.n_head
         self.split_size = n_state
         self.scale = scale
@@ -247,9 +250,9 @@ class Attention(nn.Module):
         w = torch.matmul(q, k)
         if self.scale:
             w = w / math.sqrt(v.size(-1))
-        # w = w * self.b + -1e9 * (1 - self.b)  # TF implem method: mask_attn_weights
+        # w = w * self.bias + -1e9 * (1 - self.bias)  # TF implem method: mask_attn_weights
         # XD: self.b may be larger than w, so we need to crop it
-        b = self.b[:, :, : w.size(-2), : w.size(-1)]
+        b = self.bias[:, :, : w.size(-2), : w.size(-1)]
         w = w * b + -1e9 * (1 - b)
 
         w = nn.Softmax(dim=-1)(w)
@@ -474,10 +477,12 @@ class OpenAIGPTPreTrainedModel(nn.Module):
         new_keys = []
         for key in state_dict.keys():
             new_key = None
-            if "gamma" in key:
-                new_key = key.replace("gamma", "weight")
-            if "beta" in key:
-                new_key = key.replace("beta", "bias")
+            if key.endswith(".g"):
+                new_key = key[:-2] + ".weight"
+            elif key.endswith(".b"):
+                new_key = key[:-2] + ".bias"
+            elif key.endswith(".w"):
+                new_key = key[:-2] + ".weight"
             if new_key:
                 old_keys.append(key)
                 new_keys.append(new_key)
@@ -502,7 +507,8 @@ class OpenAIGPTPreTrainedModel(nn.Module):
                 if child is not None:
                     load(child, prefix + name + ".")
 
-        if hasattr(model, "transformer") and all(not s.startwith('transformer.') for s in state_dict.keys()):
+        start_model = model
+        if hasattr(model, "transformer") and all(not s.startswith('transformer.') for s in state_dict.keys()):
             start_model = model.transformer
         load(start_model, prefix="")
 
@@ -541,7 +547,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
          total_num_embeddings - 1]                                  ______________________
 
     where total_num_embeddings can be obtained as config.total_num_embeddings and is:
-        total_num_embeddings = config.vocab_size + config.n_special + config.n_ctx
+        total_num_embeddings = config.vocab_size + config.n_special + config.n_positions
     You should use the associate indices to index the embeddings.
 
     The special embeddings ([SEP], [CLS]...) are not pre-trained and need to be trained during the fine-tuning if you use them.
@@ -554,7 +560,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
         `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
             were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, config.vocab_size[
         `position_ids`: an optional torch.LongTensor with the same shape as input_ids
-            with the position indices (selected in the range [config.vocab_size + config.n_special, config.vocab_size + config.n_special + config.n_ctx - 1[.
+            with the position indices (selected in the range [config.vocab_size + config.n_special, config.vocab_size + config.n_special + config.n_positions - 1[.
         `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
             You can use it to add a third embedding (the previous two being the word and position embeddings)
             to each token in the sentence.
@@ -578,7 +584,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
 
     def __init__(self, config):
         super(OpenAIGPTModel, self).__init__(config)
-        total_embeddings_size = config.vocab_size + config.n_special + config.n_ctx
+        total_embeddings_size = config.vocab_size + config.n_special + config.n_positions
         self.embed = nn.Embedding(total_embeddings_size, config.n_embd)
         self.drop = nn.Dropout(config.embd_pdrop)
         block = Block(config.n_ctx, config, scale=True)
@@ -598,7 +604,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
         self.init_weights(self.embed)
         # Copy word and positional embeddings from the previous weights
         self.embed.weight.data[: self.config.vocab_size, :] = old_embed.weight.data[: self.config.vocab_size, :]
-        self.embed.weight.data[-self.config.n_ctx :, :] = old_embed.weight.data[-self.config.n_ctx :, :]
+        self.embed.weight.data[-self.config.n_positions :, :] = old_embed.weight.data[-self.config.n_positions :, :]
 
     def forward(self, input_ids, position_ids=None, token_type_ids=None):
         if position_ids is None:
@@ -645,7 +651,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
          total_num_embeddings - 1]                                  ______________________
 
     where total_num_embeddings can be obtained as config.total_num_embeddings and is:
-        total_num_embeddings = config.vocab_size + config.n_special + config.n_ctx
+        total_num_embeddings = config.vocab_size + config.n_special + config.n_positions
     You should use these indices to index the word, special and position embeddings.
 
     The special embeddings ([SEP], [CLS]...) are not pre-trained and need to be trained during the fine-tuning if you use them.
@@ -658,7 +664,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
         `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
             were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, config.vocab_size[
         `position_ids`: an optional torch.LongTensor with the same shape as input_ids
-            with the position indices (selected in the range [config.vocab_size + config.n_special, config.vocab_size + config.n_special + config.n_ctx - 1[.
+            with the position indices (selected in the range [config.vocab_size + config.n_special, config.vocab_size + config.n_special + config.n_positions - 1[.
         `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
             You can use it to add a third embedding (the previous two being the word and position embeddings)
             to each token in the sentence.
@@ -725,7 +731,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
          total_num_embeddings - 1]                                  ______________________
 
     where total_num_embeddings can be obtained as config.total_num_embeddings and is:
-        total_num_embeddings = config.vocab_size + config.n_special + config.n_ctx
+        total_num_embeddings = config.vocab_size + config.n_special + config.n_positions
     You should use these indices to index the word, special and position embeddings.
 
     The special embeddings ([SEP], [CLS]...) are not pre-trained and need to be trained during the fine-tuning if you use them.
@@ -741,7 +747,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
             with a value of 1 were the last hidden state is (usually the [CLS] token) and 0 otherwise.
         `position_ids`: an optional torch.LongTensor with the same shape as input_ids
             with the position indices (selected in the range [config.vocab_size + config.n_special,
-            config.vocab_size + config.n_special + config.n_ctx - 1[.
+            config.vocab_size + config.n_special + config.n_positions - 1[.
         `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
             You can use it to add a third embedding (the previous two being the word and position embeddings)
             to each token in the sentence.

From 98c96fb1a71d2b4ef026ddf55f5700e79a9ed482 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 29 Jan 2019 10:31:42 +0100
Subject: [PATCH 25/82] splitting position and tokens embeddings in OpenAI GPT
 - updating tf imports - tests

---
 pytorch_pretrained_bert/__main__.py           |  4 +-
 .../convert_openai_checkpoint_to_pytorch.py   |  9 +--
 .../convert_tf_checkpoint_to_pytorch.py       |  2 +-
 pytorch_pretrained_bert/modeling.py           |  8 +++
 pytorch_pretrained_bert/modeling_openai.py    | 65 +++++++++++--------
 .../modeling_transfo_xl.py                    |  7 ++
 tests/modeling_openai_test.py                 | 15 ++---
 7 files changed, 66 insertions(+), 44 deletions(-)

diff --git a/pytorch_pretrained_bert/__main__.py b/pytorch_pretrained_bert/__main__.py
index 1869ff2ee2..3adb4ac36a 100644
--- a/pytorch_pretrained_bert/__main__.py
+++ b/pytorch_pretrained_bert/__main__.py
@@ -14,7 +14,7 @@ def main():
     else:
         if sys.argv[1] == "convert_tf_checkpoint_to_pytorch":
             try:
-                from .convert_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch
+                import tensorflow as tf
             except ModuleNotFoundError:
                 print("pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, "
                     "In that case, it requires TensorFlow to be installed. Please see "
@@ -42,7 +42,7 @@ def main():
                                                  PYTORCH_DUMP_OUTPUT)
         else:
             try:
-                from .convert_transfo_xl_checkpoint_to_pytorch import convert_transfo_xl_checkpoint_to_pytorch
+                import tensorflow as tf
             except ModuleNotFoundError:
                 print("pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, "
                     "In that case, it requires TensorFlow to be installed. Please see "
diff --git a/pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py b/pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py
index 83eed843bf..3f3e48172b 100755
--- a/pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py
+++ b/pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py
@@ -18,13 +18,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import re
-import json
 import argparse
 import torch
-import numpy as np
 
-from .modeling_openai import load_tf_weights_in_openai_gpt, OpenAIGPTConfig, OpenAIGPTModel, CONFIG_NAME, WEIGHTS_NAME
+from pytorch_pretrained_bert.modeling_openai import load_tf_weights_in_openai_gpt, OpenAIGPTConfig, OpenAIGPTModel, CONFIG_NAME, WEIGHTS_NAME
 
 def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_config_file, pytorch_dump_folder_path):
     # Construct model
@@ -67,5 +64,5 @@ if __name__ == "__main__":
                             "This specifies the model architecture.")
     args = parser.parse_args()
     convert_openai_checkpoint_to_pytorch(args.openai_checkpoint_folder_path,
-                                         args.pytorch_dump_folder_path,
-                                         args.openai_config_file)
+                                         args.openai_config_file,
+                                         args.pytorch_dump_folder_path)
diff --git a/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py b/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py
index c5e3090c8d..2dbf1f296e 100755
--- a/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py
+++ b/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py
@@ -25,7 +25,7 @@ import tensorflow as tf
 import torch
 import numpy as np
 
-from .modeling import BertConfig, BertForPreTraining, load_tf_weights_in_bert
+from pytorch_pretrained_bert.modeling import BertConfig, BertForPreTraining, load_tf_weights_in_bert
 
 def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):
     # Initialise PyTorch model
diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py
index 8d71b8e955..7f0c2bd47b 100644
--- a/pytorch_pretrained_bert/modeling.py
+++ b/pytorch_pretrained_bert/modeling.py
@@ -52,6 +52,14 @@ TF_WEIGHTS_NAME = 'model.ckpt'
 def load_tf_weights_in_bert(model, tf_checkpoint_path):
     """ Load tf checkpoints in a pytorch model
     """
+    try:
+        import re
+        import numpy as np
+        import tensorflow as tf
+    except ModuleNotFoundError:
+        print("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
+            "https://www.tensorflow.org/install/ for installation instructions.")
+        raise
     tf_path = os.path.abspath(tf_checkpoint_path)
     print("Converting TensorFlow checkpoint from {}".format(tf_path))
     # Load weights from TF model
diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py
index e71a3910f8..2e2dc56984 100644
--- a/pytorch_pretrained_bert/modeling_openai.py
+++ b/pytorch_pretrained_bert/modeling_openai.py
@@ -15,23 +15,23 @@
 # limitations under the License.
 """PyTorch OpenAI GPT model."""
 
-import os
+import collections
 import copy
 import json
-import math
 import logging
+import math
+import os
+import shutil
 import tarfile
 import tempfile
-import shutil
-import collections
 
 import torch
 import torch.nn as nn
 from torch.nn import CrossEntropyLoss
 from torch.nn.parameter import Parameter
 
-from .modeling import BertLayerNorm as LayerNorm
 from .file_utils import cached_path
+from .modeling import BertLayerNorm as LayerNorm
 
 logger = logging.getLogger(__name__)
 
@@ -42,6 +42,8 @@ WEIGHTS_NAME = "pytorch_model.bin"
 def load_tf_weights_in_openai_gpt(model, openai_checkpoint_folder_path):
     """ Load tf pre-trained weights in a pytorch model (from NumPy arrays here)
     """
+    import re
+    import numpy as np
     print("Loading weights...")
     names = json.load(open(openai_checkpoint_folder_path + '/parameters_names.json', "r", encoding='utf-8'))
     shapes = json.load(open(openai_checkpoint_folder_path + '/params_shapes.json', "r", encoding='utf-8'))
@@ -50,18 +52,24 @@ def load_tf_weights_in_openai_gpt(model, openai_checkpoint_folder_path):
     init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1]
     init_params = [param.reshape(shape) for param, shape in zip(init_params, shapes)]
 
-    init_params[0] = np.concatenate([init_params[1], init_params[0]], 0)
-    del init_params[1]
+    # This was used when we had a single embedding matrix for positions and tokens
+    # init_params[0] = np.concatenate([init_params[1], init_params[0]], 0)
+    # del init_params[1]
     init_params = [arr.squeeze() for arr in init_params]
 
     try:
-        assert model.embed.weight.shape == init_params[0].shape
+        assert model.tokens_embed.weight.shape == init_params[1].shape
+        assert model.positions_embed.weight.shape == init_params[0].shape
     except AssertionError as e:
-        e.args += (model.embed.weight.shape, init_params[0].shape)
+        e.args += (model.tokens_embed.weight.shape, init_params[1].shape)
+        e.args += (model.positions_embed.weight.shape, init_params[0].shape)
         raise
 
-    model.embed.weight.data = torch.from_numpy(init_params[0])
+    model.tokens_embed.weight.data = torch.from_numpy(init_params[1])
+    model.positions_embed.weight.data = torch.from_numpy(init_params[0])
     names.pop(0)
+    # Pop position and token embedding arrays
+    init_params.pop(0)
     init_params.pop(0)
 
     for name, array in zip(names, init_params): # names[1:n_transfer], init_params[1:n_transfer]):
@@ -584,8 +592,9 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
 
     def __init__(self, config):
         super(OpenAIGPTModel, self).__init__(config)
-        total_embeddings_size = config.vocab_size + config.n_special + config.n_positions
-        self.embed = nn.Embedding(total_embeddings_size, config.n_embd)
+        num_tokens = config.vocab_size + config.n_special
+        self.tokens_embed = nn.Embedding(num_tokens, config.n_embd)
+        self.positions_embed = nn.Embedding(config.n_positions, config.n_embd)
         self.drop = nn.Dropout(config.embd_pdrop)
         block = Block(config.n_ctx, config, scale=True)
         self.h = nn.ModuleList([copy.deepcopy(block) for _ in range(config.n_layer)])
@@ -598,30 +607,32 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
         # Update config
         self.config.n_special = num_special_tokens
         # # Build new embeddings and initialize
-        old_embed = self.embed
-        self.embed = nn.Embedding(self.config.total_num_embeddings, self.config.n_embd)
+        old_embed = self.tokens_embed
+        self.tokens_embed = nn.Embedding(self.config.total_num_embeddings, self.config.n_embd)
         # Initialize all new embeddings (in particular the special tokens)
-        self.init_weights(self.embed)
+        self.init_weights(self.tokens_embed)
         # Copy word and positional embeddings from the previous weights
-        self.embed.weight.data[: self.config.vocab_size, :] = old_embed.weight.data[: self.config.vocab_size, :]
-        self.embed.weight.data[-self.config.n_positions :, :] = old_embed.weight.data[-self.config.n_positions :, :]
+        self.tokens_embed.weight.data[: self.config.vocab_size, :] = old_embed.weight.data[: self.config.vocab_size, :]
+        self.tokens_embed.weight.data[-self.config.n_positions :, :] = old_embed.weight.data[-self.config.n_positions :, :]
 
     def forward(self, input_ids, position_ids=None, token_type_ids=None):
         if position_ids is None:
-            start = self.config.vocab_size + self.config.n_special
-            end = start + input_ids.size(-1)
-            position_ids = torch.arange(start, end, dtype=torch.long, device=input_ids.device)
+            # This was used when we had a single embedding matrix for position and token embeddings
+            # start = self.config.vocab_size + self.config.n_special
+            # end = start + input_ids.size(-1)
+            # position_ids = torch.arange(start, end, dtype=torch.long, device=input_ids.device)
+            position_ids = torch.arange(input_ids.size(-1), dtype=torch.long, device=input_ids.device)
             position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
 
         input_shape = input_ids.size()
         input_ids = input_ids.view(-1, input_ids.size(-1))
         position_ids = position_ids.view(-1, position_ids.size(-1))
 
-        inputs_embeds = self.embed(input_ids)
-        position_embeds = self.embed(position_ids)
+        inputs_embeds = self.tokens_embed(input_ids)
+        position_embeds = self.positions_embed(position_ids)
         if token_type_ids is not None:
             token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1))
-            token_type_embeds = self.embed(token_type_ids)
+            token_type_embeds = self.tokens_embed(token_type_ids)
         else:
             token_type_embeds = 0
         # Add the position information to the input embeddings
@@ -694,13 +705,13 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
     def __init__(self, config):
         super(OpenAIGPTLMHeadModel, self).__init__(config)
         self.transformer = OpenAIGPTModel(config)
-        self.lm_head = OpenAIGPTLMHead(self.transformer.embed.weight, config)
+        self.lm_head = OpenAIGPTLMHead(self.transformer.tokens_embed.weight, config)
         self.apply(self.init_weights)
 
     def set_num_special_tokens(self, num_special_tokens):
         " Update input and output embeddings with new embedding matrice "
         self.transformer.set_num_special_tokens(num_special_tokens)
-        self.lm_head.set_embeddings_weights(self.transformer.embed.weight)
+        self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight)
 
     def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None):
         hidden_states = self.transformer(input_ids, position_ids, token_type_ids)
@@ -780,14 +791,14 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
     def __init__(self, config):
         super(OpenAIGPTDoubleHeadsModel, self).__init__(config)
         self.transformer = OpenAIGPTModel(config)
-        self.lm_head = OpenAIGPTLMHead(self.transformer.embed.weight, config)
+        self.lm_head = OpenAIGPTLMHead(self.transformer.tokens_embed.weight, config)
         self.multiple_choice_head = OpenAIGPTMultipleChoiceHead(config)
         self.apply(self.init_weights)
 
     def set_num_special_tokens(self, num_special_tokens):
         " Update input and output embeddings with new embedding matrice "
         self.transformer.set_num_special_tokens(num_special_tokens)
-        self.lm_head.set_embeddings_weights(self.transformer.embed.weight)
+        self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight)
 
     def forward(self, input_ids, mc_token_mask, lm_labels=None, mc_labels=None, token_type_ids=None, position_ids=None):
         hidden_states = self.transformer(input_ids, position_ids, token_type_ids)
diff --git a/pytorch_pretrained_bert/modeling_transfo_xl.py b/pytorch_pretrained_bert/modeling_transfo_xl.py
index 6abc68abc2..204eef738c 100644
--- a/pytorch_pretrained_bert/modeling_transfo_xl.py
+++ b/pytorch_pretrained_bert/modeling_transfo_xl.py
@@ -121,6 +121,13 @@ def build_tf_to_pytorch_map(model, config):
 def load_tf_weights_in_transfo_xl(model, config, tf_path):
     """ Load tf checkpoints in a pytorch model
     """
+    try:
+        import numpy as np
+        import tensorflow as tf
+    except ModuleNotFoundError:
+        print("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
+            "https://www.tensorflow.org/install/ for installation instructions.")
+        raise
     # Build TF to PyTorch weights loading map
     tf_to_pt_map = build_tf_to_pytorch_map(model, config)
 
diff --git a/tests/modeling_openai_test.py b/tests/modeling_openai_test.py
index 014554934c..71a1dfd3c6 100644
--- a/tests/modeling_openai_test.py
+++ b/tests/modeling_openai_test.py
@@ -39,7 +39,7 @@ class OpenAIGPTModelTest(unittest.TestCase):
                      use_labels=True,
                      vocab_size=99,
                      n_special=1,
-                     n_ctx=33,
+                     n_positions=33,
                      n_embd=32,
                      n_layer=5,
                      n_head=4,
@@ -61,7 +61,7 @@ class OpenAIGPTModelTest(unittest.TestCase):
             self.use_labels = use_labels
             self.vocab_size = vocab_size
             self.n_special = n_special
-            self.n_ctx = n_ctx
+            self.n_positions = n_positions
             self.n_embd = n_embd
             self.n_layer = n_layer
             self.n_head = n_head
@@ -80,12 +80,11 @@ class OpenAIGPTModelTest(unittest.TestCase):
 
             position_ids = None
             if self.use_position_ids:
-                position_ids = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.n_ctx)
-                position_ids = position_ids + self.n_special + self.vocab_size
+                position_ids = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.n_positions)
 
             token_type_ids = None
             if self.use_token_type_ids:
-                total_voc = self.n_ctx + self.n_special + self.vocab_size
+                total_voc = self.vocab_size + self.n_special
                 token_type_ids = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_voc)
 
             mc_labels = None
@@ -98,7 +97,7 @@ class OpenAIGPTModelTest(unittest.TestCase):
 
             config = OpenAIGPTConfig(
                 vocab_size_or_config_json_file=self.vocab_size,
-                n_ctx=self.n_ctx,
+                n_positions=self.n_positions,
                 n_special=self.n_special,
                 n_embd=self.n_embd,
                 n_layer=self.n_layer,
@@ -139,7 +138,7 @@ class OpenAIGPTModelTest(unittest.TestCase):
             return outputs
 
         def check_openai_lm_head_output(self, result):
-            total_voc = self.n_ctx + self.n_special + self.vocab_size
+            total_voc = self.n_special + self.vocab_size
             self.parent.assertListEqual(
                 list(result["lm_logits"].size()),
                 [self.batch_size, self.n_choices, self.seq_length, total_voc])
@@ -164,7 +163,7 @@ class OpenAIGPTModelTest(unittest.TestCase):
             return outputs
 
         def check_openai_double_heads_output(self, result):
-            total_voc = self.n_ctx + self.n_special + self.vocab_size
+            total_voc = self.n_special + self.vocab_size
             self.parent.assertListEqual(
                 list(result["lm_logits"].size()),
                 [self.batch_size, self.n_choices, self.seq_length, total_voc])

From 3a848111e6c5a10a4f04f272476de86af78d4a36 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 29 Jan 2019 11:00:11 +0100
Subject: [PATCH 26/82] update config, docstrings and readme to switch to
 seperated tokens and position embeddings

---
 README.md                                  |  37 +++---
 pytorch_pretrained_bert/modeling_openai.py | 130 ++++++++++-----------
 2 files changed, 80 insertions(+), 87 deletions(-)

diff --git a/README.md b/README.md
index be0765f4bb..b124585bbe 100644
--- a/README.md
+++ b/README.md
@@ -391,35 +391,36 @@ An example on how to use this class is given in the [`run_squad.py`](./examples/
 
 `OpenAIGPTModel` is the basic OpenAI GPT Transformer model with a layer of summed token and position embeddings followed by a series of 12 identical self-attention blocks.
 
-The main implementation difference between BERT and the OpenAI is the use, in OpenAI GPT, of a single embedding matrix to store the word, special (`[SEP]`, `[CLS]`...) token and position embeddings.
-The embeddings are ordered as follow in the word embeddings matrice:
+OpenAI GPT uses a single embedding matrix to store the word and special embeddings.
+Special token embeddings are additional embeddings that are not pre-trained: `[SEP]`, `[CLS]`...
+Special tokens need to be trained during fine-tuning if you use them.
+The number of special embeddings can be controlled using the `set_num_special_tokens(num_special_tokens)` function.
 
+The embeddings are ordered as follows in the token embedding matrix:
+
+```python
     [0,                                                         ----------------------
       ...                                                        -> word embeddings
       config.vocab_size - 1,                                     ______________________
       config.vocab_size,
       ...                                                        -> special embeddings
-      config.vocab_size + config.n_special - 1,                  ______________________
-      config.vocab_size + config.n_special,
-      ...                                                        -> position embeddings
-      total_num_embeddings - 1]                                  ______________________
+      config.vocab_size + config.n_special - 1]                  ______________________
+```
 
-where total_num_embeddings can be obtained as config.total_num_embeddings and is:
-
-    total_num_embeddings = config.vocab_size + config.n_special + config.n_ctx
+where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is:
+    `total_tokens_embeddings = config.vocab_size + config.n_special`
 You should use the associate indices to index the embeddings.
 
-The special tokens embeddings (`[SEP]`, `[CLS]`...) are not pre-trained and need to be trained during the fine-tuning if you use them.
-
-The number of special embeddings can be controled using the `set_num_special_tokens(num_special_tokens)` function.
-
 The inputs and output are **identical to the TensorFlow model inputs and outputs**.
 
 We detail them here. This model takes as *inputs*:
 [`modeling_openai.py`](./pytorch_pretrained_bert/modeling_openai.py)
-- `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length] were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, config.vocab_size[
-- `position_ids`: an optional torch.LongTensor with the same shape as input_ids with the position indices (selected in the range [config.vocab_size + config.n_special, config.vocab_size + config.n_special + config.n_ctx - 1[.
-- `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids. You can use it to add a third embedding (the previous two being the word and position embeddings) to each token in the sentence.
+- `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length] where d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, total_tokens_embeddings[
+- `position_ids`: an optional torch.LongTensor with the same shape as input_ids
+    with the position indices (selected in the range [0, config.n_positions - 1]).
+- `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
+    You can use it to add a third type of embedding to each input token in the sequence
+    (the previous two being the word and position embeddings). The input, position and token_type embeddings are summed inside the Transformer before the first self-attention block.
 
 This model *outputs*:
 - `hidden_states`: the encoded-hidden-states at the top of the model as a torch.FloatTensor of size [batch_size, sequence_length, hidden_size] (or more generally [d_1, ..., d_n, hidden_size] were d_1 ... d_n are the dimension of input_ids)
@@ -435,7 +436,7 @@ This model *outputs*:
 - if `lm_labels` is not `None`:
   Outputs the language modeling loss.
 - else:
-  Outputs `lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, sequence_length, total_num_embeddings] (or more generally [d_1, ..., d_n, total_num_embeddings] were d_1 ... d_n are the dimension of input_ids)
+  Outputs `lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, sequence_length, total_tokens_embeddings] (or more generally [d_1, ..., d_n, total_tokens_embeddings] where d_1 ... d_n are the dimensions of input_ids)
 
 #### 11. `OpenAIGPTDoubleHeadsModel`
 
@@ -452,7 +453,7 @@ This model *outputs*:
 - if `lm_labels` and `multiple_choice_labels` are not `None`:
   Outputs a tuple of losses with the language modeling loss and the multiple choice loss.
 - else Outputs a tuple with:
-  - `lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, num_choices, sequence_length, total_num_embeddings]
+  - `lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, num_choices, sequence_length, total_tokens_embeddings]
   - `multiple_choice_logits`: the multiple choice logits as a torch.FloatTensor of size [batch_size, num_choices]
 
 
diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py
index 2e2dc56984..14d5cf7ef2 100644
--- a/pytorch_pretrained_bert/modeling_openai.py
+++ b/pytorch_pretrained_bert/modeling_openai.py
@@ -185,8 +185,8 @@ class OpenAIGPTConfig(object):
             )
 
     @property
-    def total_num_embeddings(self):
-        return self.vocab_size + self.n_special + self.n_positions
+    def total_tokens_embeddings(self):
+        return self.vocab_size + self.n_special
 
     @classmethod
     def from_dict(cls, json_object):
@@ -533,45 +533,44 @@ class OpenAIGPTPreTrainedModel(nn.Module):
                 "Error(s) in loading state_dict for {}:\n\t{}".format(model.__class__.__name__, "\n\t".join(error_msgs))
             )
         # Add additional embeddings for special tokens if needed
-        if num_special_tokens is not None and num_special_tokens != config.n_special:
-            model.set_num_special_tokens(num_special_tokens)
+        # This step also makes sure we are still sharing the output and input embeddings after loading weights
+        model.set_num_special_tokens(num_special_tokens if num_special_tokens is not None else config.n_special)
         return model
 
 
 class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
     """OpenAI GPT model ("Improving Language Understanding by Generative Pre-Training").
 
-    The main implementation difference between BERT and the OpenAI is the use, in OpenAI GPT, of a single embedding matrix
-    to store the word, special ([SEP], [CLS]...) and position embeddings.
-    The embeddings are ordered as follow in the word embeddings matrice:
+    OpenAI GPT uses a single embedding matrix to store the word and special embeddings.
+    Special token embeddings are additional embeddings that are not pre-trained: [SEP], [CLS]...
+    Special tokens need to be trained during fine-tuning if you use them.
+    The number of special embeddings can be controlled using the `set_num_special_tokens(num_special_tokens)` function.
+
+    The embeddings are ordered as follows in the token embedding matrix:
         [0,                                                         ----------------------
          ...                                                        -> word embeddings
          config.vocab_size - 1,                                     ______________________
          config.vocab_size,
          ...                                                        -> special embeddings
-         config.vocab_size + config.n_special - 1,                  ______________________
-         config.vocab_size + config.n_special,
-         ...                                                        -> position embeddings
-         total_num_embeddings - 1]                                  ______________________
+         config.vocab_size + config.n_special - 1]                  ______________________
 
-    where total_num_embeddings can be obtained as config.total_num_embeddings and is:
-        total_num_embeddings = config.vocab_size + config.n_special + config.n_positions
+    where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is:
+        total_tokens_embeddings = config.vocab_size + config.n_special
     You should use the associate indices to index the embeddings.
 
-    The special embeddings ([SEP], [CLS]...) are not pre-trained and need to be trained during the fine-tuning if you use them.
-    The number of special embeddings can be controled using the `set_num_special_tokens(num_special_tokens)` function.
-
     Params:
         config: a OpenAIGPTConfig class instance with the configuration to build a new model
 
     Inputs:
         `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
-            were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, config.vocab_size[
+            where d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, total_tokens_embeddings[
         `position_ids`: an optional torch.LongTensor with the same shape as input_ids
-            with the position indices (selected in the range [config.vocab_size + config.n_special, config.vocab_size + config.n_special + config.n_positions - 1[.
+            with the position indices (selected in the range [0, config.n_positions - 1]).
         `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
-            You can use it to add a third embedding (the previous two being the word and position embeddings)
-            to each token in the sentence.
+            You can use it to add a third type of embedding to each input token in the sequence
+            (the previous two being the word and position embeddings).
+            The input, position and token_type embeddings are summed inside the Transformer before the first
+            self-attention block.
 
     Outputs:
         `hidden_states`: the encoded-hidden-states at the top of the model
@@ -603,12 +602,14 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
         # nn.init.normal_(self.embed.weight, std=0.02)
 
     def set_num_special_tokens(self, num_special_tokens):
-        " Update input embeddings with new embedding matrice "
+        " Update input embeddings with new embedding matrix if needed "
+        if self.config.n_special == num_special_tokens:
+            return
         # Update config
         self.config.n_special = num_special_tokens
         # # Build new embeddings and initialize
         old_embed = self.tokens_embed
-        self.tokens_embed = nn.Embedding(self.config.total_num_embeddings, self.config.n_embd)
+        self.tokens_embed = nn.Embedding(self.config.total_tokens_embeddings, self.config.n_embd)
         # Initialize all new embeddings (in particular the special tokens)
         self.init_weights(self.tokens_embed)
         # Copy word and positional embeddings from the previous weights
@@ -646,39 +647,36 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
 class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
     """OpenAI GPT model with a Language Modeling head ("Improving Language Understanding by Generative Pre-Training").
 
-    There are two main implementation differences between BERT and the OpenAI GPT:
-        - the use of an LM loss in OpenAI GPT which means the Transformer is trained to predict the NEXT token for each input token
-            vs. predict the SAME token for BERT (i.e. you need to shift your labels to the right)
-        - the use, in OpenAI GPT, of a single embedding matrix to store the word, special ([SEP], [CLS]...) and position embeddings.
-    The embeddings are ordered as follow in the word embeddings matrice:
+    OpenAI GPT uses a single embedding matrix to store the word and special embeddings.
+    Special token embeddings are additional embeddings that are not pre-trained: [SEP], [CLS]...
+    Special tokens need to be trained during fine-tuning if you use them.
+    The number of special embeddings can be controlled using the `set_num_special_tokens(num_special_tokens)` function.
+
+    The embeddings are ordered as follows in the token embedding matrix:
         [0,                                                         ----------------------
          ...                                                        -> word embeddings
          config.vocab_size - 1,                                     ______________________
          config.vocab_size,
          ...                                                        -> special embeddings
-         config.vocab_size + config.n_special - 1,                  ______________________
-         config.vocab_size + config.n_special,
-         ...                                                        -> position embeddings
-         total_num_embeddings - 1]                                  ______________________
+         config.vocab_size + config.n_special - 1]                  ______________________
 
-    where total_num_embeddings can be obtained as config.total_num_embeddings and is:
-        total_num_embeddings = config.vocab_size + config.n_special + config.n_positions
-    You should use these indices to index the word, special and position embeddings.
-
-    The special embeddings ([SEP], [CLS]...) are not pre-trained and need to be trained during the fine-tuning if you use them.
-    The number of special embeddings can be controled using the `set_num_special_tokens(num_special_tokens)` function.
+    where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is:
+        total_tokens_embeddings = config.vocab_size + config.n_special
+    You should use the associate indices to index the embeddings.
 
     Params:
         config: a OpenAIGPTConfig class instance with the configuration to build a new model
 
     Inputs:
         `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
-            were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, config.vocab_size[
+            where d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, total_tokens_embeddings[
         `position_ids`: an optional torch.LongTensor with the same shape as input_ids
-            with the position indices (selected in the range [config.vocab_size + config.n_special, config.vocab_size + config.n_special + config.n_positions - 1[.
+            with the position indices (selected in the range [0, config.n_positions - 1]).
         `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
-            You can use it to add a third embedding (the previous two being the word and position embeddings)
-            to each token in the sentence.
+            You can use it to add a third type of embedding to each input token in the sequence
+            (the previous two being the word and position embeddings).
+            The input, position and token_type embeddings are summed inside the Transformer before the first
+            self-attention block.
         `lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
             with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
             is only computed for the labels set in [0, ..., vocab_size]
@@ -687,8 +685,8 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
         if `lm_labels` is not `None`:
             Outputs the language modeling loss.
         else:
-            `lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, sequence_length, total_num_embeddings]
-                (or more generally [d_1, ..., d_n, total_num_embeddings] were d_1 ... d_n are the dimension of input_ids)
+            `lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, sequence_length, total_tokens_embeddings]
+                (or more generally [d_1, ..., d_n, total_tokens_embeddings] where d_1 ... d_n are the dimensions of input_ids)
 
     Example usage:
     ```python
@@ -726,45 +724,39 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
 class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
     """OpenAI GPT model with a Language Modeling and a Multiple Choice heads ("Improving Language Understanding by Generative Pre-Training").
 
-    There are two main implementation differences between BERT and the OpenAI GPT:
-        - the use of an LM loss in OpenAI GPT which means the Transformer is trained to predict the NEXT token for each input token
-            vs. predict the SAME token for BERT (i.e. you need to shift your labels to the right)
-        - the use, in OpenAI GPT, of a single embedding matrix to store the word, special ([SEP], [CLS]...) and position embeddings.
-    The embeddings are ordered as follow in the word embeddings matrice:
+    OpenAI GPT uses a single embedding matrix to store the word and special embeddings.
+    Special token embeddings are additional embeddings that are not pre-trained: [SEP], [CLS]...
+    Special tokens need to be trained during fine-tuning if you use them.
+    The number of special embeddings can be controlled using the `set_num_special_tokens(num_special_tokens)` function.
+
+    The embeddings are ordered as follows in the token embedding matrix:
         [0,                                                         ----------------------
          ...                                                        -> word embeddings
          config.vocab_size - 1,                                     ______________________
          config.vocab_size,
          ...                                                        -> special embeddings
-         config.vocab_size + config.n_special - 1,                  ______________________
-         config.vocab_size + config.n_special,
-         ...                                                        -> position embeddings
-         total_num_embeddings - 1]                                  ______________________
+         config.vocab_size + config.n_special - 1]                  ______________________
 
-    where total_num_embeddings can be obtained as config.total_num_embeddings and is:
-        total_num_embeddings = config.vocab_size + config.n_special + config.n_positions
-    You should use these indices to index the word, special and position embeddings.
-
-    The special embeddings ([SEP], [CLS]...) are not pre-trained and need to be trained during the fine-tuning if you use them.
-    The number of special embeddings can be controled using the `set_num_special_tokens(num_special_tokens)` function.
+    where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is:
+        total_tokens_embeddings = config.vocab_size + config.n_special
+    You should use the associate indices to index the embeddings.
 
     Params:
         config: a OpenAIGPTConfig class instance with the configuration to build a new model
 
     Inputs:
-        `input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length]
-            with the word BPE token indices selected in the range [0, config.vocab_size[
-        `mc_token_mask`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length]
-            with a value of 1 were the last hidden state is (usually the [CLS] token) and 0 otherwise.
+        `input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length] with the word BPE token indices selected in the range [0, total_tokens_embeddings[
+        `mc_token_mask`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length] with a value of 1 where the last hidden state is (usually the [CLS] token) and 0 otherwise.
         `position_ids`: an optional torch.LongTensor with the same shape as input_ids
-            with the position indices (selected in the range [config.vocab_size + config.n_special,
-            config.vocab_size + config.n_special + config.n_positions - 1[.
+            with the position indices (selected in the range [0, config.n_positions - 1]).
         `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
-            You can use it to add a third embedding (the previous two being the word and position embeddings)
-            to each token in the sentence.
+            You can use it to add a third type of embedding to each input token in the sequence
+            (the previous two being the word and position embeddings).
+            The input, position and token_type embeddings are summed inside the Transformer before the first
+            self-attention block.
         `lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, num_choices, sequence_length]
-            with indices selected in [-1, 0, ..., total_num_embeddings]. All labels set to -1 are ignored (masked), the loss
-            is only computed for the labels set in [0, ..., total_num_embeddings]
+            with indices selected in [-1, 0, ..., total_tokens_embeddings - 1]. All labels set to -1 are ignored (masked), the loss
+            is only computed for the labels set in [0, ..., total_tokens_embeddings - 1]
         `multiple_choice_labels`: optional multiple choice labels: torch.LongTensor of shape [batch_size]
             with indices selected in [0, ..., num_choices].
 
@@ -772,7 +764,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
         if `lm_labels` and `multiple_choice_labels` are not `None`:
             Outputs a tuple of losses with the language modeling loss and the multiple choice loss.
         else: a tuple with
-            `lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, num_choices, sequence_length, total_num_embeddings]
+            `lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, num_choices, sequence_length, total_tokens_embeddings]
             `multiple_choice_logits`: the multiple choice logits as a torch.FloatTensor of size [batch_size, num_choices]
 
     Example usage:

From 05f961840b0901b3689de20d7b18ed07b24be5e1 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 4 Feb 2019 13:06:19 +0100
Subject: [PATCH 27/82] logging

---
 .../tokenization_openai.py                    | 21 ++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/pytorch_pretrained_bert/tokenization_openai.py b/pytorch_pretrained_bert/tokenization_openai.py
index e5e4dbda39..a12e58721b 100644
--- a/pytorch_pretrained_bert/tokenization_openai.py
+++ b/pytorch_pretrained_bert/tokenization_openai.py
@@ -125,16 +125,19 @@ class OpenAIGPTTokenizer(object):
         merges = [tuple(merge.split()) for merge in merges]
         self.bpe_ranks = dict(zip(merges, range(len(merges))))
         self.cache = {}
-        if not special_tokens:
-            self.special_tokens = {}
-        else:
-            self.special_tokens = dict((tok, len(self.encoder) + i) for i, tok in enumerate(special_tokens))
+        self.set_special_tokens(special_tokens)
 
     def __len__(self):
         return len(self.encoder) + len(self.special_tokens)
 
     def set_special_tokens(self, special_tokens):
+        if not special_tokens:
+            self.special_tokens = {}
+            self.special_tokens_decoder = {}
+            return
         self.special_tokens = dict((tok, len(self.encoder) + i) for i, tok in enumerate(special_tokens))
+        self.special_tokens_decoder = {v:k for k, v in self.special_tokens.items()}
+        logger.info("Special tokens {}".format(self.special_tokens))
 
     def bpe(self, token):
         word = tuple(token[:-1]) + ( token[-1] + '</w>',)
@@ -189,6 +192,11 @@ class OpenAIGPTTokenizer(object):
     def convert_tokens_to_ids(self, tokens):
         """Converts a sequence of tokens into ids using the vocab."""
         ids = []
+        if isinstance(tokens, str):
+            if tokens in self.special_tokens:
+                return self.special_tokens[tokens]
+            else:
+                return self.encoder.get(tokens, 0)
         for token in tokens:
             if token in self.special_tokens:
                 ids.append(self.special_tokens[token])
@@ -206,7 +214,10 @@ class OpenAIGPTTokenizer(object):
         """Converts a sequence of ids in BPE tokens using the vocab."""
         tokens = []
         for i in ids:
-            tokens.append(self.decoder[i])
+            if i in self.special_tokens_decoder:
+                tokens.append(self.special_tokens_decoder[i])
+            else:
+                tokens.append(self.decoder[i])
         return tokens
 
     def decode(self, ids):

From 01a3966bc6d265aa8c7088b39bfdc20a905a2c74 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 4 Feb 2019 17:26:25 +0100
Subject: [PATCH 28/82] more options on special tokens

---
 pytorch_pretrained_bert/tokenization_openai.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/pytorch_pretrained_bert/tokenization_openai.py b/pytorch_pretrained_bert/tokenization_openai.py
index a12e58721b..e545e0d375 100644
--- a/pytorch_pretrained_bert/tokenization_openai.py
+++ b/pytorch_pretrained_bert/tokenization_openai.py
@@ -131,6 +131,10 @@ class OpenAIGPTTokenizer(object):
         return len(self.encoder) + len(self.special_tokens)
 
     def set_special_tokens(self, special_tokens):
+        """ Add a list of additional tokens to the encoder.
+            The additional tokens are indexed starting right after the last index of the
+            current vocabulary, in the order of the `special_tokens` list.
+        """
         if not special_tokens:
             self.special_tokens = {}
             self.special_tokens_decoder = {}
@@ -210,18 +214,19 @@ class OpenAIGPTTokenizer(object):
             )
         return ids
 
-    def convert_ids_to_tokens(self, ids):
+    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
         """Converts a sequence of ids in BPE tokens using the vocab."""
         tokens = []
         for i in ids:
             if i in self.special_tokens_decoder:
-                tokens.append(self.special_tokens_decoder[i])
+                if not skip_special_tokens:
+                    tokens.append(self.special_tokens_decoder[i])
             else:
                 tokens.append(self.decoder[i])
         return tokens
 
-    def decode(self, ids):
+    def decode(self, ids, skip_special_tokens=False):
         """Converts a sequence of ids in a string."""
-        tokens = self.convert_ids_to_tokens(ids)
+        tokens = self.convert_ids_to_tokens(ids, skip_special_tokens=skip_special_tokens)
         out_string = ''.join(tokens).replace('</w>', ' ')
         return out_string

From 850da1cc36f95175219420365ac3fb95b483ce8d Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 4 Feb 2019 17:35:05 +0100
Subject: [PATCH 29/82] strip decoded outputs

---
 pytorch_pretrained_bert/tokenization_openai.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_pretrained_bert/tokenization_openai.py b/pytorch_pretrained_bert/tokenization_openai.py
index e545e0d375..aba531caed 100644
--- a/pytorch_pretrained_bert/tokenization_openai.py
+++ b/pytorch_pretrained_bert/tokenization_openai.py
@@ -228,5 +228,5 @@ class OpenAIGPTTokenizer(object):
     def decode(self, ids, skip_special_tokens=False):
         """Converts a sequence of ids in a string."""
         tokens = self.convert_ids_to_tokens(ids, skip_special_tokens=skip_special_tokens)
-        out_string = ''.join(tokens).replace('</w>', ' ')
+        out_string = ''.join(tokens).replace('</w>', ' ').strip()
         return out_string

From 6179f537a3e2c8db472bce964d4f4cb6fdc09204 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 4 Feb 2019 17:41:22 +0100
Subject: [PATCH 30/82] clean up tokenization spaces

---
 pytorch_pretrained_bert/tokenization_openai.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/pytorch_pretrained_bert/tokenization_openai.py b/pytorch_pretrained_bert/tokenization_openai.py
index aba531caed..616b68db59 100644
--- a/pytorch_pretrained_bert/tokenization_openai.py
+++ b/pytorch_pretrained_bert/tokenization_openai.py
@@ -225,8 +225,14 @@ class OpenAIGPTTokenizer(object):
                 tokens.append(self.decoder[i])
         return tokens
 
-    def decode(self, ids, skip_special_tokens=False):
+    def decode(self, ids, skip_special_tokens=False, clean_up_tokenization_spaces=False):
         """Converts a sequence of ids in a string."""
         tokens = self.convert_ids_to_tokens(ids, skip_special_tokens=skip_special_tokens)
         out_string = ''.join(tokens).replace('</w>', ' ').strip()
+        if clean_up_tokenization_spaces:
+            out_string = out_string.replace('<unk>', '')
+            out_string = out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ',').replace(' ,', ','
+                    ).replace(" n't", "n't").replace(" 'm", "'m").replace(" 're", "'re").replace(" do not", " don't"
+                    ).replace(" 's", "'s").replace(" t ", "'t ").replace(" s ", "'s ").replace(" m ", "'m "
+                    ).replace(" 've", "'ve")
         return out_string

From ba37ddc5ced98c97e2e3e21018dda14a84e3d257 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Wed, 6 Feb 2019 00:07:08 +0100
Subject: [PATCH 31/82] fix run_lm_finetuning example command line

---
 README.md | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index b124585bbe..68de1aa1aa 100644
--- a/README.md
+++ b/README.md
@@ -680,14 +680,15 @@ Training one epoch on this corpus takes about 1:20h on 4 x NVIDIA Tesla P100 wit
 
 ```shell
 python run_lm_finetuning.py \
-  --bert_model bert-base-cased 
-  --do_train 
-  --train_file samples/sample_text.txt 
-  --output_dir models 
-  --num_train_epochs 5.0 
-  --learning_rate 3e-5 
-  --train_batch_size 32 
-  --max_seq_length 128 
+  --bert_model bert-base-uncased \
+  --do_lower_case \
+  --do_train \
+  --train_file ../samples/sample_text.txt \
+  --output_dir models \
+  --num_train_epochs 5.0 \
+  --learning_rate 3e-5 \
+  --train_batch_size 32 \
+  --max_seq_length 128
 ```
 
 ## Fine-tuning BERT-large on GPUs

From 448937c00de4e2350e7467ec1bcb69966a855377 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Wed, 6 Feb 2019 00:07:46 +0100
Subject: [PATCH 32/82] python 2 compatibility

---
 examples/eval_transfo_xl.py                   | 32 ++++----
 examples/run_classifier.py                    | 30 ++++---
 examples/run_lm_finetuning.py                 | 43 +++++-----
 examples/run_squad.py                         | 30 ++++---
 examples/run_swag.py                          | 56 +++++++------
 pytorch_pretrained_bert/__main__.py           |  4 +-
 .../convert_openai_checkpoint_to_pytorch.py   | 12 ++-
 ...onvert_transfo_xl_checkpoint_to_pytorch.py | 26 +++---
 pytorch_pretrained_bert/file_utils.py         | 81 ++++++++++---------
 pytorch_pretrained_bert/modeling.py           | 31 ++++---
 pytorch_pretrained_bert/modeling_openai.py    | 10 ++-
 .../modeling_transfo_xl.py                    | 19 +++--
 pytorch_pretrained_bert/tokenization.py       | 11 ++-
 .../tokenization_openai.py                    | 16 ++--
 .../tokenization_transfo_xl.py                | 21 +++--
 setup.py                                      |  3 +-
 tests/tokenization_test.py                    |  5 +-
 17 files changed, 246 insertions(+), 184 deletions(-)

diff --git a/examples/eval_transfo_xl.py b/examples/eval_transfo_xl.py
index e67efd3a68..3326454ea1 100644
--- a/examples/eval_transfo_xl.py
+++ b/examples/eval_transfo_xl.py
@@ -17,26 +17,26 @@
     Adapted from https://github.com/kimiyoung/transformer-xl.
     In particular https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/eval.py
 """
+from __future__ import absolute_import, division, print_function, unicode_literals
+
 import os
-import sys
 import functools
 import argparse
+import logging
 import time
 import math
+import sys
+from io import open
 
 import torch
 
 from pytorch_pretrained_bert import TransfoXLModel, TransfoXLCorpus
 
-def logging(s, log_path, print_=True, log_=True):
-    if print_:
-        print(s)
-    if log_:
-        with open(log_path, 'a+') as f_log:
-            f_log.write(s + '\n')
+logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
+                    datefmt = '%m/%d/%Y %H:%M:%S',
+                    level = logging.INFO)
+logger = logging.getLogger(__name__)
 
-def get_logger(log_path, **kwargs):
-    return functools.partial(logging, log_path=log_path, **kwargs)
 
 parser = argparse.ArgumentParser(description='PyTorch Transformer Language Model')
 # parser.add_argument('--data', type=str, default='../data/wikitext-103',
@@ -71,8 +71,8 @@ assert args.ext_len >= 0, 'extended context length must be non-negative'
 device = torch.device("cuda" if args.cuda else "cpu")
 
 # Get logger
-logging = get_logger(os.path.join(args.work_dir, 'log.txt'),
-                     log_=not args.no_log)
+# logging = get_logger(os.path.join(args.work_dir, 'log.txt'),
+#                      log_=not args.no_log)
 
 # Load dataset
 corpus = TransfoXLCorpus.from_pretrained(args.model_name)
@@ -90,7 +90,7 @@ te_iter = corpus.get_iterator('test', args.batch_size, args.tgt_len,
 model = TransfoXLModel.from_pretrained(args.model_name)
 model = model.to(device)
 
-logging('Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}'.format(
+logger.info('Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}'.format(
        args.batch_size, args.tgt_len, args.ext_len, args.mem_len, args.clamp_len))
 
 model.reset_length(args.tgt_len, args.ext_len, args.mem_len)
@@ -116,7 +116,7 @@ def evaluate(eval_iter):
             total_loss += seq_len * loss.item()
             total_len += seq_len
         total_time = time.time() - start_time
-    logging('Time : {:.2f}s, {:.2f}ms/segment'.format(
+    logger.info('Time : {:.2f}s, {:.2f}ms/segment'.format(
             total_time, 1000 * total_time / (idx+1)))
     return total_loss / total_len
 
@@ -146,6 +146,6 @@ if valid_loss is not None:
 if test_loss is not None:
     log_str += format_log(test_loss, 'test')
 
-logging('=' * 100)
-logging(log_str)
-logging('=' * 100)
+logger.info('=' * 100)
+logger.info(log_str)
+logger.info('=' * 100)
diff --git a/examples/run_classifier.py b/examples/run_classifier.py
index 31877a5414..87245fa560 100644
--- a/examples/run_classifier.py
+++ b/examples/run_classifier.py
@@ -15,26 +15,27 @@
 # limitations under the License.
 """BERT finetuning runner."""
 
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
+from __future__ import absolute_import, division, print_function
 
-import csv
-import os
-import logging
 import argparse
+import csv
+import logging
+import os
 import random
-from tqdm import tqdm, trange
+import sys
+from io import open
 
 import numpy as np
 import torch
-from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
+from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
+                              TensorDataset)
 from torch.utils.data.distributed import DistributedSampler
+from tqdm import tqdm, trange
 
-from pytorch_pretrained_bert.tokenization import BertTokenizer
+from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
 from pytorch_pretrained_bert.modeling import BertForSequenceClassification
 from pytorch_pretrained_bert.optimization import BertAdam
-from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
+from pytorch_pretrained_bert.tokenization import BertTokenizer
 
 logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                     datefmt = '%m/%d/%Y %H:%M:%S',
@@ -91,10 +92,12 @@ class DataProcessor(object):
     @classmethod
     def _read_tsv(cls, input_file, quotechar=None):
         """Reads a tab separated value file."""
-        with open(input_file, "r", encoding='utf-8') as f:
+        with open(input_file, "rb") as f:
             reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
             lines = []
             for line in reader:
+                if sys.version_info[0] == 2:
+                    line = list(unicode(cell, 'utf-8') for cell in line)
                 lines.append(line)
             return lines
 
@@ -429,7 +432,8 @@ def main():
 
     if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
         raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
-    os.makedirs(args.output_dir, exist_ok=True)
+    if not os.path.exists(args.output_dir):
+        os.makedirs(args.output_dir)
 
     task_name = args.task_name.lower()
 
@@ -451,7 +455,7 @@ def main():
 
     # Prepare model
     model = BertForSequenceClassification.from_pretrained(args.bert_model,
-              cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank),
+              cache_dir=os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(args.local_rank)),
               num_labels = num_labels)
     if args.fp16:
         model.half()
diff --git a/examples/run_lm_finetuning.py b/examples/run_lm_finetuning.py
index 39df2e99f8..6bd82b4ef5 100644
--- a/examples/run_lm_finetuning.py
+++ b/examples/run_lm_finetuning.py
@@ -15,26 +15,23 @@
 # limitations under the License.
 """BERT finetuning runner."""
 
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
+from __future__ import absolute_import, division, print_function, unicode_literals
 
-import os
-import logging
 import argparse
-from tqdm import tqdm, trange
+import logging
+import os
+import random
+from io import open
 
 import numpy as np
 import torch
-from torch.utils.data import DataLoader, RandomSampler
+from torch.utils.data import DataLoader, Dataset, RandomSampler
 from torch.utils.data.distributed import DistributedSampler
+from tqdm import tqdm, trange
 
-from pytorch_pretrained_bert.tokenization import BertTokenizer
 from pytorch_pretrained_bert.modeling import BertForPreTraining
 from pytorch_pretrained_bert.optimization import BertAdam
-
-from torch.utils.data import Dataset
-import random
+from pytorch_pretrained_bert.tokenization import BertTokenizer
 
 logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                     datefmt='%m/%d/%Y %H:%M:%S',
@@ -185,16 +182,16 @@ class BERTDataset(Dataset):
             if self.line_buffer is None:
                 # read first non-empty line of file
                 while t1 == "" :
-                    t1 = self.file.__next__().strip()
-                    t2 = self.file.__next__().strip()
+                    t1 = next(self.file).strip()
+                    t2 = next(self.file).strip()
             else:
                 # use t2 from previous iteration as new t1
                 t1 = self.line_buffer
-                t2 = self.file.__next__().strip()
+                t2 = next(self.file).strip()
                 # skip empty rows that are used for separating documents and keep track of current doc id
                 while t2 == "" or t1 == "":
-                    t1 = self.file.__next__().strip()
-                    t2 = self.file.__next__().strip()
+                    t1 = next(self.file).strip()
+                    t2 = next(self.file).strip()
                     self.current_doc = self.current_doc+1
             self.line_buffer = t2
 
@@ -228,15 +225,15 @@ class BERTDataset(Dataset):
     def get_next_line(self):
         """ Gets next line of random_file and starts over when reaching end of file"""
         try:
-            line = self.random_file.__next__().strip()
+            line = next(self.random_file).strip()
             #keep track of which document we are currently looking at to later avoid having the same doc as t1
             if line == "":
                 self.current_random_doc = self.current_random_doc + 1
-                line = self.random_file.__next__().strip()
+                line = next(self.random_file).strip()
         except StopIteration:
             self.random_file.close()
             self.random_file = open(self.corpus_path, "r", encoding=self.encoding)
-            line = self.random_file.__next__().strip()
+            line = next(self.random_file).strip()
         return line
 
 
@@ -425,6 +422,7 @@ def main():
                         help="The output directory where the model checkpoints will be written.")
 
     ## Other parameters
+    parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.")
     parser.add_argument("--max_seq_length",
                         default=128,
                         type=int,
@@ -513,7 +511,8 @@ def main():
 
     if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
         raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
-    os.makedirs(args.output_dir, exist_ok=True)
+    if not os.path.exists(args.output_dir):
+        os.makedirs(args.output_dir)
 
     tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
 
@@ -579,7 +578,7 @@ def main():
         if args.local_rank == -1:
             train_sampler = RandomSampler(train_dataset)
         else:
-            #TODO: check if this works with current data generator from disk that relies on file.__next__
+            #TODO: check if this works with current data generator from disk that relies on next(file)
             # (it doesn't return item back by index)
             train_sampler = DistributedSampler(train_dataset)
         train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
@@ -643,4 +642,4 @@ def accuracy(out, labels):
 
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/examples/run_squad.py b/examples/run_squad.py
index 245aee0ff2..a0abe1101f 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -15,29 +15,36 @@
 # limitations under the License.
 """Run BERT on SQuAD."""
 
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
+from __future__ import absolute_import, division, print_function
 
 import argparse
 import collections
-import logging
 import json
+import logging
 import math
 import os
 import random
-import pickle
-from tqdm import tqdm, trange
+import sys
+from io import open
 
 import numpy as np
 import torch
-from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
+from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
+                              TensorDataset)
 from torch.utils.data.distributed import DistributedSampler
+from tqdm import tqdm, trange
 
-from pytorch_pretrained_bert.tokenization import whitespace_tokenize, BasicTokenizer, BertTokenizer
+from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
 from pytorch_pretrained_bert.modeling import BertForQuestionAnswering
 from pytorch_pretrained_bert.optimization import BertAdam
-from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
+from pytorch_pretrained_bert.tokenization import (BasicTokenizer,
+                                                  BertTokenizer,
+                                                  whitespace_tokenize)
+
+if sys.version_info[0] == 2:
+    import cPickle as pickle
+else:
+    import pickle
 
 logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                     datefmt = '%m/%d/%Y %H:%M:%S',
@@ -784,7 +791,8 @@ def main():
 
     if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
         raise ValueError("Output directory () already exists and is not empty.")
-    os.makedirs(args.output_dir, exist_ok=True)
+    if not os.path.exists(args.output_dir):
+        os.makedirs(args.output_dir)
 
     tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
 
@@ -798,7 +806,7 @@ def main():
 
     # Prepare model
     model = BertForQuestionAnswering.from_pretrained(args.bert_model,
-                cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank))
+                cache_dir=os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(args.local_rank)))
 
     if args.fp16:
         model.half()
diff --git a/examples/run_swag.py b/examples/run_swag.py
index 3fb87ae3e7..9c1fa0759b 100644
--- a/examples/run_swag.py
+++ b/examples/run_swag.py
@@ -15,22 +15,25 @@
 # limitations under the License.
 """BERT finetuning runner."""
 
+import argparse
+import csv
 import logging
 import os
-import argparse
 import random
-from tqdm import tqdm, trange
-import csv
+import sys
+from io import open
 
 import numpy as np
 import torch
-from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
+from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
+                              TensorDataset)
 from torch.utils.data.distributed import DistributedSampler
+from tqdm import tqdm, trange
 
-from pytorch_pretrained_bert.tokenization import BertTokenizer
+from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
 from pytorch_pretrained_bert.modeling import BertForMultipleChoice
 from pytorch_pretrained_bert.optimization import BertAdam
-from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
+from pytorch_pretrained_bert.tokenization import BertTokenizer
 
 logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                     datefmt = '%m/%d/%Y %H:%M:%S',
@@ -65,17 +68,17 @@ class SwagExample(object):
 
     def __repr__(self):
         l = [
-            f"swag_id: {self.swag_id}",
-            f"context_sentence: {self.context_sentence}",
-            f"start_ending: {self.start_ending}",
-            f"ending_0: {self.endings[0]}",
-            f"ending_1: {self.endings[1]}",
-            f"ending_2: {self.endings[2]}",
-            f"ending_3: {self.endings[3]}",
+            "swag_id: {}".format(self.swag_id),
+            "context_sentence: {}".format(self.context_sentence),
+            "start_ending: {}".format(self.start_ending),
+            "ending_0: {}".format(self.endings[0]),
+            "ending_1: {}".format(self.endings[1]),
+            "ending_2: {}".format(self.endings[2]),
+            "ending_3: {}".format(self.endings[3]),
         ]
 
         if self.label is not None:
-            l.append(f"label: {self.label}")
+            l.append("label: {}".format(self.label))
 
         return ", ".join(l)
 
@@ -102,7 +105,11 @@ class InputFeatures(object):
 def read_swag_examples(input_file, is_training):
     with open(input_file, 'r', encoding='utf-8') as f:
         reader = csv.reader(f)
-        lines = list(reader)
+        lines = []
+        for line in reader:
+            if sys.version_info[0] == 2:
+                line = list(unicode(cell, 'utf-8') for cell in line)
+            lines.append(line)
 
     if is_training and lines[0][-1] != 'label':
         raise ValueError(
@@ -184,15 +191,15 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
         label = example.label
         if example_index < 5:
             logger.info("*** Example ***")
-            logger.info(f"swag_id: {example.swag_id}")
+            logger.info("swag_id: {}".format(example.swag_id))
             for choice_idx, (tokens, input_ids, input_mask, segment_ids) in enumerate(choices_features):
-                logger.info(f"choice: {choice_idx}")
-                logger.info(f"tokens: {' '.join(tokens)}")
-                logger.info(f"input_ids: {' '.join(map(str, input_ids))}")
-                logger.info(f"input_mask: {' '.join(map(str, input_mask))}")
-                logger.info(f"segment_ids: {' '.join(map(str, segment_ids))}")
+                logger.info("choice: {}".format(choice_idx))
+                logger.info("tokens: {}".format(' '.join(tokens)))
+                logger.info("input_ids: {}".format(' '.join(map(str, input_ids))))
+                logger.info("input_mask: {}".format(' '.join(map(str, input_mask))))
+                logger.info("segment_ids: {}".format(' '.join(map(str, segment_ids))))
             if is_training:
-                logger.info(f"label: {label}")
+                logger.info("label: {}".format(label))
 
         features.append(
             InputFeatures(
@@ -349,7 +356,8 @@ def main():
 
     if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
         raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
-    os.makedirs(args.output_dir, exist_ok=True)
+    if not os.path.exists(args.output_dir):
+        os.makedirs(args.output_dir)
 
     tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
 
@@ -362,7 +370,7 @@ def main():
 
     # Prepare model
     model = BertForMultipleChoice.from_pretrained(args.bert_model,
-        cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank),
+        cache_dir=os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(args.local_rank)),
         num_choices=4)
     if args.fp16:
         model.half()
diff --git a/pytorch_pretrained_bert/__main__.py b/pytorch_pretrained_bert/__main__.py
index 3adb4ac36a..d3db22db60 100644
--- a/pytorch_pretrained_bert/__main__.py
+++ b/pytorch_pretrained_bert/__main__.py
@@ -15,7 +15,7 @@ def main():
         if sys.argv[1] == "convert_tf_checkpoint_to_pytorch":
             try:
                 import tensorflow as tf
-            except ModuleNotFoundError:
+            except ImportError:
                 print("pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, "
                     "In that case, it requires TensorFlow to be installed. Please see "
                     "https://www.tensorflow.org/install/ for installation instructions.")
@@ -43,7 +43,7 @@ def main():
         else:
             try:
                 import tensorflow as tf
-            except ModuleNotFoundError:
+            except ImportError:
                 print("pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, "
                     "In that case, it requires TensorFlow to be installed. Please see "
                     "https://www.tensorflow.org/install/ for installation instructions.")
diff --git a/pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py b/pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py
index 3f3e48172b..bb665fc438 100755
--- a/pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py
+++ b/pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py
@@ -14,14 +14,18 @@
 # limitations under the License.
 """Convert OpenAI GPT checkpoint."""
 
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
+from __future__ import absolute_import, division, print_function
 
 import argparse
+from io import open
+
 import torch
 
-from pytorch_pretrained_bert.modeling_openai import load_tf_weights_in_openai_gpt, OpenAIGPTConfig, OpenAIGPTModel, CONFIG_NAME, WEIGHTS_NAME
+from pytorch_pretrained_bert.modeling_openai import (CONFIG_NAME, WEIGHTS_NAME,
+                                                     OpenAIGPTConfig,
+                                                     OpenAIGPTModel,
+                                                     load_tf_weights_in_openai_gpt)
+
 
 def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_config_file, pytorch_dump_folder_path):
     # Construct model
diff --git a/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py b/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py
index 594d01bfa3..dedea33435 100755
--- a/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py
+++ b/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py
@@ -14,25 +14,31 @@
 # limitations under the License.
 """Convert Transformer XL checkpoint and datasets."""
 
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
+from __future__ import absolute_import, division, print_function
 
+import argparse
 import os
 import sys
-import argparse
-import pickle
+from io import open
 
-import tensorflow as tf
 import torch
-import numpy as np
 
-from pytorch_pretrained_bert.modeling_transfo_xl import TransfoXLConfig, TransfoXLModel, CONFIG_NAME, WEIGHTS_NAME, load_tf_weights_in_transfo_xl
-from pytorch_pretrained_bert.tokenization_transfo_xl import VOCAB_NAME, CORPUS_NAME
+import pytorch_pretrained_bert.tokenization_transfo_xl as data_utils
+from pytorch_pretrained_bert.modeling_transfo_xl import (CONFIG_NAME,
+                                                         WEIGHTS_NAME,
+                                                         TransfoXLConfig,
+                                                         TransfoXLModel,
+                                                         load_tf_weights_in_transfo_xl)
+from pytorch_pretrained_bert.tokenization_transfo_xl import (CORPUS_NAME,
+                                                             VOCAB_NAME)
+
+if sys.version_info[0] == 2:
+    import cPickle as pickle
+else:
+    import pickle
 
 # We do this to be able to load the python 2 datasets pickles
 # See e.g. https://stackoverflow.com/questions/2121874/python-pickling-after-changing-a-modules-directory/2121918#2121918
-import pytorch_pretrained_bert.tokenization_transfo_xl as data_utils
 data_utils.Vocab = data_utils.TransfoXLTokenizer
 data_utils.Corpus = data_utils.TransfoXLCorpus
 sys.modules['data_utils'] = data_utils
diff --git a/pytorch_pretrained_bert/file_utils.py b/pytorch_pretrained_bert/file_utils.py
index 43fa8ca87e..0b5fc2c217 100644
--- a/pytorch_pretrained_bert/file_utils.py
+++ b/pytorch_pretrained_bert/file_utils.py
@@ -3,31 +3,39 @@ Utilities for working with the local dataset cache.
 This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp
 Copyright by the AllenNLP authors.
 """
+from __future__ import (absolute_import, division, print_function, unicode_literals)
 
-import os
+import json
 import logging
+import os
 import shutil
 import tempfile
-import json
-from urllib.parse import urlparse
-from pathlib import Path
-from typing import Optional, Tuple, Union, IO, Callable, Set
-from hashlib import sha256
 from functools import wraps
-
-from tqdm import tqdm
+from hashlib import sha256
+from io import open
 
 import boto3
-from botocore.exceptions import ClientError
 import requests
+from botocore.exceptions import ClientError
+from tqdm import tqdm
+
+try:
+    from urllib.parse import urlparse
+except ImportError:
+    from urlparse import urlparse
+
+try:
+    from pathlib import Path
+    PYTORCH_PRETRAINED_BERT_CACHE = Path(os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
+                                                   Path.home() / '.pytorch_pretrained_bert'))
+except ImportError:
+    PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
+                                              os.path.join(os.path.expanduser("~"), '.pytorch_pretrained_bert'))
 
 logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
 
-PYTORCH_PRETRAINED_BERT_CACHE = Path(os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
-                                               Path.home() / '.pytorch_pretrained_bert'))
 
-
-def url_to_filename(url: str, etag: str = None) -> str:
+def url_to_filename(url, etag=None):
     """
     Convert `url` into a hashed filename in a repeatable way.
     If `etag` is specified, append its hash to the url's, delimited
@@ -45,25 +53,23 @@ def url_to_filename(url: str, etag: str = None) -> str:
     return filename
 
 
-def filename_to_url(filename: str, cache_dir: Union[str, Path] = None) -> Tuple[str, str]:
+def filename_to_url(filename, cache_dir=None):
     """
     Return the url and etag (which may be ``None``) stored for `filename`.
-    Raise ``FileNotFoundError`` if `filename` or its stored metadata do not exist.
+    Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist.
     """
     if cache_dir is None:
         cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
-    if isinstance(cache_dir, Path):
-        cache_dir = str(cache_dir)
 
     cache_path = os.path.join(cache_dir, filename)
     if not os.path.exists(cache_path):
-        raise FileNotFoundError("file {} not found".format(cache_path))
+        raise EnvironmentError("file {} not found".format(cache_path))
 
     meta_path = cache_path + '.json'
     if not os.path.exists(meta_path):
-        raise FileNotFoundError("file {} not found".format(meta_path))
+        raise EnvironmentError("file {} not found".format(meta_path))
 
-    with open(meta_path) as meta_file:
+    with open(meta_path, encoding="utf-8") as meta_file:
         metadata = json.load(meta_file)
     url = metadata['url']
     etag = metadata['etag']
@@ -71,7 +77,7 @@ def filename_to_url(filename: str, cache_dir: Union[str, Path] = None) -> Tuple[
     return url, etag
 
 
-def cached_path(url_or_filename: Union[str, Path], cache_dir: Union[str, Path] = None) -> str:
+def cached_path(url_or_filename, cache_dir=None):
     """
     Given something that might be a URL (or might be a local path),
     determine which. If it's a URL, download the file and cache it, and
@@ -80,10 +86,6 @@ def cached_path(url_or_filename: Union[str, Path], cache_dir: Union[str, Path] =
     """
     if cache_dir is None:
         cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
-    if isinstance(url_or_filename, Path):
-        url_or_filename = str(url_or_filename)
-    if isinstance(cache_dir, Path):
-        cache_dir = str(cache_dir)
 
     parsed = urlparse(url_or_filename)
 
@@ -95,13 +97,13 @@ def cached_path(url_or_filename: Union[str, Path], cache_dir: Union[str, Path] =
         return url_or_filename
     elif parsed.scheme == '':
         # File, but it doesn't exist.
-        raise FileNotFoundError("file {} not found".format(url_or_filename))
+        raise EnvironmentError("file {} not found".format(url_or_filename))
     else:
         # Something unknown
         raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename))
 
 
-def split_s3_path(url: str) -> Tuple[str, str]:
+def split_s3_path(url):
     """Split a full s3 path into the bucket name and path."""
     parsed = urlparse(url)
     if not parsed.netloc or not parsed.path:
@@ -114,19 +116,19 @@ def split_s3_path(url: str) -> Tuple[str, str]:
     return bucket_name, s3_path
 
 
-def s3_request(func: Callable):
+def s3_request(func):
     """
     Wrapper function for s3 requests in order to create more helpful error
     messages.
     """
 
     @wraps(func)
-    def wrapper(url: str, *args, **kwargs):
+    def wrapper(url, *args, **kwargs):
         try:
             return func(url, *args, **kwargs)
         except ClientError as exc:
             if int(exc.response["Error"]["Code"]) == 404:
-                raise FileNotFoundError("file {} not found".format(url))
+                raise EnvironmentError("file {} not found".format(url))
             else:
                 raise
 
@@ -134,7 +136,7 @@ def s3_request(func: Callable):
 
 
 @s3_request
-def s3_etag(url: str) -> Optional[str]:
+def s3_etag(url):
     """Check ETag on S3 object."""
     s3_resource = boto3.resource("s3")
     bucket_name, s3_path = split_s3_path(url)
@@ -143,14 +145,14 @@ def s3_etag(url: str) -> Optional[str]:
 
 
 @s3_request
-def s3_get(url: str, temp_file: IO) -> None:
+def s3_get(url, temp_file):
     """Pull a file directly from S3."""
     s3_resource = boto3.resource("s3")
     bucket_name, s3_path = split_s3_path(url)
     s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file)
 
 
-def http_get(url: str, temp_file: IO) -> None:
+def http_get(url, temp_file):
     req = requests.get(url, stream=True)
     content_length = req.headers.get('Content-Length')
     total = int(content_length) if content_length is not None else None
@@ -162,17 +164,16 @@ def http_get(url: str, temp_file: IO) -> None:
     progress.close()
 
 
-def get_from_cache(url: str, cache_dir: Union[str, Path] = None) -> str:
+def get_from_cache(url, cache_dir=None):
     """
     Given a URL, look for the corresponding dataset in the local cache.
     If it's not there, download it. Then return the path to the cached file.
     """
     if cache_dir is None:
         cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
-    if isinstance(cache_dir, Path):
-        cache_dir = str(cache_dir)
 
-    os.makedirs(cache_dir, exist_ok=True)
+    if not os.path.exists(cache_dir):
+        os.makedirs(cache_dir)
 
     # Get eTag to add to filename, if it exists.
     if url.startswith("s3://"):
@@ -213,7 +214,7 @@ def get_from_cache(url: str, cache_dir: Union[str, Path] = None) -> str:
             logger.info("creating metadata file for %s", cache_path)
             meta = {'url': url, 'etag': etag}
             meta_path = cache_path + '.json'
-            with open(meta_path, 'w') as meta_file:
+            with open(meta_path, 'w', encoding="utf-8") as meta_file:
                 json.dump(meta, meta_file)
 
             logger.info("removing temp file %s", temp_file.name)
@@ -221,7 +222,7 @@ def get_from_cache(url: str, cache_dir: Union[str, Path] = None) -> str:
     return cache_path
 
 
-def read_set_from_file(filename: str) -> Set[str]:
+def read_set_from_file(filename):
     '''
     Extract a de-duped collection (set) of text from a file.
     Expected file format is one item per line.
@@ -233,7 +234,7 @@ def read_set_from_file(filename: str) -> Set[str]:
     return collection
 
 
-def get_file_extension(path: str, dot=True, lower: bool = True):
+def get_file_extension(path, dot=True, lower=True):
     ext = os.path.splitext(path)[1]
     ext = ext if dot else ext[1:]
     return ext.lower() if lower else ext
diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py
index 7f0c2bd47b..72d0f602d6 100644
--- a/pytorch_pretrained_bert/modeling.py
+++ b/pytorch_pretrained_bert/modeling.py
@@ -15,18 +15,18 @@
 # limitations under the License.
 """PyTorch BERT model."""
 
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
+from __future__ import absolute_import, division, print_function, unicode_literals
 
-import os
 import copy
 import json
-import math
 import logging
+import math
+import os
+import shutil
 import tarfile
 import tempfile
-import shutil
+import sys
+from io import open
 
 import torch
 from torch import nn
@@ -56,7 +56,7 @@ def load_tf_weights_in_bert(model, tf_checkpoint_path):
         import re
         import numpy as np
         import tensorflow as tf
-    except ModuleNotFoundError:
+    except ImportError:
         print("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
             "https://www.tensorflow.org/install/ for installation instructions.")
         raise
@@ -164,7 +164,8 @@ class BertConfig(object):
             initializer_range: The sttdev of the truncated_normal_initializer for
                 initializing all weight matrices.
         """
-        if isinstance(vocab_size_or_config_json_file, str):
+        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
+                        and isinstance(vocab_size_or_config_json_file, unicode)):
             with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
                 json_config = json.loads(reader.read())
             for key, value in json_config.items():
@@ -343,8 +344,10 @@ class BertIntermediate(nn.Module):
     def __init__(self, config):
         super(BertIntermediate, self).__init__()
         self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
-        self.intermediate_act_fn = ACT2FN[config.hidden_act] \
-            if isinstance(config.hidden_act, str) else config.hidden_act
+        if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
+            self.intermediate_act_fn = ACT2FN[config.hidden_act]
+        else:
+            self.intermediate_act_fn = config.hidden_act
 
     def forward(self, hidden_states):
         hidden_states = self.dense(hidden_states)
@@ -416,8 +419,10 @@ class BertPredictionHeadTransform(nn.Module):
     def __init__(self, config):
         super(BertPredictionHeadTransform, self).__init__()
         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
-        self.transform_act_fn = ACT2FN[config.hidden_act] \
-            if isinstance(config.hidden_act, str) else config.hidden_act
+        if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
+            self.transform_act_fn = ACT2FN[config.hidden_act]
+        else:
+            self.transform_act_fn = config.hidden_act
         self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
 
     def forward(self, hidden_states):
@@ -542,7 +547,7 @@ class BertPreTrainedModel(nn.Module):
         # redirect to the cache, if necessary
         try:
             resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
-        except FileNotFoundError:
+        except EnvironmentError:
             logger.error(
                 "Model name '{}' was not found in model name list ({}). "
                 "We assumed '{}' was a path or url but couldn't find any file "
diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py
index 14d5cf7ef2..11325259fb 100644
--- a/pytorch_pretrained_bert/modeling_openai.py
+++ b/pytorch_pretrained_bert/modeling_openai.py
@@ -24,6 +24,8 @@ import os
 import shutil
 import tarfile
 import tempfile
+import sys
+from io import open
 
 import torch
 import torch.nn as nn
@@ -160,7 +162,8 @@ class OpenAIGPTConfig(object):
             initializer_range: The sttdev of the truncated_normal_initializer for
                 initializing all weight matrices.
         """
-        if isinstance(vocab_size_or_config_json_file, str):
+        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
+                        and isinstance(vocab_size_or_config_json_file, unicode)):
             with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
                 json_config = json.loads(reader.read())
             for key, value in json_config.items():
@@ -442,7 +445,7 @@ class OpenAIGPTPreTrainedModel(nn.Module):
         # redirect to the cache, if necessary
         try:
             resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
-        except FileNotFoundError:
+        except EnvironmentError:
             logger.error(
                 "Model name '{}' was not found in model name list ({}). "
                 "We assumed '{}' was a path or url but couldn't find any file "
@@ -641,7 +644,8 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
         hidden_states = inputs_embeds + position_embeds + token_type_embeds
         for block in self.h:
             hidden_states = block(hidden_states)
-        return hidden_states.view(*input_shape, hidden_states.size(-1))
+        output_shape = input_shape + (hidden_states.size(-1),)
+        return hidden_states.view(*output_shape)
 
 
 class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
diff --git a/pytorch_pretrained_bert/modeling_transfo_xl.py b/pytorch_pretrained_bert/modeling_transfo_xl.py
index 204eef738c..2db8a964ac 100644
--- a/pytorch_pretrained_bert/modeling_transfo_xl.py
+++ b/pytorch_pretrained_bert/modeling_transfo_xl.py
@@ -27,6 +27,8 @@ import tarfile
 import tempfile
 import shutil
 import collections
+import sys
+from io import open
 
 import torch
 import torch.nn as nn
@@ -124,7 +126,7 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path):
     try:
         import numpy as np
         import tensorflow as tf
-    except ModuleNotFoundError:
+    except ImportError:
         print("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
             "https://www.tensorflow.org/install/ for installation instructions.")
         raise
@@ -239,7 +241,8 @@ class TransfoXLConfig(object):
             proj_init_std: parameters initialized by N(0, init_std)
             init_std: parameters initialized by N(0, init_std)
         """
-        if isinstance(vocab_size_or_config_json_file, str):
+        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
+                        and isinstance(vocab_size_or_config_json_file, unicode)):
             with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
                 json_config = json.loads(reader.read())
             for key, value in json_config.items():
@@ -503,11 +506,12 @@ class RelMultiHeadAttn(nn.Module):
         return x
 
     def _rel_shift(self, x, zero_triu=False):
-        zero_pad = torch.zeros((x.size(0), 1, *x.size()[2:]),
-                               device=x.device, dtype=x.dtype)
+        zero_pad_shape = (x.size(0), 1) + x.size()[2:]
+        zero_pad = torch.zeros(zero_pad_shape, device=x.device, dtype=x.dtype)
         x_padded = torch.cat([zero_pad, x], dim=1)
 
-        x_padded = x_padded.view(x.size(1) + 1, x.size(0), *x.size()[2:])
+        x_padded_shape = (x.size(1) + 1, x.size(0)) + x.size()[2:]
+        x_padded = x_padded.view(*x_padded_shape)
 
         x = x_padded[1:].view_as(x)
 
@@ -797,7 +801,8 @@ class AdaptiveEmbedding(nn.Module):
 
                 emb_flat.index_copy_(0, indices_i, emb_i)
 
-            embed = emb_flat.view(*inp.size(), self.d_proj)
+            embed_shape = inp.size() + (self.d_proj,)
+            embed = emb_flat.view(embed_shape)
 
         embed.mul_(self.emb_scale)
 
@@ -905,7 +910,7 @@ class TransfoXLPreTrainedModel(nn.Module):
         try:
             resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
             resolved_config_file = cached_path(config_file, cache_dir=cache_dir)
-        except FileNotFoundError:
+        except EnvironmentError:
             logger.error(
                 "Model name '{}' was not found in model name list ({}). "
                 "We assumed '{}' was a path or url but couldn't find files {} and {} "
diff --git a/pytorch_pretrained_bert/tokenization.py b/pytorch_pretrained_bert/tokenization.py
index 9cfb3d8ce9..3527fd8594 100644
--- a/pytorch_pretrained_bert/tokenization.py
+++ b/pytorch_pretrained_bert/tokenization.py
@@ -14,14 +14,13 @@
 # limitations under the License.
 """Tokenization classes."""
 
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
+from __future__ import absolute_import, division, print_function, unicode_literals
 
 import collections
-import unicodedata
-import os
 import logging
+import os
+import unicodedata
+from io import open
 
 from .file_utils import cached_path
 
@@ -129,7 +128,7 @@ class BertTokenizer(object):
         # redirect to the cache, if necessary
         try:
             resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
-        except FileNotFoundError:
+        except EnvironmentError:
             logger.error(
                 "Model name '{}' was not found in model name list ({}). "
                 "We assumed '{}' was a path or url but couldn't find any file "
diff --git a/pytorch_pretrained_bert/tokenization_openai.py b/pytorch_pretrained_bert/tokenization_openai.py
index 616b68db59..55ac2bc892 100644
--- a/pytorch_pretrained_bert/tokenization_openai.py
+++ b/pytorch_pretrained_bert/tokenization_openai.py
@@ -13,11 +13,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Tokenization classes for OpenAI GPT."""
+from __future__ import (absolute_import, division, print_function,
+                        unicode_literals)
+
+import json
+import logging
 import os
 import re
-import json
+import sys
+from io import open
+
 from tqdm import tqdm
-import logging
 
 from .file_utils import cached_path
 
@@ -82,7 +88,7 @@ class OpenAIGPTTokenizer(object):
         try:
             resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
             resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir)
-        except FileNotFoundError:
+        except EnvironmentError:
             logger.error(
                 "Model name '{}' was not found in model name list ({}). "
                 "We assumed '{}' was a path or url but couldn't find files {} and {} "
@@ -119,7 +125,7 @@ class OpenAIGPTTokenizer(object):
         self.max_len = max_len if max_len is not None else int(1e12)
         self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat'])
         self.fix_text = ftfy.fix_text
-        self.encoder = json.load(open(vocab_file))
+        self.encoder = json.load(open(vocab_file, encoding="utf-8"))
         self.decoder = {v:k for k,v in self.encoder.items()}
         merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
         merges = [tuple(merge.split()) for merge in merges]
@@ -196,7 +202,7 @@ class OpenAIGPTTokenizer(object):
     def convert_tokens_to_ids(self, tokens):
         """Converts a sequence of tokens into ids using the vocab."""
         ids = []
-        if isinstance(tokens, str):
+        if isinstance(tokens, str) or (sys.version_info[0] == 2 and isinstance(tokens, unicode)):
             if tokens in self.special_tokens:
                 return self.special_tokens[tokens]
             else:
diff --git a/pytorch_pretrained_bert/tokenization_transfo_xl.py b/pytorch_pretrained_bert/tokenization_transfo_xl.py
index db626f7755..860b274f19 100644
--- a/pytorch_pretrained_bert/tokenization_transfo_xl.py
+++ b/pytorch_pretrained_bert/tokenization_transfo_xl.py
@@ -16,16 +16,27 @@
 """ Tokenization classes for Transformer XL model.
     Adapted from https://github.com/kimiyoung/transformer-xl.
 """
+from __future__ import (absolute_import, division, print_function,
+                        unicode_literals)
 
-import os
 import glob
 import logging
-import pickle
-import torch
+import os
+import sys
 from collections import Counter, OrderedDict
+from io import open
+
+import torch
+import numpy as np
 
 from .file_utils import cached_path
 
+if sys.version_info[0] == 2:
+    import cPickle as pickle
+else:
+    import pickle
+
+
 logger = logging.getLogger(__name__)
 
 PRETRAINED_VOCAB_ARCHIVE_MAP = {
@@ -55,7 +66,7 @@ class TransfoXLTokenizer(object):
         # redirect to the cache, if necessary
         try:
             resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
-        except FileNotFoundError:
+        except EnvironmentError:
             logger.error(
                 "Model name '{}' was not found in model name list ({}). "
                 "We assumed '{}' was a path or url but couldn't find files {} "
@@ -422,7 +433,7 @@ class TransfoXLCorpus(object):
         # redirect to the cache, if necessary
         try:
             resolved_corpus_file = cached_path(corpus_file, cache_dir=cache_dir)
-        except FileNotFoundError:
+        except EnvironmentError:
             logger.error(
                 "Model name '{}' was not found in model name list ({}). "
                 "We assumed '{}' was a path or url but couldn't find files {} "
diff --git a/setup.py b/setup.py
index cf2f906100..57e27f0d2a 100644
--- a/setup.py
+++ b/setup.py
@@ -33,6 +33,7 @@ To create the package for pypi.
 7. Copy the release notes from RELEASE.md to the tag in github once everything is looking hunky-dory.
 
 """
+from io import open
 from setuptools import find_packages, setup
 
 setup(
@@ -58,7 +59,7 @@ setup(
         "pytorch_pretrained_bert=pytorch_pretrained_bert.__main__:main",
       ]
     },
-    python_requires='>=3.5.0',
+    # python_requires='>=3.5.0',
     tests_require=['pytest'],
     classifiers=[
           'Intended Audience :: Science/Research',
diff --git a/tests/tokenization_test.py b/tests/tokenization_test.py
index e1474e938b..d4699adadb 100644
--- a/tests/tokenization_test.py
+++ b/tests/tokenization_test.py
@@ -18,6 +18,7 @@ from __future__ import print_function
 
 import os
 import unittest
+from io import open
 
 from pytorch_pretrained_bert.tokenization import (BertTokenizer, BasicTokenizer, WordpieceTokenizer,
                                                   _is_whitespace, _is_control, _is_punctuation)
@@ -30,7 +31,7 @@ class TokenizationTest(unittest.TestCase):
             "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
             "##ing", ","
         ]
-        with open("/tmp/bert_tokenizer_test.txt", "w") as vocab_writer:
+        with open("/tmp/bert_tokenizer_test.txt", "w", encoding='utf-8') as vocab_writer:
             vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
 
             vocab_file = vocab_writer.name
@@ -49,7 +50,7 @@ class TokenizationTest(unittest.TestCase):
             "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
             "##ing", ","
         ]
-        with open("/tmp/bert_tokenizer_test.txt", "w") as vocab_writer:
+        with open("/tmp/bert_tokenizer_test.txt", "w", encoding='utf-8') as vocab_writer:
             vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
             vocab_file = vocab_writer.name
 

From 34bdb7f9cb82593a64a7e94bf26325f3ba35f0d8 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Wed, 6 Feb 2019 00:25:12 +0100
Subject: [PATCH 33/82] update circle-ci for python 2.7 and 3.5

---
 .circleci/config.yml | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 2c8f906aba..0efb5f4b0b 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -1,11 +1,26 @@
 version: 2
 jobs:
-    build:
+    build_py3:
         working_directory: ~/pytorch-pretrained-BERT
         docker:
-            - image: circleci/python:3.7
+            - image: circleci/python:3.5
         steps:
             - checkout
             - run: sudo pip install --progress-bar off .
             - run: sudo pip install pytest
             - run: python -m pytest -sv tests/
+    build_py2:
+        working_directory: ~/pytorch-pretrained-BERT
+        docker:
+            - image: circleci/python:2.7
+        steps:
+            - checkout
+            - run: sudo pip install --progress-bar off .
+            - run: sudo pip install pytest
+            - run: python -m pytest -sv tests/
+workflows:
+  version: 2
+  build_and_test:
+    jobs:
+      - build_py3
+      - build_py2
\ No newline at end of file

From ba9e4eb3541837bf32442cc0deb5a8a7c7961f3b Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Wed, 6 Feb 2019 00:28:00 +0100
Subject: [PATCH 34/82] fix unicode in tokenization tests

---
 tests/tokenization_test.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/tests/tokenization_test.py b/tests/tokenization_test.py
index d4699adadb..6a14e05ae8 100644
--- a/tests/tokenization_test.py
+++ b/tests/tokenization_test.py
@@ -12,16 +12,17 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
+from __future__ import absolute_import, division, print_function, unicode_literals
 
 import os
 import unittest
 from io import open
 
-from pytorch_pretrained_bert.tokenization import (BertTokenizer, BasicTokenizer, WordpieceTokenizer,
-                                                  _is_whitespace, _is_control, _is_punctuation)
+from pytorch_pretrained_bert.tokenization import (BasicTokenizer,
+                                                  BertTokenizer,
+                                                  WordpieceTokenizer,
+                                                  _is_control, _is_punctuation,
+                                                  _is_whitespace)
 
 
 class TokenizationTest(unittest.TestCase):

From 973926431ebffb844b0a5090351e905ec55bbccb Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Wed, 6 Feb 2019 15:42:29 +0100
Subject: [PATCH 35/82] fix differences with tensorflow version (mem cells and
 adaptive softmax clusters)

---
 .../modeling_transfo_xl.py                    | 17 ++---
 .../modeling_transfo_xl_utilities.py          | 72 ++++++++++++++++++-
 2 files changed, 75 insertions(+), 14 deletions(-)

diff --git a/pytorch_pretrained_bert/modeling_transfo_xl.py b/pytorch_pretrained_bert/modeling_transfo_xl.py
index 2db8a964ac..000d7ac19b 100644
--- a/pytorch_pretrained_bert/modeling_transfo_xl.py
+++ b/pytorch_pretrained_bert/modeling_transfo_xl.py
@@ -1088,7 +1088,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
         if self.mem_len > 0:
             mems = []
             param = next(self.parameters())
-            for i in range(self.n_layer+1):
+            for i in range(self.n_layer):
                 empty = torch.zeros(self.mem_len, data.size(1), self.config.d_model,
                                     dtype=param.dtype, device=param.device)
                 mems.append(empty)
@@ -1151,15 +1151,14 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
             core_out = self.drop(word_emb)
             pos_emb = self.drop(pos_emb)
 
-            hids.append(core_out)
             for i, layer in enumerate(self.layers):
+                hids.append(core_out)
                 mems_i = None if mems is None else mems[i]
                 core_out = layer(core_out, pos_emb, dec_attn_mask=dec_attn_mask, mems=mems_i)
-                hids.append(core_out)
         elif self.attn_type == 1: # learnable
             core_out = self.drop(word_emb)
-            hids.append(core_out)
             for i, layer in enumerate(self.layers):
+                hids.append(core_out)
                 if self.clamp_len > 0:
                     r_emb = self.r_emb[i][-self.clamp_len :]
                     r_bias = self.r_bias[i][-self.clamp_len :]
@@ -1169,7 +1168,6 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
                 mems_i = None if mems is None else mems[i]
                 core_out = layer(core_out, r_emb, self.r_w_bias[i],
                         r_bias, dec_attn_mask=dec_attn_mask, mems=mems_i)
-                hids.append(core_out)
         elif self.attn_type == 2: # absolute
             pos_seq = torch.arange(klen - 1, -1, -1.0, device=word_emb.device,
                                    dtype=word_emb.dtype)
@@ -1179,19 +1177,18 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
 
             core_out = self.drop(word_emb + pos_emb[-qlen:])
 
-            hids.append(core_out)
             for i, layer in enumerate(self.layers):
+                hids.append(core_out)
                 mems_i = None if mems is None else mems[i]
                 if mems_i is not None and i == 0:
                     mems_i += pos_emb[:mlen]
                 core_out = layer(core_out, dec_attn_mask=dec_attn_mask,
                                  mems=mems_i)
-                hids.append(core_out)
         elif self.attn_type == 3:
             core_out = self.drop(word_emb)
 
-            hids.append(core_out)
             for i, layer in enumerate(self.layers):
+                hids.append(core_out)
                 mems_i = None if mems is None else mems[i]
                 if mems_i is not None and mlen > 0:
                     cur_emb = self.r_emb[i][:-qlen]
@@ -1206,7 +1203,6 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
 
                 core_out = layer(core_out, dec_attn_mask=dec_attn_mask,
                                  mems=mems_i)
-                hids.append(core_out)
 
         core_out = self.drop(core_out)
 
@@ -1241,5 +1237,4 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
         if new_mems is None:
             return [loss]
         else:
-            return [loss] + new_mems
-
+            return (loss, new_mems)
diff --git a/pytorch_pretrained_bert/modeling_transfo_xl_utilities.py b/pytorch_pretrained_bert/modeling_transfo_xl_utilities.py
index 4cd04b67a7..52f80e380f 100644
--- a/pytorch_pretrained_bert/modeling_transfo_xl_utilities.py
+++ b/pytorch_pretrained_bert/modeling_transfo_xl_utilities.py
@@ -93,6 +93,9 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
         '''
             hidden :: [len*bsz x d_proj]
             target :: [len*bsz]
+            We could replace this implementation by the native PyTorch one
+            if there were an option to set bias on all clusters in the native one.
+            See https://github.com/pytorch/pytorch/blob/dbe6a7a9ff1a364a8706bf5df58a1ca96d2fd9da/torch/nn/modules/adaptive.py#L138
         '''
 
         if hidden.size(0) != target.size(0):
@@ -156,9 +159,9 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
 
                     tail_logit_i = self._compute_logit(hidden_i, weight_i, bias_i, proj_i)
                     tail_logprob_i = F.log_softmax(tail_logit_i, dim=1)
-
-                    logprob_i = head_logprob_i[:, -i] \
-                              + tail_logprob_i.gather(1, target_i[:,None]).squeeze(1)
+                    cluster_prob_idx = self.cutoffs[0] + i - 1  # No probability for the head cluster
+                    logprob_i = head_logprob_i[:, cluster_prob_idx] \
+                              + tail_logprob_i.gather(1, target_i[:, None]).squeeze(1)
 
                 if (hasattr(self, 'keep_order') and self.keep_order) or keep_order:
                     nll.index_copy_(0, indices_i, -logprob_i)
@@ -169,6 +172,69 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
 
         return nll
 
+
+    def log_prob(self, hidden):
+        r""" Computes log probabilities for all :math:`n\_token` classes
+        Adapted from: https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/adaptive.py
+        Args:
+            hidden (Tensor): a minibatch of examples
+        Returns:
+            log-probabilities for each class :math:`c`
+            in range :math:`0 <= c < n\_token`, where :math:`n\_token`
+            is ``self.n_token``.
+        Shape:
+            - Input: :math:`(N, in\_features)`
+            - Output: :math:`(N, n\_token)`
+        """
+        if self.n_clusters == 0:
+            logit = self._compute_logit(hidden, self.out_layers[0].weight,
+                                        self.out_layers[0].bias, self.out_projs[0])
+            return F.log_softmax(logit, dim=-1)
+        else:
+            # construct weights and biases
+            weights, biases = [], []
+            for i in range(len(self.cutoffs)):
+                if self.div_val == 1:
+                    l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
+                    weight_i = self.out_layers[0].weight[l_idx:r_idx]
+                    bias_i = self.out_layers[0].bias[l_idx:r_idx]
+                else:
+                    weight_i = self.out_layers[i].weight
+                    bias_i = self.out_layers[i].bias
+
+                if i == 0:
+                    # append the cluster rows so the head also outputs one logit per tail cluster
+                    weight_i = torch.cat([weight_i, self.cluster_weight], dim=0)
+                    bias_i = torch.cat([bias_i, self.cluster_bias], dim=0)
+
+                weights.append(weight_i)
+                biases.append(bias_i)
+
+            head_weight, head_bias, head_proj = weights[0], biases[0], self.out_projs[0]
+            head_logit = self._compute_logit(hidden, head_weight, head_bias, head_proj)
+
+            out = hidden.new_empty((head_logit.size(0), self.n_token))
+            head_logprob = F.log_softmax(head_logit, dim=1)
+
+            cutoff_values = [0] + self.cutoffs
+            for i in range(len(cutoff_values) - 1):
+                start_idx, stop_idx = cutoff_values[i], cutoff_values[i + 1]
+
+                if i == 0:
+                    out[:, :self.cutoffs[0]] = head_logprob[:, :self.cutoffs[0]]
+                else:
+                    weight_i, bias_i, proj_i = weights[i], biases[i], self.out_projs[i]
+
+                    tail_logit_i = self._compute_logit(hidden, weight_i, bias_i, proj_i)
+                    tail_logprob_i = F.log_softmax(tail_logit_i, dim=1)
+                    # same head column as in forward(); [:, None] broadcasts over the cluster's tokens
+                    cluster_prob_idx = self.cutoffs[0] + i - 1
+                    logprob_i = head_logprob[:, cluster_prob_idx, None] + tail_logprob_i
+                    out[:, start_idx:stop_idx] = logprob_i
+
+            return out
+
+
 class LogUniformSampler(object):
     def __init__(self, range_max, n_sample):
         """

From ed47cb6cbaa8fb039117b67ee2d828231b24346c Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Wed, 6 Feb 2019 16:22:17 +0100
Subject: [PATCH 36/82] fixing transfo eval script

---
 examples/eval_transfo_xl.py                    | 2 +-
 pytorch_pretrained_bert/modeling_transfo_xl.py | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/examples/eval_transfo_xl.py b/examples/eval_transfo_xl.py
index 3326454ea1..9a0975f186 100644
--- a/examples/eval_transfo_xl.py
+++ b/examples/eval_transfo_xl.py
@@ -111,7 +111,7 @@ def evaluate(eval_iter):
         mems = tuple()
         for idx, (data, target, seq_len) in enumerate(eval_iter):
             ret = model(data, target, *mems)
-            loss, mems = ret[0], ret[1:]
+            loss, mems = ret
             loss = loss.mean()
             total_loss += seq_len * loss.item()
             total_len += seq_len
diff --git a/pytorch_pretrained_bert/modeling_transfo_xl.py b/pytorch_pretrained_bert/modeling_transfo_xl.py
index 000d7ac19b..53ebca6e92 100644
--- a/pytorch_pretrained_bert/modeling_transfo_xl.py
+++ b/pytorch_pretrained_bert/modeling_transfo_xl.py
@@ -1215,7 +1215,8 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
         # So, have to initialize size(0) mems inside the model forward.
         # Moreover, have to return new_mems to allow nn.DataParallel to piece
         # them together.
-        if not mems: mems = self.init_mems(data)
+        if not mems:
+            mems = self.init_mems(data)
 
         hidden, new_mems = self._forward(data, mems=mems)
         if target is None:

From 2df41663f1f977ac1cf916e27746a228b8cb636f Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Thu, 7 Feb 2019 17:05:49 +0100
Subject: [PATCH 37/82] added test

---
 tests/modeling_openai_test.py     |   3 +
 tests/modeling_test.py            |   7 +
 tests/modeling_transfo_xl_test.py | 218 ++++++++++++++++++++++++++++++
 3 files changed, 228 insertions(+)
 create mode 100644 tests/modeling_transfo_xl_test.py

diff --git a/tests/modeling_openai_test.py b/tests/modeling_openai_test.py
index 71a1dfd3c6..81892a981a 100644
--- a/tests/modeling_openai_test.py
+++ b/tests/modeling_openai_test.py
@@ -114,6 +114,7 @@ class OpenAIGPTModelTest(unittest.TestCase):
         def create_openai_model(self, config, input_ids, token_type_ids, position_ids,
                                 mc_labels, lm_labels, mc_token_mask):
             model = OpenAIGPTModel(config)
+            model.eval()
             hidden_states = model(input_ids, position_ids, token_type_ids)
             outputs = {
                 "hidden_states": hidden_states,
@@ -129,6 +130,7 @@ class OpenAIGPTModelTest(unittest.TestCase):
         def create_openai_lm_head(self, config, input_ids, token_type_ids, position_ids,
                                        mc_labels, lm_labels, mc_token_mask):
             model = OpenAIGPTLMHeadModel(config)
+            model.eval()
             loss = model(input_ids, position_ids, token_type_ids, lm_labels)
             lm_logits = model(input_ids, position_ids, token_type_ids)
             outputs = {
@@ -151,6 +153,7 @@ class OpenAIGPTModelTest(unittest.TestCase):
         def create_openai_double_heads(self, config, input_ids, token_type_ids, position_ids,
                                        mc_labels, lm_labels, mc_token_mask):
             model = OpenAIGPTDoubleHeadsModel(config)
+            model.eval()
             loss = model(input_ids, mc_token_mask,
                          lm_labels=lm_labels, mc_labels=mc_labels,
                          token_type_ids=token_type_ids, position_ids=position_ids)
diff --git a/tests/modeling_test.py b/tests/modeling_test.py
index b566512139..c7a031cfb0 100644
--- a/tests/modeling_test.py
+++ b/tests/modeling_test.py
@@ -114,6 +114,7 @@ class BertModelTest(unittest.TestCase):
 
         def create_bert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels):
             model = BertModel(config=config)
+            model.eval()
             all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
             outputs = {
                 "sequence_output": all_encoder_layers[-1],
@@ -134,6 +135,7 @@ class BertModelTest(unittest.TestCase):
 
         def create_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels):
             model = BertForMaskedLM(config=config)
+            model.eval()
             loss = model(input_ids, token_type_ids, input_mask, token_labels)
             prediction_scores = model(input_ids, token_type_ids, input_mask)
             outputs = {
@@ -149,6 +151,7 @@ class BertModelTest(unittest.TestCase):
 
         def create_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels):
             model = BertForNextSentencePrediction(config=config)
+            model.eval()
             loss = model(input_ids, token_type_ids, input_mask, sequence_labels)
             seq_relationship_score = model(input_ids, token_type_ids, input_mask)
             outputs = {
@@ -165,6 +168,7 @@ class BertModelTest(unittest.TestCase):
 
         def create_bert_for_pretraining(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels):
             model = BertForPreTraining(config=config)
+            model.eval()
             loss = model(input_ids, token_type_ids, input_mask, token_labels, sequence_labels)
             prediction_scores, seq_relationship_score = model(input_ids, token_type_ids, input_mask)
             outputs = {
@@ -185,6 +189,7 @@ class BertModelTest(unittest.TestCase):
 
         def create_bert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels):
             model = BertForQuestionAnswering(config=config)
+            model.eval()
             loss = model(input_ids, token_type_ids, input_mask, sequence_labels, sequence_labels)
             start_logits, end_logits = model(input_ids, token_type_ids, input_mask)
             outputs = {
@@ -205,6 +210,7 @@ class BertModelTest(unittest.TestCase):
 
         def create_bert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels):
             model = BertForSequenceClassification(config=config, num_labels=self.num_labels)
+            model.eval()
             loss = model(input_ids, token_type_ids, input_mask, sequence_labels)
             logits = model(input_ids, token_type_ids, input_mask)
             outputs = {
@@ -221,6 +227,7 @@ class BertModelTest(unittest.TestCase):
 
         def create_bert_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels):
             model = BertForTokenClassification(config=config, num_labels=self.num_labels)
+            model.eval()
             loss = model(input_ids, token_type_ids, input_mask, token_labels)
             logits = model(input_ids, token_type_ids, input_mask)
             outputs = {
diff --git a/tests/modeling_transfo_xl_test.py b/tests/modeling_transfo_xl_test.py
new file mode 100644
index 0000000000..0bc16daf4c
--- /dev/null
+++ b/tests/modeling_transfo_xl_test.py
@@ -0,0 +1,218 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import unittest
+import json
+import random
+
+import torch
+
+from pytorch_pretrained_bert import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel)
+
+
+class TransfoXLModelTest(unittest.TestCase):
+    class TransfoXLModelTester(object):
+
+        def __init__(self,
+                     parent,
+                     batch_size=13,
+                     seq_length=7,
+                     mem_len=30,
+                     clamp_len=15,
+                     is_training=True,
+                     use_labels=True,
+                     vocab_size=99,
+                     cutoffs=[10, 50, 80],
+                     d_model=32,
+                     d_embed=32,
+                     n_head=4,
+                     d_head=8,
+                     d_inner=128,
+                     div_val=2,
+                     n_layer=5,
+                     scope=None,
+                     seed=1):
+            self.parent = parent
+            self.batch_size = batch_size
+            self.seq_length = seq_length
+            self.mem_len = mem_len
+            self.clamp_len = clamp_len
+            self.is_training = is_training
+            self.use_labels = use_labels
+            self.vocab_size = vocab_size
+            self.cutoffs = cutoffs
+            self.d_model = d_model
+            self.d_embed = d_embed
+            self.n_head = n_head
+            self.d_head = d_head
+            self.d_inner = d_inner
+            self.div_val = div_val
+            self.n_layer = n_layer
+            self.scope = scope
+            self.seed = seed
+
+        def prepare_config_and_inputs(self):
+            input_ids_1 = TransfoXLModelTest.ids_tensor([self.seq_length, self.batch_size], self.vocab_size)
+            input_ids_2 = TransfoXLModelTest.ids_tensor([self.seq_length, self.batch_size], self.vocab_size)
+
+            lm_labels = None
+            if self.use_labels:
+                lm_labels = TransfoXLModelTest.ids_tensor([self.seq_length, self.batch_size], self.vocab_size)
+
+            config = TransfoXLConfig(
+                vocab_size_or_config_json_file=self.vocab_size,
+                mem_len=self.mem_len,
+                clamp_len=self.clamp_len,
+                cutoffs=self.cutoffs,
+                d_model=self.d_model,
+                d_embed=self.d_embed,
+                n_head=self.n_head,
+                d_head=self.d_head,
+                d_inner=self.d_inner,
+                div_val=self.div_val,
+                n_layer=self.n_layer)
+
+            return (config, input_ids_1, input_ids_2, lm_labels)
+
+        def set_seed(self):
+            random.seed(self.seed)
+            torch.manual_seed(self.seed)
+
+        def create_transfo_xl_model(self, config, input_ids_1, input_ids_2, lm_labels):
+            model = TransfoXLModel(config)
+            model.eval()
+
+            hidden_states_1, mems_1 = model(input_ids_1)
+            hidden_states_2, mems_2 = model(input_ids_2, mems_1)
+            outputs = {
+                "hidden_states_1": hidden_states_1,
+                "mems_1": mems_1,
+                "hidden_states_2": hidden_states_2,
+                "mems_2": mems_2,
+            }
+            return outputs
+
+        def check_transfo_xl_model_output(self, result):
+            self.parent.assertListEqual(
+                list(result["hidden_states_1"].size()),
+                [self.seq_length, self.batch_size, self.d_model])
+            self.parent.assertListEqual(
+                list(list(mem.size()) for mem in result["mems_1"]),
+                [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer)
+            self.parent.assertListEqual(
+                list(result["hidden_states_2"].size()),
+                [self.seq_length, self.batch_size, self.d_model])
+            self.parent.assertListEqual(
+                list(list(mem.size()) for mem in result["mems_2"]),
+                [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer)
+
+
+        def create_transfo_xl_lm_head(self, config, input_ids_1, input_ids_2, lm_labels):
+            model = TransfoXLLMHeadModel(config)
+            model.eval()
+
+            loss_1, mems_1a = model(input_ids_1, target=lm_labels)
+            lm_logits_1, mems_1b = model(input_ids_1)
+
+            loss_2, mems_2a = model(input_ids_2, target=lm_labels, mems=mems_1a)
+            lm_logits_2, mems_2b = model(input_ids_2, mems=mems_1b)
+
+            outputs = {
+                "loss_1": loss_1,
+                "mems_1a": mems_1a,
+                "lm_logits_1": lm_logits_1,
+                "mems_1b": mems_1b,
+                "loss_2": loss_2,
+                "mems_2a": mems_2a,
+                "lm_logits_2": lm_logits_2,
+                "mems_2b": mems_2b,
+            }
+            return outputs
+
+        def check_transfo_xl_lm_head_output(self, result):
+            self.parent.assertListEqual(
+                list(result["loss_1"].size()),
+                [self.seq_length, self.batch_size])
+            self.parent.assertListEqual(
+                list(list(mem.size()) for mem in result["mems_1a"]),
+                [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer)
+            self.parent.assertListEqual(
+                list(result["lm_logits_1"].size()),
+                [self.seq_length, self.batch_size, self.vocab_size])
+            self.parent.assertListEqual(
+                list(list(mem.size()) for mem in result["mems_1b"]),
+                [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer)
+            self.parent.assertListEqual(
+                list(mem[~torch.isnan(mem)].sum() for mem in result["mems_1a"]),
+                list(mem[~torch.isnan(mem)].sum() for mem in result["mems_1b"]))
+
+            self.parent.assertListEqual(
+                list(result["loss_2"].size()),
+                [self.seq_length, self.batch_size])
+            self.parent.assertListEqual(
+                list(list(mem.size()) for mem in result["mems_2a"]),
+                [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer)
+            self.parent.assertListEqual(
+                list(result["lm_logits_2"].size()),
+                [self.seq_length, self.batch_size, self.vocab_size])
+            self.parent.assertListEqual(
+                list(list(mem.size()) for mem in result["mems_2b"]),
+                [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer)
+            self.parent.assertListEqual(
+                list(mem[~torch.isnan(mem)].sum() for mem in result["mems_2a"]),
+                list(mem[~torch.isnan(mem)].sum() for mem in result["mems_2b"]))
+
+    def test_default(self):
+        self.run_tester(TransfoXLModelTest.TransfoXLModelTester(self))
+
+    def test_config_to_json_string(self):
+        config = TransfoXLConfig(vocab_size_or_config_json_file=96, d_embed=37)
+        obj = json.loads(config.to_json_string())
+        self.assertEqual(obj["n_token"], 96)
+        self.assertEqual(obj["d_embed"], 37)
+
+    def run_tester(self, tester):
+        config_and_inputs = tester.prepare_config_and_inputs()
+
+        tester.set_seed()
+        output_result = tester.create_transfo_xl_model(*config_and_inputs)
+        tester.check_transfo_xl_model_output(output_result)
+
+        tester.set_seed()
+        output_result = tester.create_transfo_xl_lm_head(*config_and_inputs)
+        tester.check_transfo_xl_lm_head_output(output_result)
+
+    @classmethod
+    def ids_tensor(cls, shape, vocab_size, rng=None, name=None):
+        """Creates a random int32 tensor of the shape within the vocab size."""
+        if rng is None:
+            rng = random.Random()
+
+        total_dims = 1
+        for dim in shape:
+            total_dims *= dim
+
+        values = []
+        for _ in range(total_dims):
+            values.append(rng.randint(0, vocab_size - 1))
+
+        return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous()
+
+
+if __name__ == "__main__":
+    unittest.main()

From 9c3c24800bb8ff28bba032b57565db055718c4b1 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Thu, 7 Feb 2019 17:06:17 +0100
Subject: [PATCH 38/82] split saved model in config & weights

---
 pytorch_pretrained_bert/modeling_openai.py | 54 +++++++++++-----------
 1 file changed, 27 insertions(+), 27 deletions(-)

diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py
index 11325259fb..7e4cd63bba 100644
--- a/pytorch_pretrained_bert/modeling_openai.py
+++ b/pytorch_pretrained_bert/modeling_openai.py
@@ -37,7 +37,9 @@ from .modeling import BertLayerNorm as LayerNorm
 
 logger = logging.getLogger(__name__)
 
-PRETRAINED_MODEL_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt.tar.gz"}
+PRETRAINED_MODEL_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-pytorch_model.bin"}
+PRETRAINED_CONFIG_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-openai_gpt_config.json"}
+
 CONFIG_NAME = "openai_gpt_config.json"
 WEIGHTS_NAME = "pytorch_model.bin"
 
@@ -440,49 +442,42 @@ class OpenAIGPTPreTrainedModel(nn.Module):
         """
         if pretrained_model_name in PRETRAINED_MODEL_ARCHIVE_MAP:
             archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name]
+            config_file = PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path]
         else:
             archive_file = pretrained_model_name
+            config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
         # redirect to the cache, if necessary
         try:
             resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
+            resolved_config_file = cached_path(config_file, cache_dir=cache_dir)
         except EnvironmentError:
             logger.error(
                 "Model name '{}' was not found in model name list ({}). "
-                "We assumed '{}' was a path or url but couldn't find any file "
-                "associated to this path or url.".format(
-                    pretrained_model_name, ", ".join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), archive_file
+                "We assumed '{}' was a path or url but couldn't find files {} and {} "
+                "at this path or url.".format(
+                    pretrained_model_name, ", ".join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), pretrained_model_name_or_path,
+                    archive_file, config_file
                 )
             )
             return None
-        if resolved_archive_file == archive_file:
-            logger.info("loading archive file {}".format(archive_file))
+        if resolved_archive_file == archive_file and resolved_config_file == config_file:
+            logger.info("loading weights file {}".format(archive_file))
+            logger.info("loading configuration file {}".format(config_file))
         else:
-            logger.info("loading archive file {} from cache at {}".format(archive_file, resolved_archive_file))
-        tempdir = None
-        if os.path.isdir(resolved_archive_file):
-            serialization_dir = resolved_archive_file
-        else:
-            # Extract archive to temp dir
-            tempdir = tempfile.mkdtemp()
-            logger.info("extracting archive file {} to temp dir {}".format(resolved_archive_file, tempdir))
-            with tarfile.open(resolved_archive_file, "r:gz") as archive:
-                archive.extractall(tempdir)
-            serialization_dir = tempdir
+            logger.info("loading weights file {} from cache at {}".format(
+                archive_file, resolved_archive_file))
+            logger.info("loading configuration file {} from cache at {}".format(
+                config_file, resolved_config_file))
         # Load config
-        config_file = os.path.join(serialization_dir, CONFIG_NAME)
-        config = OpenAIGPTConfig.from_json_file(config_file)
+        config = OpenAIGPTConfig.from_json_file(resolved_config_file)
         logger.info("Model config {}".format(config))
         # Instantiate model.
         model = cls(config, *inputs, **kwargs)
         if state_dict is None and not from_tf:
-            weights_path = os.path.join(serialization_dir, WEIGHTS_NAME)
-            state_dict = torch.load(weights_path, map_location='cpu' if not torch.cuda.is_available() else None)
-        if tempdir:
-            # Clean up temp dir
-            shutil.rmtree(tempdir)
+            state_dict = torch.load(resolved_archive_file, map_location='cpu' if not torch.cuda.is_available() else None)
         if from_tf:
             # Directly load from a TensorFlow checkpoint (stored as NumPy array)
-            return load_tf_weights_in_openai_gpt(model, serialization_dir)
+            return load_tf_weights_in_openai_gpt(model, resolved_archive_file)
 
         old_keys = []
         new_keys = []
@@ -535,6 +530,7 @@ class OpenAIGPTPreTrainedModel(nn.Module):
             raise RuntimeError(
                 "Error(s) in loading state_dict for {}:\n\t{}".format(model.__class__.__name__, "\n\t".join(error_msgs))
             )
+
         # Add additional embeddings for special tokens if needed
         # This step also make sure we are still sharing the output and input embeddings after loading weights
         model.set_num_special_tokens(num_special_tokens if num_special_tokens is not None else config.n_special)
@@ -711,7 +707,9 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
         self.apply(self.init_weights)
 
     def set_num_special_tokens(self, num_special_tokens):
-        " Update input and output embeddings with new embedding matrice "
+        """ Update input and output embeddings with new embedding matrice
+            Make sure we are sharing the embeddings
+        """
         self.transformer.set_num_special_tokens(num_special_tokens)
         self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight)
 
@@ -792,7 +790,9 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
         self.apply(self.init_weights)
 
     def set_num_special_tokens(self, num_special_tokens):
-        " Update input and output embeddings with new embedding matrice "
+        """ Update input and output embeddings with new embedding matrice
+            Make sure we are sharing the embeddings
+        """
         self.transformer.set_num_special_tokens(num_special_tokens)
         self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight)
 

From d482e3d79d7c4872459efb94f569c03b2a0a99d0 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Thu, 7 Feb 2019 17:06:41 +0100
Subject: [PATCH 39/82] adding examples for openai and transformer-xl

---
 examples/openai_gpt_train.py                  | 344 ++++++++++
 ...{eval_transfo_xl.py => transfo_xl_eval.py} |  28 +-
 examples/transfo_xl_train.py                  | 595 ++++++++++++++++++
 3 files changed, 947 insertions(+), 20 deletions(-)
 create mode 100644 examples/openai_gpt_train.py
 rename examples/{eval_transfo_xl.py => transfo_xl_eval.py} (87%)
 create mode 100644 examples/transfo_xl_train.py

diff --git a/examples/openai_gpt_train.py b/examples/openai_gpt_train.py
new file mode 100644
index 0000000000..7a3dd90988
--- /dev/null
+++ b/examples/openai_gpt_train.py
@@ -0,0 +1,344 @@
+# coding=utf-8
+# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HugginFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" OpenAI GPT model fine-tuning script.
+    Adapted from https://github.com/huggingface/pytorch-openai-transformer-lm/blob/master/train.py
+    Itself adapted from https://github.com/openai/finetune-transformer-lm/blob/master/train.py
+
+    This script with default values fine-tunes and evaluates a pretrained OpenAI GPT on the RocStories dataset
+"""
+import argparse
+import os
+import csv
+import random
+import logging
+from tqdm import tqdm
+
+import numpy as np
+import torch
+from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
+                              TensorDataset)
+from sklearn.metrics import accuracy_score
+from sklearn.utils import shuffle
+
+from pytorch_pretrained_bert import OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer, OpenAIAdam
+
+# from analysis import rocstories as rocstories_analysis
+# from datasets import rocstories
+# from model_pytorch import DoubleHeadModel, load_openai_pretrained_model
+# from opt import OpenAIAdam
+# from text_utils import TextEncoder
+# from utils import (encode_dataset, iter_data,
+#                    ResultLogger, make_path)
+# from loss import MultipleChoiceLossCompute
+
+logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
+                    datefmt = '%m/%d/%Y %H:%M:%S',
+                    level = logging.INFO)
+logger = logging.getLogger(__name__)
+
+def iter_apply(Xs, Ms, Ys):
+    # fns = [lambda x: np.concatenate(x, 0), lambda x: float(np.sum(x))]
+    logits = []
+    cost = 0
+    with torch.no_grad():
+        dh_model.eval()
+        for xmb, mmb, ymb in iter_data(Xs, Ms, Ys, n_batch=n_batch_train, truncate=False, verbose=True):
+            n = len(xmb)
+            XMB = torch.tensor(xmb, dtype=torch.long).to(device)
+            YMB = torch.tensor(ymb, dtype=torch.long).to(device)
+            MMB = torch.tensor(mmb).to(device)
+            _, clf_logits = dh_model(XMB)
+            clf_logits *= n
+            clf_losses = compute_loss_fct(XMB, YMB, MMB, clf_logits, only_return_losses=True)
+            clf_losses *= n
+            logits.append(clf_logits.to("cpu").numpy())
+            cost += clf_losses.sum().item()
+        logits = np.concatenate(logits, 0)
+    return logits, cost
+
+
+def iter_predict(Xs, Ms):
+    logits = []
+    with torch.no_grad():
+        dh_model.eval()
+        for xmb, mmb in iter_data(Xs, Ms, n_batch=n_batch_train, truncate=False, verbose=True):
+            n = len(xmb)
+            XMB = torch.tensor(xmb, dtype=torch.long).to(device)
+            MMB = torch.tensor(mmb).to(device)
+            _, clf_logits = dh_model(XMB)
+            logits.append(clf_logits.to("cpu").numpy())
+    logits = np.concatenate(logits, 0)
+    return logits
+
+
+def log(save_dir, desc):
+    global best_score
+    print("Logging")
+    tr_logits, tr_cost = iter_apply(trX[:n_valid], trM[:n_valid], trY[:n_valid])
+    va_logits, va_cost = iter_apply(vaX, vaM, vaY)
+    tr_cost = tr_cost / len(trY[:n_valid])
+    va_cost = va_cost / n_valid
+    tr_acc = accuracy_score(trY[:n_valid], np.argmax(tr_logits, 1)) * 100.
+    va_acc = accuracy_score(vaY, np.argmax(va_logits, 1)) * 100.
+    logger.log(n_epochs=n_epochs, n_updates=n_updates, tr_cost=tr_cost, va_cost=va_cost, tr_acc=tr_acc, va_acc=va_acc)
+    print('%d %d %.3f %.3f %.2f %.2f' % (n_epochs, n_updates, tr_cost, va_cost, tr_acc, va_acc))
+    if submit:
+        score = va_acc
+        if score > best_score:
+            best_score = score
+            path = os.path.join(save_dir, desc, 'best_params')
+            torch.save(dh_model.state_dict(), make_path(path))
+
+
+def predict(dataset, submission_dir):
+    filename = filenames[dataset]
+    pred_fn = pred_fns[dataset]
+    label_decoder = label_decoders[dataset]
+    predictions = pred_fn(iter_predict(teX, teM))
+    if label_decoder is not None:
+        predictions = [label_decoder[prediction] for prediction in predictions]
+    path = os.path.join(submission_dir, filename)
+    os.makedirs(os.path.dirname(path), exist_ok=True)
+    with open(path, 'w') as f:
+        f.write('{}\t{}\n'.format('index', 'prediction'))
+        for i, prediction in enumerate(predictions):
+            f.write('{}\t{}\n'.format(i, prediction))
+
+
+def run_epoch():
+    for xmb, mmb, ymb in iter_data(*shuffle(trX, trM, trYt, random_state=np.random),
+                                   n_batch=n_batch_train, truncate=True, verbose=True):
+        global n_updates
+        dh_model.train()
+        XMB = torch.tensor(xmb, dtype=torch.long).to(device)
+        YMB = torch.tensor(ymb, dtype=torch.long).to(device)
+        MMB = torch.tensor(mmb).to(device)
+        lm_logits, clf_logits = dh_model(XMB)
+        compute_loss_fct(XMB, YMB, MMB, clf_logits, lm_logits)
+        n_updates += 1
+        if n_updates in [1000, 2000, 4000, 8000, 16000, 32000] and n_epochs == 0:
+            log(save_dir, desc)
+
+
+def accuracy(out, labels):
+    outputs = np.argmax(out, axis=1)
+    return np.sum(outputs == labels)
+
+def load_rocstories_dataset(dataset_path):
+    """ Output a list of tuples(story, 1st continuation, 2nd continuation, label) """
+    with open(dataset_path, encoding='utf_8') as f:
+        f = csv.reader(f)
+        output = []
+        next(f) # skip the first line
+        for line in tqdm(f):
+            output.append((' '.join(line[1:5]), line[5], line[6], int(line[-1])-1))
+    return output
+
+def pre_process_dataset(encoded_dataset, max_len, start_token, delimiter_token, clf_token):
+    n_batch = len(dataset)
+    input_ids = np.zeros((n_batch, 2, max_len), dtype=np.int32)
+    mc_token_mask = np.zeros((n_batch, 2, max_len), dtype=np.int32)
+    lm_labels = np.full((n_batch, 2, max_len), -1, dtype=np.float32)
+    mc_labels = np.zeros((n_batch,), dtype=np.float32)
+    for i, (story, cont1, cont2, mc_label), in enumerate(encoded_dataset):
+        with_cont1 = [start_token] + story[:max_len] + [delimiter_token] + cont1[:max_len] + [clf_token]
+        with_cont2 = [start_token] + story[:max_len] + [delimiter_token] + cont2[:max_len] + [clf_token]
+        xmb[i, 0, :len(with_cont1)] = with_cont1
+        xmb[i, 1, :len(with_cont2)] = with_cont2
+        mc_token_mask[i, 0, len(with_cont1) - 1] = 1
+        lm_labels[i, 0, :len(with_cont1)-1] = with_cont1[1:]
+        lm_labels[i, 1, :len(with_cont2)-1] = with_cont2[1:]
+        mc_labels[i] = mc_label
+    all_inputs = (input_ids, mc_token_mask, lm_labels, mc_labels)
+    all_input_tensors = list(torch.tensor(t) for t in all_inputs)
+    return all_input_tensors
+
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--model_name', type=str, default='openai-gpt',
+                        help='pretrained model name')
+    parser.add_argument('--data_dir', type=str, default='data/')
+    parser.add_argument('--seed', type=int, default=42)
+    parser.add_argument('--num_train_epochs', type=int, default=3)
+    parser.add_argument('--train_batch_size', type=int, default=8)
+    parser.add_argument('--max_grad_norm', type=int, default=1)
+    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
+    parser.add_argument('--warmup_proportion', type=float, default=0.002)
+    parser.add_argument('--max_grad_norm', type=float, default=1)
+    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
+    parser.add_argument('--weight_decay', type=float, default=0.01)
+    parser.add_argument('--lm_coef', type=float, default=0.5)
+    parser.add_argument('--n_valid', type=int, default=374)
+    args = parser.parse_args()
+    print(args)
+
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+    torch.cuda.manual_seed_all(args.seed)
+
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    n_gpu = torch.cuda.device_count()
+    logger.info("device", device, "n_gpu", n_gpu)
+
+    # Load tokenizer and model
+    # These loading functions also add new tokens and embeddings called `special tokens`
+    # These new embeddings will be fine-tuned on the RocStories dataset
+    special_tokens = ['_start_', '_delimiter_', '_classify_']
+    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name, special_tokens=special_tokens)
+    special_tokens_ids = list(tokenizer.convert_tokens_to_ids(token) for token in special_tokens)
+    model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.model_name, num_special_tokens=len(special_tokens))
+
+    # Load the dataset and prepare the inputs
+    logger.info("Encoding dataset...")
+    dataset = load_rocstories_dataset(args.dataset_path)
+    tokenized_dataset = list(list(tokenizer.tokenize(x) for x in instance) for instance in dataset)
+    encoded_dataset = list(list(tokenizer.convert_tokens_to_ids(x) for x in instance) for instance in tokenized_dataset)
+
+    max_input_length = max(len(story)+max(len(cont1), len(cont2))+3 for story, cont1, cont2, _ in encoded_dataset)
+    max_input_length = min(max_input_length, model.config.n_positions)  # Max size of input for the pre-trained model
+    max_sub_part_length = max_input_length // 2 - 2
+
+    # Prepare dataloader
+    dataset_tensors = pre_process_dataset(encoded_dataset, max_sub_part_length, *special_tokens_ids)
+    train_data = TensorDataset(*dataset_tensors)
+    train_sampler = RandomSampler(train_data)
+    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
+
+    # Prepare optimizer: parameters whose name matches `no_decay` get no weight decay
+    param_optimizer = list(model.named_parameters())
+    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
+    optimizer_grouped_parameters = [
+        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
+        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+        ]
+    num_train_optimization_steps = len(train_data) * args.num_train_epochs // args.train_batch_size  # steps over all epochs
+    optimizer = OpenAIAdam(optimizer_grouped_parameters,
+                           lr=args.learning_rate,
+                           warmup=args.warmup_proportion,
+                           max_grad_norm=args.max_grad_norm,
+                           weight_decay=args.weight_decay,
+                           t_total=num_train_optimization_steps)
+
+    if args.do_train:
+        global_step, nb_tr_steps, tr_loss = 0, 0, 0  # global_step counts optimizer updates across all epochs
+        model.train()
+        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
+            tr_loss, nb_tr_examples, nb_tr_steps = 0, 0, 0  # per-epoch running totals
+            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
+                batch = tuple(t.to(device) for t in batch)
+                input_ids, mc_token_mask, lm_labels, mc_labels = batch
+                losses = model(input_ids, mc_token_mask, lm_labels, mc_labels)
+                loss = args.lm_coef * losses[0] + losses[1]  # first loss is scaled by --lm_coef before summing
+                loss.backward()
+                optimizer.step()  # apply the update computed by backward()
+                optimizer.zero_grad()  # clear gradients so they do not accumulate across batches
+                tr_loss += loss.item()
+                nb_tr_examples += input_ids.size(0)
+                nb_tr_steps += 1
+                global_step += 1
+
+    # Save a trained model
+    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
+    output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
+    if args.do_train:
+        torch.save(model_to_save.state_dict(), output_model_file)
+
+    # Load a trained model that you have fine-tuned
+    model_state_dict = torch.load(output_model_file)
+    model = OpenAIGPTDoubleHeadsModel(args.mode, state_dict=model_state_dict, num_labels=num_labels)
+    model.to(device)
+
+    if args.do_eval:
+        eval_examples = processor.get_dev_examples(args.data_dir)
+        eval_features = convert_examples_to_features(
+            eval_examples, label_list, args.max_seq_length, tokenizer)
+        logger.info("***** Running evaluation *****")
+        logger.info("  Num examples = %d", len(eval_examples))
+        logger.info("  Batch size = %d", args.eval_batch_size)
+        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
+        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
+        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
+        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
+        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
+        # Run prediction for full data
+        eval_sampler = SequentialSampler(eval_data)
+        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
+
+        model.eval()
+        eval_loss, eval_accuracy = 0, 0
+        nb_eval_steps, nb_eval_examples = 0, 0
+ 
+        for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
+            input_ids = input_ids.to(device)
+            input_mask = input_mask.to(device)
+            segment_ids = segment_ids.to(device)
+            label_ids = label_ids.to(device)
+
+            with torch.no_grad():
+                tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids)
+                logits = model(input_ids, segment_ids, input_mask)
+
+            logits = logits.detach().cpu().numpy()
+            label_ids = label_ids.to('cpu').numpy()
+            tmp_eval_accuracy = accuracy(logits, label_ids)
+
+            eval_loss += tmp_eval_loss.mean().item()
+            eval_accuracy += tmp_eval_accuracy
+
+            nb_eval_examples += input_ids.size(0)
+            nb_eval_steps += 1
+
+        eval_loss = eval_loss / nb_eval_steps
+        eval_accuracy = eval_accuracy / nb_eval_examples
+        loss = tr_loss/nb_tr_steps if args.do_train else None
+        result = {'eval_loss': eval_loss,
+                  'eval_accuracy': eval_accuracy,
+                  'global_step': global_step,
+                  'loss': loss}
+
+        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
+        with open(output_eval_file, "w") as writer:
+            logger.info("***** Eval results *****")
+            for key in sorted(result.keys()):
+                logger.info("  %s = %s", key, str(result[key]))
+                writer.write("%s = %s\n" % (key, str(result[key])))
+
+if __name__ == "__main__":
+    main()
+
+    n_updates = 0
+    n_epochs = 0
+    if dataset != 'stsb':
+        trYt = trY
+    if submit:
+        path = os.path.join(save_dir, desc, 'best_params')
+        torch.save(dh_model.state_dict(), make_path(path))
+    best_score = 0
+    for i in range(args.n_iter):
+        print("running epoch", i)
+        run_epoch()
+        n_epochs += 1
+        log(save_dir, desc)
+    if submit:
+        path = os.path.join(save_dir, desc, 'best_params')
+        dh_model.load_state_dict(torch.load(path))
+        predict(dataset, args.submission_dir)
+        if args.analysis:
+            rocstories_analysis(data_dir, os.path.join(args.submission_dir, 'ROCStories.tsv'),
+                                os.path.join(log_dir, 'rocstories.jsonl'))
diff --git a/examples/eval_transfo_xl.py b/examples/transfo_xl_eval.py
similarity index 87%
rename from examples/eval_transfo_xl.py
rename to examples/transfo_xl_eval.py
index 9a0975f186..4f3606a97e 100644
--- a/examples/eval_transfo_xl.py
+++ b/examples/transfo_xl_eval.py
@@ -16,17 +16,15 @@
 """ PyTorch Transformer XL model evaluation script.
     Adapted from https://github.com/kimiyoung/transformer-xl.
     In particular https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/eval.py
+
+    This script with default values evaluates a pretrained Transformer-XL on WikiText 103
 """
 from __future__ import absolute_import, division, print_function, unicode_literals
 
-import os
-import functools
 import argparse
 import logging
 import time
 import math
-import sys
-from io import open
 
 import torch
 
@@ -39,10 +37,7 @@ logger = logging.getLogger(__name__)
 
 
 parser = argparse.ArgumentParser(description='PyTorch Transformer Language Model')
-# parser.add_argument('--data', type=str, default='../data/wikitext-103',
-#                     help='location of the data corpus')
 parser.add_argument('--model_name', type=str, default='transfo-xl-wt103',
-                    # choices=['transfo-xl-wt103'], #, 'lm1b', 'enwik8', 'text8'],
                     help='pretrained model name')
 parser.add_argument('--split', type=str, default='test',
                     choices=['all', 'valid', 'test'],
@@ -70,11 +65,11 @@ assert args.ext_len >= 0, 'extended context length must be non-negative'
 
 device = torch.device("cuda" if args.cuda else "cpu")
 
-# Get logger
-# logging = get_logger(os.path.join(args.work_dir, 'log.txt'),
-#                      log_=not args.no_log)
-
-# Load dataset
+# Load a pre-processed dataset
+# You can also build the corpus yourself using TransfoXLCorpus methods
+# The pre-processing involves computing word frequencies to prepare the Adaptive input and SoftMax
+# and tokenizing the dataset
+# The pre-processed corpus is a conversion (using the conversion script)
 corpus = TransfoXLCorpus.from_pretrained(args.model_name)
 ntokens = len(corpus.vocab)
 
@@ -83,10 +78,7 @@ va_iter = corpus.get_iterator('valid', args.batch_size, args.tgt_len,
 te_iter = corpus.get_iterator('test', args.batch_size, args.tgt_len,
     device=device, ext_len=args.ext_len)
 
-# Load the best saved model.
-# with open(os.path.join(args.work_dir, 'model.pt'), 'rb') as f:
-#     model = torch.load(f)
-# model.backward_compatible()
+# Load a pre-trained model
 model = TransfoXLModel.from_pretrained(args.model_name)
 model = model.to(device)
 
@@ -132,10 +124,6 @@ elif args.split == 'test':
     valid_loss = None
 
 def format_log(loss, split):
-    # if args.dataset in ['enwik8', 'text8']:
-    #     log_str = '| {0} loss {1:5.2f} | {0} bpc {2:9.5f} '.format(
-    #         split, loss, loss / math.log(2))
-    # else:
     log_str = '| {0} loss {1:5.2f} | {0} ppl {2:9.3f} '.format(
         split, loss, math.exp(loss))
     return log_str
diff --git a/examples/transfo_xl_train.py b/examples/transfo_xl_train.py
new file mode 100644
index 0000000000..09d30aed28
--- /dev/null
+++ b/examples/transfo_xl_train.py
@@ -0,0 +1,595 @@
+# coding=utf-8
+# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HugginFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch Transformer XL model training script.
+    Adapted from https://github.com/kimiyoung/transformer-xl.
+    In particular https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/train.py
+
+    This script with default values trains a Transformer-XL on WikiText 103
+"""
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import os
+import functools
+import argparse
+import logging
+import time
+import math
+import sys
+from io import open
+import itertools
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.optim as optim
+
+from pytorch_pretrained_bert import TransfoXLModel, TransfoXLConfig
+from pytorch_pretrained_bert.tokenization_transfo_xl import get_lm_corpus
+
+logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
+                    datefmt = '%m/%d/%Y %H:%M:%S',
+                    level = logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+parser = argparse.ArgumentParser(description='PyTorch Transformer Language Model')
+parser.add_argument('--data', type=str, default='../data/wikitext-103',
+                    help='location of the data corpus')
+parser.add_argument('--dataset', type=str, default='wt103',
+                    choices=['wt103', 'lm1b', 'enwik8', 'text8'],
+                    help='dataset name')
+parser.add_argument('--n_layer', type=int, default=12,
+                    help='number of total layers')
+parser.add_argument('--n_head', type=int, default=10,
+                    help='number of heads')
+parser.add_argument('--d_head', type=int, default=50,
+                    help='head dimension')
+parser.add_argument('--d_embed', type=int, default=-1,
+                    help='embedding dimension')
+parser.add_argument('--d_model', type=int, default=500,
+                    help='model dimension')
+parser.add_argument('--d_inner', type=int, default=1000,
+                    help='inner dimension in FF')
+parser.add_argument('--dropout', type=float, default=0.0,
+                    help='global dropout rate')
+parser.add_argument('--dropatt', type=float, default=0.0,
+                    help='attention probability dropout rate')
+parser.add_argument('--init', default='normal', type=str,
+                    help='parameter initializer to use.')
+parser.add_argument('--emb_init', default='normal', type=str,
+                    help='parameter initializer to use.')
+parser.add_argument('--init_range', type=float, default=0.1,
+                    help='parameters initialized by U(-init_range, init_range)')
+parser.add_argument('--emb_init_range', type=float, default=0.01,
+                    help='parameters initialized by U(-init_range, init_range)')
+parser.add_argument('--init_std', type=float, default=0.02,
+                    help='parameters initialized by N(0, init_std)')
+parser.add_argument('--proj_init_std', type=float, default=0.01,
+                    help='parameters initialized by N(0, init_std)')
+parser.add_argument('--optim', default='adam', type=str,
+                    choices=['adam', 'sgd', 'adagrad'],
+                    help='optimizer to use.')
+parser.add_argument('--lr', type=float, default=0.00025,
+                    help='initial learning rate (0.00025|5 for adam|sgd)')
+parser.add_argument('--mom', type=float, default=0.0,
+                    help='momentum for sgd')
+parser.add_argument('--scheduler', default='cosine', type=str,
+                    choices=['cosine', 'inv_sqrt', 'dev_perf', 'constant'],
+                    help='lr scheduler to use.')
+parser.add_argument('--warmup_step', type=int, default=0,
+                    help='number of learning rate warmup steps')
+parser.add_argument('--decay_rate', type=float, default=0.5,
+                    help='decay factor when ReduceLROnPlateau is used')
+parser.add_argument('--lr_min', type=float, default=0.0,
+                    help='minimum learning rate during annealing')
+parser.add_argument('--clip', type=float, default=0.25,
+                    help='gradient clipping')
+parser.add_argument('--clip_nonemb', action='store_true',
+                    help='only clip the gradient of non-embedding params')
+parser.add_argument('--max_step', type=int, default=100000,
+                    help='upper epoch limit')
+parser.add_argument('--batch_size', type=int, default=60,
+                    help='batch size')
+parser.add_argument('--batch_chunk', type=int, default=1,
+                    help='split batch into chunks to save memory')
+parser.add_argument('--tgt_len', type=int, default=70,
+                    help='number of tokens to predict')
+parser.add_argument('--eval_tgt_len', type=int, default=50,
+                    help='number of tokens to predict for evaluation')
+parser.add_argument('--ext_len', type=int, default=0,
+                    help='length of the extended context')
+parser.add_argument('--mem_len', type=int, default=0,
+                    help='length of the retained previous heads')
+parser.add_argument('--not_tied', action='store_true',
+                    help='do not tie the word embedding and softmax weights')
+parser.add_argument('--seed', type=int, default=1111,
+                    help='random seed')
+parser.add_argument('--cuda', action='store_true',
+                    help='use CUDA')
+parser.add_argument('--adaptive', action='store_true',
+                    help='use adaptive softmax')
+parser.add_argument('--div_val', type=int, default=1,
+                    help='divident value for adapative input and softmax')
+parser.add_argument('--pre_lnorm', action='store_true',
+                    help='apply LayerNorm to the input instead of the output')
+parser.add_argument('--varlen', action='store_true',
+                    help='use variable length')
+parser.add_argument('--multi_gpu', action='store_true',
+                    help='use multiple GPU')
+parser.add_argument('--log-interval', type=int, default=200,
+                    help='report interval')
+parser.add_argument('--eval-interval', type=int, default=4000,
+                    help='evaluation interval')
+parser.add_argument('--work_dir', default='LM-TFM', type=str,
+                    help='experiment directory.')
+parser.add_argument('--restart', action='store_true',
+                    help='restart training from the saved checkpoint')
+parser.add_argument('--restart_dir', type=str, default='',
+                    help='restart dir')
+parser.add_argument('--debug', action='store_true',
+                    help='run in debug mode (do not create exp dir)')
+parser.add_argument('--same_length', action='store_true',
+                    help='use the same attn length for all tokens')
+parser.add_argument('--attn_type', type=int, default=0,
+                    help='attention type. 0 for ours, 1 for Shaw et al,'
+                    '2 for Vaswani et al, 3 for Al Rfou et al.')
+parser.add_argument('--clamp_len', type=int, default=-1,
+                    help='use the same pos embeddings after clamp_len')
+parser.add_argument('--eta_min', type=float, default=0.0,
+                    help='min learning rate for cosine scheduler')
+parser.add_argument('--gpu0_bsz', type=int, default=-1,
+                    help='batch size on gpu 0')
+parser.add_argument('--max_eval_steps', type=int, default=-1,
+                    help='max eval steps')
+parser.add_argument('--sample_softmax', type=int, default=-1,
+                    help='number of samples in sampled softmax')
+parser.add_argument('--patience', type=int, default=0,
+                    help='patience')
+parser.add_argument('--finetune_v2', action='store_true',
+                    help='finetune v2')
+parser.add_argument('--finetune_v3', action='store_true',
+                    help='finetune v3')
+parser.add_argument('--fp16', action='store_true',
+                    help='Run in pseudo-fp16 mode (fp16 storage fp32 math).')
+parser.add_argument('--static-loss-scale', type=float, default=1,
+                    help='Static loss scale, positive power of 2 values can '
+                    'improve fp16 convergence.')
+parser.add_argument('--dynamic-loss-scale', action='store_true',
+                    help='Use dynamic loss scaling.  If supplied, this argument'
+                    ' supersedes --static-loss-scale.')
+args = parser.parse_args()
+args.tied = not args.not_tied
+
+if args.d_embed < 0:
+    args.d_embed = args.d_model
+
+assert args.ext_len >= 0, 'extended context length must be non-negative'
+assert args.batch_size % args.batch_chunk == 0
+
+args.work_dir = '{}-{}'.format(args.work_dir, args.dataset)
+args.work_dir = os.path.join(args.work_dir, time.strftime('%Y%m%d-%H%M%S'))
+# logging = create_exp_dir(args.work_dir,
+#     scripts_to_save=['train.py', 'mem_transformer.py'], debug=args.debug)
+
+# Set the random seed manually for reproducibility.
+np.random.seed(args.seed)
+torch.manual_seed(args.seed)
+if torch.cuda.is_available():
+    if not args.cuda:
+        print('WARNING: You have a CUDA device, so you should probably run with --cuda')
+    else:
+        torch.cuda.manual_seed_all(args.seed)
+
+# Validate `--fp16` option
+if args.fp16:
+    if not args.cuda:
+        print('WARNING: --fp16 requires --cuda, ignoring --fp16 option')
+        args.fp16 = False
+    else:
+        try:
+            from apex.fp16_utils import FP16_Optimizer
+        except ImportError:
+            print('WARNING: apex not installed, ignoring --fp16 option')
+            args.fp16 = False
+
+device = torch.device('cuda' if args.cuda else 'cpu')
+
+###############################################################################
+# Load data
+###############################################################################
+corpus = get_lm_corpus(args.data, args.dataset)
+ntokens = len(corpus.vocab)
+args.n_token = ntokens
+
+eval_batch_size = 10
+tr_iter = corpus.get_iterator('train', args.batch_size, args.tgt_len,
+    device=device, ext_len=args.ext_len)
+va_iter = corpus.get_iterator('valid', eval_batch_size, args.eval_tgt_len,
+    device=device, ext_len=args.ext_len)
+te_iter = corpus.get_iterator('test', eval_batch_size, args.eval_tgt_len,
+    device=device, ext_len=args.ext_len)
+
+# adaptive softmax / embedding; both names are read when the config is built, even without --adaptive
+cutoffs, proj_share_all_but_first = [], True  # NOTE(review): default presumably moot with no cutoffs - confirm
+if args.adaptive:
+    assert args.dataset in ['wt103', 'lm1b']
+    if args.dataset == 'wt103':
+        cutoffs = [20000, 40000, 200000]
+        proj_share_all_but_first = True
+    elif args.dataset == 'lm1b':
+        cutoffs = [60000, 100000, 640000]
+        proj_share_all_but_first = False
+
+###############################################################################
+# Build the model
+###############################################################################
+def init_weight(weight):  # in-place init selected by --init; any other --init value leaves `weight` untouched
+    if args.init == 'uniform':
+        nn.init.uniform_(weight, -args.init_range, args.init_range)
+    elif args.init == 'normal':
+        nn.init.normal_(weight, 0.0, args.init_std)
+
+def init_bias(bias):  # fill `bias` with zeros in place
+    nn.init.constant_(bias, 0.0)
+
+def weights_init(m):  # per-module initialiser passed to model.apply(); dispatches on the module's class name
+    classname = m.__class__.__name__
+    if classname.find('Linear') != -1:
+        if hasattr(m, 'weight') and m.weight is not None:
+            init_weight(m.weight)
+        if hasattr(m, 'bias') and m.bias is not None:
+            init_bias(m.bias)
+    elif classname.find('AdaptiveEmbedding') != -1:  # tested before 'Embedding', which is a substring of it
+        if hasattr(m, 'emb_projs'):
+            for i in range(len(m.emb_projs)):
+                if m.emb_projs[i] is not None:
+                    nn.init.normal_(m.emb_projs[i], 0.0, args.proj_init_std)
+    elif classname.find('Embedding') != -1:
+        if hasattr(m, 'weight'):
+            init_weight(m.weight)
+    elif classname.find('ProjectedAdaptiveLogSoftmax') != -1:
+        if hasattr(m, 'cluster_weight') and m.cluster_weight is not None:
+            init_weight(m.cluster_weight)
+        if hasattr(m, 'cluster_bias') and m.cluster_bias is not None:
+            init_bias(m.cluster_bias)
+        if hasattr(m, 'out_projs'):
+            for i in range(len(m.out_projs)):
+                if m.out_projs[i] is not None:
+                    nn.init.normal_(m.out_projs[i], 0.0, args.proj_init_std)
+    elif classname.find('LayerNorm') != -1:
+        if hasattr(m, 'weight'):
+            nn.init.normal_(m.weight, 1.0, args.init_std)  # weight drawn around 1.0 with std --init_std
+        if hasattr(m, 'bias') and m.bias is not None:
+            init_bias(m.bias)
+    elif classname.find('TransformerLM') != -1:  # NOTE(review): this script builds a TransfoXLModel - confirm this branch can match
+        if hasattr(m, 'r_emb'):
+            init_weight(m.r_emb)
+        if hasattr(m, 'r_w_bias'):
+            init_weight(m.r_w_bias)
+        if hasattr(m, 'r_r_bias'):
+            init_weight(m.r_r_bias)
+        if hasattr(m, 'r_bias'):
+            init_bias(m.r_bias)
+
+def update_dropout(m):  # set `p` to --dropout on modules whose class name contains 'Dropout'
+    classname = m.__class__.__name__
+    if classname.find('Dropout') != -1:
+        if hasattr(m, 'p'):
+            m.p = args.dropout
+
+def update_dropatt(m):  # set `dropatt.p` to --dropatt on modules that expose a `dropatt` attribute
+    if hasattr(m, 'dropatt'):
+        m.dropatt.p = args.dropatt
+
+if args.restart:
+    with open(os.path.join(args.restart_dir, 'model.pt'), 'rb') as f:
+        model = torch.load(f)
+    if not args.fp16:
+        model = model.float()
+    model.apply(update_dropout)
+    model.apply(update_dropatt)
+else:
+    config = TransfoXLConfig(ntokens, n_layer=args.n_layer, n_head=args.n_head,
+        d_model=args.d_model, d_head=args.d_head, d_inner=args.d_inner,
+        dropout=args.dropout, dropatt=args.dropatt,
+        tie_weight=args.tied, d_embed=args.d_embed, div_val=args.div_val,
+        proj_share_all_but_first=proj_share_all_but_first,
+        pre_lnorm=args.pre_lnorm, tgt_len=args.tgt_len,
+        ext_len=args.ext_len, mem_len=args.mem_len, cutoffs=cutoffs,
+        same_length=args.same_length, attn_type=args.attn_type,
+        clamp_len=args.clamp_len, sample_softmax=args.sample_softmax)
+    model = TransfoXLModel(config)
+    model.apply(weights_init)
+    model.word_emb.apply(weights_init) # ensure embedding init is not overridden by out_layer in case of weight sharing
+args.n_all_param = sum([p.nelement() for p in model.parameters()])
+args.n_nonemb_param = sum([p.nelement() for p in model.layers.parameters()])
+
+if args.fp16:
+    model = model.half()
+
+if args.multi_gpu:
+    model = model.to(device)
+    if args.gpu0_bsz >= 0:
+        raise NotImplementedError
+        # para_model = BalancedDataParallel(args.gpu0_bsz // args.batch_chunk,
+        #                                   model, dim=1).to(device)
+    else:
+        para_model = nn.DataParallel(model, dim=1).to(device)
+else:
+    para_model = model.to(device)
+
+#### optimizer
+if args.optim.lower() == 'sgd':
+    if args.sample_softmax > 0:
+        dense_params, sparse_params = [], []
+        for param in model.parameters():
+            if param.size() == model.word_emb.weight.size():
+                sparse_params.append(param)
+            else:
+                dense_params.append(param)
+        optimizer_sparse = optim.SGD(sparse_params, lr=args.lr * 2)
+        optimizer = optim.SGD(dense_params, lr=args.lr, momentum=args.mom)
+    else:
+        optimizer = optim.SGD(model.parameters(), lr=args.lr,
+            momentum=args.mom)
+elif args.optim.lower() == 'adam':
+    if args.sample_softmax > 0:
+        dense_params, sparse_params = [], []
+        for param in model.parameters():
+            if param.size() == model.word_emb.weight.size():
+                sparse_params.append(param)
+            else:
+                dense_params.append(param)
+        optimizer_sparse = optim.SparseAdam(sparse_params, lr=args.lr)
+        optimizer = optim.Adam(dense_params, lr=args.lr)
+    else:
+        optimizer = optim.Adam(model.parameters(), lr=args.lr)
+elif args.optim.lower() == 'adagrad':
+    optimizer = optim.Adagrad(model.parameters(), lr=args.lr)
+
+#### scheduler
+if args.scheduler == 'cosine':
+    # here we do not set eta_min to lr_min to be backward compatible
+    # because in previous versions eta_min is default to 0
+    # rather than the default value of lr_min 1e-6
+    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
+        args.max_step, eta_min=args.eta_min) # should use eta_min arg
+    if args.sample_softmax > 0:
+        scheduler_sparse = optim.lr_scheduler.CosineAnnealingLR(optimizer_sparse,
+            args.max_step, eta_min=args.eta_min) # should use eta_min arg
+elif args.scheduler == 'inv_sqrt':
+    # originally used for Transformer (in Attention is all you need)
+    def lr_lambda(step):
+        # return a multiplier instead of a learning rate
+        if step == 0 and args.warmup_step == 0:
+            return 1.
+        else:
+            return 1. / (step ** 0.5) if step > args.warmup_step \
+                   else step / (args.warmup_step ** 1.5)
+    scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_lambda)
+elif args.scheduler == 'dev_perf':
+    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
+        factor=args.decay_rate, patience=args.patience, min_lr=args.lr_min)
+    if args.sample_softmax > 0:
+        scheduler_sparse = optim.lr_scheduler.ReduceLROnPlateau(optimizer_sparse,
+            factor=args.decay_rate, patience=args.patience, min_lr=args.lr_min)
+elif args.scheduler == 'constant':
+    pass
+
+if args.cuda and args.fp16:
+    # If args.dynamic_loss_scale is False, static_loss_scale will be used.
+    # If args.dynamic_loss_scale is True, it will take precedence over static_loss_scale.
+    optimizer = FP16_Optimizer(optimizer,
+                               static_loss_scale = args.static_loss_scale,
+                               dynamic_loss_scale = args.dynamic_loss_scale,
+                               dynamic_loss_args = {'init_scale': 2 ** 16})
+
+if args.restart:
+    if os.path.exists(os.path.join(args.restart_dir, 'optimizer.pt')):
+        with open(os.path.join(args.restart_dir, 'optimizer.pt'), 'rb') as f:
+            opt_state_dict = torch.load(f)
+            optimizer.load_state_dict(opt_state_dict)
+    else:
+        print('Optimizer was not saved. Start from scratch.')
+
+logger.info('=' * 100)
+for k, v in args.__dict__.items():
+    logger.info('    - {} : {}'.format(k, v))
+logger.info('=' * 100)
+logger.info('#params = {}'.format(args.n_all_param))
+logger.info('#non emb params = {}'.format(args.n_nonemb_param))
+
+###############################################################################
+# Training code
+###############################################################################
+
+def evaluate(eval_iter):
+    """Return the seq_len-weighted mean loss over `eval_iter`; the model is put back in training mode."""
+    model.eval()  # evaluation mode, which disables dropout
+
+    # If the model does not use memory at all, make the ext_len longer.
+    # Otherwise, make the mem_len longer and keep the ext_len the same.
+    if args.mem_len == 0:
+        model.reset_length(args.eval_tgt_len,
+            args.ext_len+args.tgt_len-args.eval_tgt_len, args.mem_len)
+    else:
+        model.reset_length(args.eval_tgt_len,
+            args.ext_len, args.mem_len+args.tgt_len-args.eval_tgt_len)
+
+    # Accumulate each batch's mean loss, weighted by that batch's seq_len
+    total_len, total_loss = 0, 0.
+    with torch.no_grad():
+        mems = tuple()  # nothing to feed back before the first batch
+        for i, (data, target, seq_len) in enumerate(eval_iter):
+            if args.max_eval_steps > 0 and i >= args.max_eval_steps:  # optional cap on evaluated batches
+                break
+            ret = model(data, target, *mems)
+            loss, mems = ret[0], ret[1:]  # first output is the loss; the rest is passed back in as mems
+            loss = loss.mean()
+            total_loss += seq_len * loss.float().item()
+            total_len += seq_len
+
+    # Restore the training lengths and switch back to the training mode
+    model.reset_length(args.tgt_len, args.ext_len, args.mem_len)
+    model.train()
+
+    return total_loss / total_len  # raises ZeroDivisionError when eval_iter yielded no batch
+
+
+def train():  # One pass over the training iterator; stops early once train_step reaches args.max_step.
+    # Counters and timers are module-level globals so they carry over between calls.
+    global train_step, train_loss, best_val_loss, eval_start_time, log_start_time
+    model.train()  # Turn on training mode which enables dropout.
+    if args.batch_chunk > 1:
+        mems = [tuple() for _ in range(args.batch_chunk)]  # one (initially empty) memory tuple per chunk
+    else:
+        mems = tuple()
+    train_iter = tr_iter.get_varlen_iter() if args.varlen else tr_iter
+    for batch, (data, target, seq_len) in enumerate(train_iter):
+        model.zero_grad()
+        if args.batch_chunk > 1:
+            data_chunks = torch.chunk(data, args.batch_chunk, 1)  # split along dim 1 (presumably the batch dim -- confirm)
+            target_chunks = torch.chunk(target, args.batch_chunk, 1)
+            for i in range(args.batch_chunk):
+                data_i = data_chunks[i].contiguous()
+                target_i = target_chunks[i].contiguous()
+                ret = para_model(data_i, target_i, *mems[i])
+                loss, mems[i] = ret[0], ret[1:]  # first output is the loss, the rest become this chunk's mems
+                loss = loss.float().mean().type_as(loss) / args.batch_chunk  # scale so the per-chunk losses sum to a mean
+                if args.fp16:
+                    optimizer.backward(loss)  # NOTE(review): presumably an fp16 optimizer wrapper -- confirm
+                else:
+                    loss.backward()  # gradients accumulate across chunks until optimizer.step()
+                train_loss += loss.float().item()
+        else:
+            ret = para_model(data, target, *mems)
+            loss, mems = ret[0], ret[1:]  # first output is the loss, the rest are the new mems
+            loss = loss.float().mean().type_as(loss)
+            if args.fp16:
+                optimizer.backward(loss)
+            else:
+                loss.backward()
+            train_loss += loss.float().item()
+
+        if args.fp16:
+            optimizer.clip_master_grads(args.clip)
+        else:
+            torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
+
+        optimizer.step()
+        if args.sample_softmax > 0:
+            optimizer_sparse.step()  # separate optimizer used only when sampled softmax is enabled
+
+        # step-wise learning rate annealing
+        train_step += 1
+        if args.scheduler in ['cosine', 'constant', 'dev_perf']:
+            # linear warmup stage
+            if train_step < args.warmup_step:
+                curr_lr = args.lr * train_step / args.warmup_step
+                optimizer.param_groups[0]['lr'] = curr_lr
+                if args.sample_softmax > 0:
+                    optimizer_sparse.param_groups[0]['lr'] = curr_lr * 2  # sparse optimizer warms up at twice the rate
+            else:
+                if args.scheduler == 'cosine':  # 'constant' needs no step; 'dev_perf' is stepped after evaluation below
+                    scheduler.step(train_step)
+                    if args.sample_softmax > 0:
+                        scheduler_sparse.step(train_step)
+        elif args.scheduler == 'inv_sqrt':
+            scheduler.step(train_step)
+
+        if train_step % args.log_interval == 0:
+            cur_loss = train_loss / args.log_interval  # mean loss per step since the last log line
+            elapsed = time.time() - log_start_time
+            log_str = '| epoch {:3d} step {:>8d} | {:>6d} batches | lr {:.3g} ' \
+                      '| ms/batch {:5.2f} | loss {:5.2f}'.format(
+                epoch, train_step, batch+1, optimizer.param_groups[0]['lr'],
+                elapsed * 1000 / args.log_interval, cur_loss)
+            if args.dataset in ['enwik8', 'text8']:
+                log_str += ' | bpc {:9.5f}'.format(cur_loss / math.log(2))  # divide by ln(2) to report base-2 units
+            else:
+                log_str += ' | ppl {:9.3f}'.format(math.exp(cur_loss))
+            logger.info(log_str)
+            train_loss = 0  # restart the running sum for the next logging window
+            log_start_time = time.time()
+
+        if train_step % args.eval_interval == 0:
+            val_loss = evaluate(va_iter)
+            logger.info('-' * 100)
+            log_str = '| Eval {:3d} at step {:>8d} | time: {:5.2f}s ' \
+                      '| valid loss {:5.2f}'.format(
+                train_step // args.eval_interval, train_step,
+                (time.time() - eval_start_time), val_loss)
+            if args.dataset in ['enwik8', 'text8']:
+                log_str += ' | bpc {:9.5f}'.format(val_loss / math.log(2))
+            else:
+                log_str += ' | valid ppl {:9.3f}'.format(math.exp(val_loss))
+            logger.info(log_str)
+            logger.info('-' * 100)
+            # Save the model if the validation loss is the best we've seen so far.
+            if not best_val_loss or val_loss < best_val_loss:  # `not best_val_loss` covers the initial None
+                if not args.debug:
+                    with open(os.path.join(args.work_dir, 'model.pt'), 'wb') as f:
+                        torch.save(model, f)  # saves the whole module object, not just a state_dict
+                    with open(os.path.join(args.work_dir, 'optimizer.pt'), 'wb') as f:
+                        torch.save(optimizer.state_dict(), f)
+                best_val_loss = val_loss
+
+            # dev-performance based learning rate annealing
+            if args.scheduler == 'dev_perf':
+                scheduler.step(val_loss)
+                if args.sample_softmax > 0:
+                    scheduler_sparse.step(val_loss)
+
+            eval_start_time = time.time()
+
+        if train_step == args.max_step:  # the epoch loop at module level performs the same check to stop
+            break
+
+# Training state shared with train() through its `global` declaration; the epoch loop follows below.
+train_step = 0
+train_loss = 0
+best_val_loss = None
+
+log_start_time = time.time()
+eval_start_time = time.time()
+
+# At any point you can hit Ctrl + C to break out of training early.
+try:
+    for epoch in itertools.count(start=1):  # unbounded: ends only via args.max_step or Ctrl + C
+        train()  # reads the module-level `epoch` when formatting its log lines
+        if train_step == args.max_step:
+            logger.info('-' * 100)
+            logger.info('End of training')
+            break
+except KeyboardInterrupt:
+    logger.info('-' * 100)
+    logger.info('Exiting from training early')
+
+# Load the best saved model.
+with open(os.path.join(args.work_dir, 'model.pt'), 'rb') as f:  # NOTE(review): train() only writes this when not args.debug
+    model = torch.load(f)
+para_model = model.to(device)
+
+# Run on test data.
+test_loss = evaluate(te_iter)
+logger.info('=' * 100)
+if args.dataset in ['enwik8', 'text8']:
+    logger.info('| End of training | test loss {:5.2f} | test bpc {:9.5f}'.format(
+        test_loss, test_loss / math.log(2)))  # divide by ln(2) to report base-2 units
+else:
+    logger.info('| End of training | test loss {:5.2f} | test ppl {:9.3f}'.format(
+        test_loss, math.exp(test_loss)))
+logger.info('=' * 100)

From c306869ea2cfeebadd64779408ef7a28132779c9 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Thu, 7 Feb 2019 17:07:03 +0100
Subject: [PATCH 40/82] add two transformer xl models

---
 pytorch_pretrained_bert/__init__.py           |   2 +-
 ...onvert_transfo_xl_checkpoint_to_pytorch.py |  10 +-
 .../modeling_transfo_xl.py                    | 226 +++++++++++++-----
 3 files changed, 174 insertions(+), 64 deletions(-)

diff --git a/pytorch_pretrained_bert/__init__.py b/pytorch_pretrained_bert/__init__.py
index e4b9c1a116..761af86b6d 100644
--- a/pytorch_pretrained_bert/__init__.py
+++ b/pytorch_pretrained_bert/__init__.py
@@ -11,7 +11,7 @@ from .modeling import (BertConfig, BertModel, BertForPreTraining,
 from .modeling_openai import (OpenAIGPTConfig, OpenAIGPTModel,
                               OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel,
                               load_tf_weights_in_openai_gpt)
-from .modeling_transfo_xl import (TransfoXLConfig, TransfoXLModel,
+from .modeling_transfo_xl import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel,
                                   load_tf_weights_in_transfo_xl)
 
 from .optimization import BertAdam
diff --git a/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py b/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py
index dedea33435..dae1248f71 100755
--- a/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py
+++ b/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py
@@ -27,7 +27,7 @@ import pytorch_pretrained_bert.tokenization_transfo_xl as data_utils
 from pytorch_pretrained_bert.modeling_transfo_xl import (CONFIG_NAME,
                                                          WEIGHTS_NAME,
                                                          TransfoXLConfig,
-                                                         TransfoXLModel,
+                                                         TransfoXLLMHeadModel,
                                                          load_tf_weights_in_transfo_xl)
 from pytorch_pretrained_bert.tokenization_transfo_xl import (CORPUS_NAME,
                                                              VOCAB_NAME)
@@ -37,7 +37,7 @@ if sys.version_info[0] == 2:
 else:
     import pickle
 
-# We do this to be able to load the python 2 datasets pickles
+# We do this to be able to load python 2 datasets pickles
 # See e.g. https://stackoverflow.com/questions/2121874/python-pickling-after-changing-a-modules-directory/2121918#2121918
 data_utils.Vocab = data_utils.TransfoXLTokenizer
 data_utils.Corpus = data_utils.TransfoXLCorpus
@@ -49,6 +49,7 @@ def convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path,
                                              pytorch_dump_folder_path,
                                              transfo_xl_dataset_file):
     if transfo_xl_dataset_file:
+        # Convert a pre-processed corpus (see original TensorFlow repo)
         with open(transfo_xl_dataset_file, "rb") as fp:
             corpus = pickle.load(fp, encoding="latin1")
         # Save vocabulary and dataset cache as Dictionaries (should be better than pickles for the long-term)
@@ -64,18 +65,18 @@ def convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path,
         torch.save(corpus_dict_no_vocab, pytorch_dataset_dump_path)
 
     if tf_checkpoint_path:
+        # Convert a pre-trained TensorFlow model
         config_path = os.path.abspath(transfo_xl_config_file)
         tf_path = os.path.abspath(tf_checkpoint_path)
 
         print("Converting Transformer XL checkpoint from {} with config at {}".format(tf_path, config_path))
         # Initialise PyTorch model
-        # Construct model
         if transfo_xl_config_file == "":
             config = TransfoXLConfig()
         else:
             config = TransfoXLConfig(transfo_xl_config_file)
         print("Building PyTorch model from configuration: {}".format(str(config)))
-        model = TransfoXLModel(config)
+        model = TransfoXLLMHeadModel(config)
 
         model = load_tf_weights_in_transfo_xl(model, config, tf_path)
         # Save pytorch-model
@@ -90,7 +91,6 @@ def convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path,
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    ## Required parameters
     parser.add_argument("--pytorch_dump_folder_path",
                         default = None,
                         type = str,
diff --git a/pytorch_pretrained_bert/modeling_transfo_xl.py b/pytorch_pretrained_bert/modeling_transfo_xl.py
index 53ebca6e92..f3a3eb46fe 100644
--- a/pytorch_pretrained_bert/modeling_transfo_xl.py
+++ b/pytorch_pretrained_bert/modeling_transfo_xl.py
@@ -57,7 +57,7 @@ def build_tf_to_pytorch_map(model, config):
         This time I use a map to keep the PyTorch model as identical to the original PyTorch model as possible.
     """
     tf_to_pt_map = {}
-    # Embeddings cutoffs
+    # Embeddings
     for i, (embed_l, proj_l) in enumerate(zip(model.word_emb.emb_layers, model.word_emb.emb_projs)):
         layer_str = "transformer/adaptive_embed/cutoff_%d/" % i
         tf_to_pt_map.update({
@@ -934,11 +934,11 @@ class TransfoXLPreTrainedModel(nn.Module):
         # Instantiate model.
         model = cls(config, *inputs, **kwargs)
         if state_dict is None and not from_tf:
-            state_dict = torch.load(resolved_archive_file)
+            state_dict = torch.load(resolved_archive_file, map_location='cpu' if not torch.cuda.is_available() else None)
         if from_tf:
             # Directly load from a TensorFlow checkpoint
-            weights_path = os.path.join(serialization_dir, TF_WEIGHTS_NAME)
-            return load_tf_weights_in_transfo_xl(model, weights_path)
+            return load_tf_weights_in_transfo_xl(model, config, pretrained_model_name_or_path)
+
         missing_keys = []
         unexpected_keys = []
         error_msgs = []
@@ -965,18 +965,49 @@ class TransfoXLPreTrainedModel(nn.Module):
         if len(error_msgs) > 0:
             raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
                                model.__class__.__name__, "\n\t".join(error_msgs)))
+        # Re-tie input/output embeddings after loading; only models that define tie_weights() need it.
+        if hasattr(model, 'tie_weights'):
+            model.tie_weights()
         return model
 
 
 class TransfoXLModel(TransfoXLPreTrainedModel):
+    """Transformer XL model ("Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context").
+
+    Transformer XL uses relative positioning (with sinusoidal patterns) and adaptive softmax inputs, which means that:
+    - you don't need to specify positioning embeddings indices
+    - the tokens in the vocabulary have to be sorted in decreasing frequency.
+
+    Params:
+        config: a TransfoXLConfig class instance with the configuration to build a new model
+
+    Inputs:
+        `input_ids`: a torch.LongTensor of shape [sequence_length, batch_size]
+            with the token indices selected in the range [0, self.config.n_token[
+
+    Outputs:
+        A tuple of (last_hidden_state, new_mems)
+        `last_hidden_state`: the encoded-hidden-states at the top of the model
+            as a torch.FloatTensor of size [sequence_length, batch_size, self.config.d_model]
+        `new_mems`: list (num layers) of updated mem states at the entry of each layer
+            each mem state is a torch.FloatTensor of size [self.config.mem_len, batch_size, self.config.d_model]
+
+    Example usage:
+    ```python
+    # Already been converted into BPE token ids
+    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+    input_ids_next = torch.LongTensor([[53, 21, 1], [64, 23, 100]])
+
+    config = TransfoXLConfig()
+
+    model = TransfoXLModel(config)
+    last_hidden_state, new_mems = model(input_ids)
+
+    # Another time on input_ids_next using the memory:
+    last_hidden_state, new_mems = model(input_ids_next, new_mems)
+    ```
+    """
     def __init__(self, config):
-    # n_token, n_layer, n_head, d_model, d_head, d_inner,
-    #              dropout, dropatt, tie_weight=True, d_embed=None, 
-    #              div_val=1, tie_projs=[False], pre_lnorm=False,
-    #              tgt_len=None, ext_len=None, mem_len=None, 
-    #              cutoffs=[], adapt_inp=False, untie_r=False,
-    #              same_length=False, attn_type=0, clamp_len=-1, 
-    #              sample_softmax=-1, **kwargs):
         super(TransfoXLModel, self).__init__(config)
         self.n_token = config.n_token
 
@@ -1034,31 +1065,6 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
                         r_r_bias=None if config.untie_r else self.r_r_bias)
                 )
 
-        self.sample_softmax = config.sample_softmax
-        # use sampled softmax
-        if config.sample_softmax > 0:
-            self.out_layer = nn.Linear(config.d_model, config.n_token)
-            if config.tie_weight:
-                self.out_layer.weight = self.word_emb.weight
-            self.tie_weight = config.tie_weight
-            self.sampler = LogUniformSampler(config.n_token, config.sample_softmax)
-
-        # use adaptive softmax (including standard softmax)
-        else:
-            self.crit = ProjectedAdaptiveLogSoftmax(config.n_token, config.d_embed, config.d_model, 
-                                                    config.cutoffs, div_val=config.div_val)
-
-            if config.tie_weight:
-                for i in range(len(self.crit.out_layers)):
-                    self.crit.out_layers[i].weight = self.word_emb.emb_layers[i].weight
-
-            if config.tie_projs:
-                for i, tie_proj in enumerate(config.tie_projs):
-                    if tie_proj and config.div_val == 1 and config.d_model != config.d_embed:
-                        self.crit.out_projs[i] = self.word_emb.emb_projs[0]
-                    elif tie_proj and config.div_val != 1:
-                        self.crit.out_projs[i] = self.word_emb.emb_projs[i]
-
         self.same_length = config.same_length
         self.clamp_len = config.clamp_len
 
@@ -1074,6 +1080,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
         elif self.attn_type == 3: # absolute deeper SA
             self.r_emb = nn.Parameter(torch.Tensor(
                     self.n_layer, self.max_klen, self.n_head, self.d_head))
+        self.apply(self.init_weights)
 
     def backward_compatible(self):
         self.sample_softmax = -1
@@ -1210,32 +1217,135 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
 
         return core_out, new_mems
 
-    def forward(self, data, target=None, *mems):
-        # nn.DataParallel does not allow size(0) tensors to be broadcasted.
-        # So, have to initialize size(0) mems inside the model forward.
-        # Moreover, have to return new_mems to allow nn.DataParallel to piece
-        # them together.
-        if not mems:
-            mems = self.init_mems(data)
+    def forward(self, input_ids, mems=None):
+        """ Params:
+                input_ids :: [len, bsz]
+            Returns:
+                tuple (last_hidden, new_mems) where:
+                    new_mems: list (num layers) of mem states at the entry of each layer
+                        shape :: [self.config.mem_len, bsz, self.config.d_model]
+                    last_hidden: output of the last layer:
+                        shape :: [len, bsz, self.config.d_model]
+        """
+        if mems is None:
+            mems = self.init_mems(input_ids)
+        last_hidden, new_mems = self._forward(input_ids, mems=mems)
+        return (last_hidden, new_mems)
 
-        hidden, new_mems = self._forward(data, mems=mems)
-        if target is None:
-            if new_mems is None:
-                return [hidden]
+
+class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
+    """Transformer XL model ("Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context").
+
+    This model adds an (adaptive) softmax head on top of the TransfoXLModel
+
+    Transformer XL uses relative positioning (with sinusoidal patterns) and adaptive softmax inputs, which means that:
+    - you don't need to specify positioning embeddings indices
+    - the tokens in the vocabulary have to be sorted in decreasing frequency.
+
+    Call self.tie_weights() if you update/load the weights of the transformer to keep the weights tied.
+
+    Params:
+        config: a TransfoXLConfig class instance with the configuration to build a new model
+
+    Inputs:
+        `input_ids`: a torch.LongTensor of shape [sequence_length, batch_size]
+            with the token indices selected in the range [0, self.config.n_token[
+        `target`: an optional torch.LongTensor of shape [sequence_length, batch_size]
+            with the target token indices selected in the range [0, self.config.n_token[
+
+    Outputs:
+        A tuple of (softmax_output, new_mems)
+        `softmax_output`: output of the (adaptive) softmax:
+            if target is not None:
+                Negative log likelihood of shape :: [len, bsz]
             else:
-                return [hidden] + new_mems
+                log probabilities of tokens, shape :: [len, bsz, n_tokens]
+        `new_mems`: list (num layers) of updated mem states at the entry of each layer
+            each mem state is a torch.FloatTensor of size [self.config.mem_len, batch_size, self.config.d_model]
 
-        tgt_len = target.size(0)
-        pred_hid = hidden[-tgt_len:]
+    Example usage:
+    ```python
+    # Already been converted into BPE token ids
+    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+    input_ids_next = torch.LongTensor([[53, 21, 1], [64, 23, 100]])
+
+    config = TransfoXLConfig()
+
+    model = TransfoXLLMHeadModel(config)
+    softmax_output, new_mems = model(input_ids)
+
+    # Another time on input_ids_next using the memory (pass it by keyword: the 2nd positional slot is `target`):
+    softmax_output, new_mems = model(input_ids_next, mems=new_mems)
+    ```
+    """
+    def __init__(self, config):
+        super(TransfoXLLMHeadModel, self).__init__(config)
+        self.transformer = TransfoXLModel(config)
+        self.sample_softmax = config.sample_softmax
+        # use sampled softmax
+        if config.sample_softmax > 0:
+            self.out_layer = nn.Linear(config.d_model, config.n_token)
+            self.sampler = LogUniformSampler(config.n_token, config.sample_softmax)
+        # use adaptive softmax (including standard softmax)
+        else:
+            self.crit = ProjectedAdaptiveLogSoftmax(config.n_token, config.d_embed, config.d_model, 
+                                                    config.cutoffs, div_val=config.div_val)
+        self.apply(self.init_weights)
+        self.tie_weights()
+
+    def tie_weights(self):
+        """ Run this to be sure output and input (adaptive) softmax weights are tied """
+        # sampled softmax
+        if self.sample_softmax > 0:
+            if self.config.tie_weight:
+                self.out_layer.weight = self.transformer.word_emb.weight
+        # adaptive softmax (including standard softmax)
+        else:
+            if self.config.tie_weight:
+                for i in range(len(self.crit.out_layers)):
+                    self.crit.out_layers[i].weight = self.transformer.word_emb.emb_layers[i].weight
+            if self.config.tie_projs:
+                for i, tie_proj in enumerate(self.config.tie_projs):
+                    if tie_proj and self.config.div_val == 1 and self.config.d_model != self.config.d_embed:
+                        self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[0]
+                    elif tie_proj and self.config.div_val != 1:
+                        self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[i]
+
+    def reset_length(self, tgt_len, ext_len, mem_len):
+        self.transformer.reset_length(tgt_len, ext_len, mem_len)
+
+    def init_mems(self, data):
+        return self.transformer.init_mems(data)
+
+    def forward(self, input_ids, target=None, mems=None):
+        """ Params:
+                input_ids :: [len, bsz]
+                target :: [len, bsz] (optional)
+            Returns:
+                tuple(softmax_output, new_mems) where:
+                    new_mems: list (num layers) of hidden states at the entry of each layer
+                        shape :: [mem_len, bsz, self.config.d_model]
+                    softmax_output: output of the (adaptive) softmax:
+                        if target is not None:
+                            Negative log likelihood of shape :: [len, bsz]
+                        else:
+                            log probabilities of tokens, shape :: [len, bsz, n_tokens]
+        """
+        bsz = input_ids.size(1)
+        tgt_len = input_ids.size(0)
+
+        last_hidden, new_mems = self.transformer(input_ids, mems)
+
+        pred_hid = last_hidden[-tgt_len:]
         if self.sample_softmax > 0 and self.training:
-            assert self.tie_weight
-            logit = sample_logits(self.word_emb, self.out_layer.bias, target, pred_hid, self.sampler)
+            assert self.config.tie_weight
+            logit = sample_logits(self.transformer.word_emb, self.out_layer.bias, target, pred_hid, self.sampler)
-            loss = -F.log_softmax(logit, -1)[:, :, 0]
+            softmax_output = -F.log_softmax(logit, -1)[:, :, 0]  # bind the name that is returned below
         else:
-            loss = self.crit(pred_hid.view(-1, pred_hid.size(-1)), target.view(-1))
-            loss = loss.view(tgt_len, -1)
+            softmax_output = self.crit(pred_hid.view(-1, pred_hid.size(-1)), target)
+            if target is None:
+                softmax_output = softmax_output.view(tgt_len, bsz, -1)
+            else:
+                softmax_output = softmax_output.view(tgt_len, bsz)
 
-        if new_mems is None:
-            return [loss]
-        else:
-            return (loss, new_mems)
+        return (softmax_output, new_mems)

From 438db43d46d3e7285bc02760c2ed19ee505b2cda Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Thu, 7 Feb 2019 17:07:15 +0100
Subject: [PATCH 41/82] update adaptive softmax head

---
 .../modeling_transfo_xl_utilities.py          | 82 ++++++++++++-------
 1 file changed, 52 insertions(+), 30 deletions(-)

diff --git a/pytorch_pretrained_bert/modeling_transfo_xl_utilities.py b/pytorch_pretrained_bert/modeling_transfo_xl_utilities.py
index 52f80e380f..37c38d3776 100644
--- a/pytorch_pretrained_bert/modeling_transfo_xl_utilities.py
+++ b/pytorch_pretrained_bert/modeling_transfo_xl_utilities.py
@@ -89,24 +89,35 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
 
         return logit
 
-    def forward(self, hidden, target, keep_order=False):
+    def forward(self, hidden, target=None, keep_order=False):
         '''
-            hidden :: [len*bsz x d_proj]
-            target :: [len*bsz]
+            Params:
+                hidden :: [len*bsz x d_proj]
+                target :: [len*bsz] (optional; flattened with view(-1) below)
+            Return:
+                if target is not None:
+                    out :: [len*bsz] Negative log likelihood
+                else:
+                    out :: [len*bsz x n_tokens] log probabilities of tokens over the vocabulary
             We could replace this implementation by the native PyTorch one
-            if their was an option to set bias on all clusters in the native one.
-            line https://github.com/pytorch/pytorch/blob/dbe6a7a9ff1a364a8706bf5df58a1ca96d2fd9da/torch/nn/modules/adaptive.py#L138
+            if theirs had an option to set bias on all clusters in the native one.
+            here: https://github.com/pytorch/pytorch/blob/dbe6a7a9ff1a364a8706bf5df58a1ca96d2fd9da/torch/nn/modules/adaptive.py#L138
         '''
 
-        if hidden.size(0) != target.size(0):
-            raise RuntimeError('Input and target should have the same size '
-                               'in the batch dimension.')
+        if target is not None:
+            target = target.view(-1)
+            if hidden.size(0) != target.size(0):
+                raise RuntimeError('Input and target should have the same size '
+                                'in the batch dimension.')
 
         if self.n_clusters == 0:
             logit = self._compute_logit(hidden, self.out_layers[0].weight,
                                         self.out_layers[0].bias, self.out_projs[0])
-            nll = -F.log_softmax(logit, dim=-1) \
-                    .gather(1, target.unsqueeze(1)).squeeze(1)
+            if target is not None:
+                out = -F.log_softmax(logit, dim=-1) \
+                        .gather(1, target.unsqueeze(1)).squeeze(1)
+            else:
+                out = F.log_softmax(logit, dim=-1)  # must be named `out`: that is what the function returns
         else:
             # construct weights and biases
             weights, biases = [], []
@@ -133,44 +144,55 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
             head_logit = self._compute_logit(hidden, head_weight, head_bias, head_proj)
             head_logprob = F.log_softmax(head_logit, dim=1)
 
-            nll = torch.zeros_like(target,
-                    dtype=hidden.dtype, device=hidden.device)
+            if target is None:
+                out = hidden.new_empty((head_logit.size(0), self.n_token))
+            else:
+                out = torch.zeros_like(target, dtype=hidden.dtype, device=hidden.device)
 
             offset = 0
             cutoff_values = [0] + self.cutoffs
             for i in range(len(cutoff_values) - 1):
                 l_idx, r_idx = cutoff_values[i], cutoff_values[i + 1]
 
-                mask_i = (target >= l_idx) & (target < r_idx)
-                indices_i = mask_i.nonzero().squeeze()
+                if target is not None:
+                    mask_i = (target >= l_idx) & (target < r_idx)
+                    indices_i = mask_i.nonzero().squeeze()
 
-                if indices_i.numel() == 0:
-                    continue
+                    if indices_i.numel() == 0:
+                        continue
 
-                target_i = target.index_select(0, indices_i) - l_idx
-                head_logprob_i = head_logprob.index_select(0, indices_i)
+                    target_i = target.index_select(0, indices_i) - l_idx
+                    head_logprob_i = head_logprob.index_select(0, indices_i)
+                    hidden_i = hidden.index_select(0, indices_i)
+                else:
+                    hidden_i = hidden
 
                 if i == 0:
-                    logprob_i = head_logprob_i.gather(1, target_i[:,None]).squeeze(1)
+                    if target is not None:
+                        logprob_i = head_logprob_i.gather(1, target_i[:,None]).squeeze(1)
+                    else:
+                        out[:, :self.cutoffs[0]] = head_logprob[:, :self.cutoffs[0]]
                 else:
                     weight_i, bias_i, proj_i = weights[i], biases[i], self.out_projs[i]
 
-                    hidden_i = hidden.index_select(0, indices_i)
-
                     tail_logit_i = self._compute_logit(hidden_i, weight_i, bias_i, proj_i)
                     tail_logprob_i = F.log_softmax(tail_logit_i, dim=1)
                     cluster_prob_idx = self.cutoffs[0] + i - 1  # No probability for the head cluster
-                    logprob_i = head_logprob_i[:, cluster_prob_idx] \
-                              + tail_logprob_i.gather(1, target_i[:, None]).squeeze(1)
+                    if target is not None:
+                        logprob_i = head_logprob_i[:, cluster_prob_idx] \
+                                + tail_logprob_i.gather(1, target_i[:, None]).squeeze(1)
+                    else:
+                        logprob_i = head_logprob[:, cluster_prob_idx, None] + tail_logprob_i
+                        out[:, l_idx:r_idx] = logprob_i
 
-                if (hasattr(self, 'keep_order') and self.keep_order) or keep_order:
-                    nll.index_copy_(0, indices_i, -logprob_i)
-                else:
-                    nll[offset:offset+logprob_i.size(0)].copy_(-logprob_i)
+                if target is not None:
+                    if (hasattr(self, 'keep_order') and self.keep_order) or keep_order:
+                        out.index_copy_(0, indices_i, -logprob_i)
+                    else:
+                        out[offset:offset+logprob_i.size(0)].copy_(-logprob_i)
+                    offset += logprob_i.size(0)
 
-                offset += logprob_i.size(0)
-
-        return nll
+        return out
 
 
     def log_prob(self, hidden):

From f99f2fb6618b20ff2c290ba23e182a5ac1ac000a Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Thu, 7 Feb 2019 17:07:22 +0100
Subject: [PATCH 42/82] docstrings

---
 README.md                                     | 103 +++++++++++++++---
 .../tokenization_openai.py                    |   3 +-
 2 files changed, 87 insertions(+), 19 deletions(-)

diff --git a/README.md b/README.md
index 8f83971893..0783a62776 100644
--- a/README.md
+++ b/README.md
@@ -1,14 +1,25 @@
-# PyTorch Pretrained Bert (also with PyTorch Pretrained OpenAI GPT)
+# PyTorch Pretrained BERT: The Big and Extending Repository of (pre-trained) Transformers
 
 [![CircleCI](https://circleci.com/gh/huggingface/pytorch-pretrained-BERT.svg?style=svg)](https://circleci.com/gh/huggingface/pytorch-pretrained-BERT)
 
-This repository contains an op-for-op PyTorch reimplementation of [Google's TensorFlow repository for the BERT model](https://github.com/google-research/bert) and of [OpenAI's TensorFlow repository for the OpenAI GPT model](https://github.com/openai/finetune-transformer-lm)
+This repository contains op-for-op PyTorch reimplementations, pre-trained models and fine-tuning examples for:
 
-BERT that was released together with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
+- [Google's BERT model](https://github.com/google-research/bert),
+- [OpenAI's GPT model](https://github.com/openai/finetune-transformer-lm), and
+- [Google/CMU's Transformer-XL model](https://github.com/kimiyoung/transformer-xl).
+
+These implementations have been tested on several datasets (see the examples) and should match the performances of the associated TensorFlow implementations (e.g. ~91 F1 on SQuAD for BERT, ~88 F1 on RocStories for OpenAI GPT and ~18.3 perplexity on WikiText 103 for the Transformer-XL). You can find more details in the [Examples](#examples) section below.
+
+Here is some information on these models:
+
+**BERT** was released together with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
 This PyTorch implementation of BERT is provided with [Google's pre-trained models](https://github.com/google-research/bert), examples, notebooks and a command-line interface to load any pre-trained TensorFlow checkpoint for BERT is also provided.
 
-OpenAI GPT that was released together with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
-This PyTorch implementation of OpenAI GPT is provided with [OpenAI's pre-trained model](https://github.com/openai/finetune-transformer-lm) and a command-line interface that was used to convert the pre-trained NumPy checkpoint in the provided PyTorch model.
+**OpenAI GPT** was released together with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
+This PyTorch implementation of OpenAI GPT is provided with [OpenAI's pre-trained model](https://github.com/openai/finetune-transformer-lm) and a command-line interface that was used to convert the pre-trained NumPy checkpoint into PyTorch.
+
+**Google/CMU's Transformer-XL** was released together with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](http://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
+This PyTorch implementation of Transformer-XL is an adaptation of the original [PyTorch implementation](https://github.com/kimiyoung/transformer-xl) which has been slightly modified to match the performance of the TensorFlow implementation and to allow re-using the pretrained weights. A command-line interface is provided to convert TensorFlow checkpoints into PyTorch models.
 
 ## Content
 
@@ -52,7 +63,7 @@ python -m pytest -sv tests/
 
 This package comprises the following classes that can be imported in Python and are detailed in the [Doc](#doc) section of this readme:
 
-- Eight PyTorch models (`torch.nn.Module`) for Bert with pre-trained weights (in the [`modeling.py`](./pytorch_pretrained_bert/modeling.py) file):
+- Eight **Bert** PyTorch models (`torch.nn.Module`) with pre-trained weights (in the [`modeling.py`](./pytorch_pretrained_bert/modeling.py) file):
   - [`BertModel`](./pytorch_pretrained_bert/modeling.py#L556) - raw BERT Transformer model (**fully pre-trained**),
   - [`BertForMaskedLM`](./pytorch_pretrained_bert/modeling.py#L710) - BERT Transformer with the pre-trained masked language modeling head on top (**fully pre-trained**),
   - [`BertForNextSentencePrediction`](./pytorch_pretrained_bert/modeling.py#L771) - BERT Transformer with the pre-trained next sentence prediction classifier on top  (**fully pre-trained**),
@@ -62,40 +73,46 @@ This package comprises the following classes that can be imported in Python and
   - [`BertForTokenClassification`](./pytorch_pretrained_bert/modeling.py#L969) - BERT Transformer with a token classification head on top (BERT Transformer is **pre-trained**, the token classification head **is only initialized and has to be trained**),
   - [`BertForQuestionAnswering`](./pytorch_pretrained_bert/modeling.py#L1034) - BERT Transformer with a token classification head on top (BERT Transformer is **pre-trained**, the token classification head **is only initialized and has to be trained**).
 
-- Three PyTorch models (`torch.nn.Module`) for OpenAI with pre-trained weights (in the [`modeling_openai.py`](./pytorch_pretrained_bert/modeling_openai.py) file):
+- Three **OpenAI GPT** PyTorch models (`torch.nn.Module`) with pre-trained weights (in the [`modeling_openai.py`](./pytorch_pretrained_bert/modeling_openai.py) file):
   - [`OpenAIGPTModel`](./pytorch_pretrained_bert/modeling_openai.py#L537) - raw OpenAI GPT Transformer model (**fully pre-trained**),
   - [`OpenAIGPTLMHeadModel`](./pytorch_pretrained_bert/modeling_openai.py#L691) - OpenAI GPT Transformer with the tied language modeling head on top (**fully pre-trained**),
   - [`OpenAIGPTDoubleHeadsModel`](./pytorch_pretrained_bert/modeling_openai.py#L752) - OpenAI GPT Transformer with the tied language modeling head and a multiple choice classification head on top (OpenAI GPT Transformer is **pre-trained**, the multiple choice classification head **is only initialized and has to be trained**),
 
-- Three tokenizers for BERT (in the [`tokenization.py`](./pytorch_pretrained_bert/tokenization.py) file):
+- Tokenizers for **BERT** (using word-piece) (in the [`tokenization.py`](./pytorch_pretrained_bert/tokenization.py) file):
   - `BasicTokenizer` - basic tokenization (punctuation splitting, lower casing, etc.),
   - `WordpieceTokenizer` - WordPiece tokenization,
   - `BertTokenizer` - perform end-to-end tokenization, i.e. basic tokenization followed by WordPiece tokenization.
 
-- One tokenizers for OpenAI GPT (in the [`tokenization_openai.py`](./pytorch_pretrained_bert/tokenization_openai.py) file):
+- Tokenizer for **OpenAI GPT** (using Byte-Pair-Encoding) (in the [`tokenization_openai.py`](./pytorch_pretrained_bert/tokenization_openai.py) file):
   - `OpenAIGPTTokenizer` - perform Byte-Pair-Encoding (BPE) tokenization,
 
-- One optimizer for BERT (in the [`optimization.py`](./pytorch_pretrained_bert/optimization.py) file):
+- Optimizer for **BERT** (in the [`optimization.py`](./pytorch_pretrained_bert/optimization.py) file):
   - `BertAdam` - Bert version of Adam algorithm with weight decay fix, warmup and linear decay of the learning rate.
 
-- One optimizer for OpenAI GPT (in the [`optimization_openai.py`](./pytorch_pretrained_bert/optimization_openai.py) file):
+- Optimizer for **OpenAI GPT** (in the [`optimization_openai.py`](./pytorch_pretrained_bert/optimization_openai.py) file):
   - `OpenAIGPTAdam` - OpenAI GPT version of Adam algorithm with weight decay fix, warmup and linear decay of the learning rate.
 
-- A configuration class for BERT (in the [`modeling.py`](./pytorch_pretrained_bert/modeling.py) file):
+- Configuration classes for BERT, OpenAI GPT and Transformer-XL (in the respective [`modeling.py`](./pytorch_pretrained_bert/modeling.py), [`modeling_openai.py`](./pytorch_pretrained_bert/modeling_openai.py), [`modeling_transfo_xl.py`](./pytorch_pretrained_bert/modeling_transfo_xl.py) files):
   - `BertConfig` - Configuration class to store the configuration of a `BertModel` with utilities to read and write from JSON configuration files.
-
-- A configuration class for OpenAI GPT (in the [`modeling_openai.py`](./pytorch_pretrained_bert/modeling_openai.py) file):
   - `OpenAIGPTConfig` - Configuration class to store the configuration of a `OpenAIGPTModel` with utilities to read and write from JSON configuration files.
+  - `TransfoXLConfig` - Configuration class to store the configuration of a `TransfoXLModel` with utilities to read and write from JSON configuration files.
 
 The repository further comprises:
 
-- Five examples on how to use Bert (in the [`examples` folder](./examples)):
+- Five examples on how to use **BERT** (in the [`examples` folder](./examples)):
   - [`extract_features.py`](./examples/extract_features.py) - Show how to extract hidden states from an instance of `BertModel`,
   - [`run_classifier.py`](./examples/run_classifier.py) - Show how to fine-tune an instance of `BertForSequenceClassification` on GLUE's MRPC task,
-  - [`run_squad.py`](./examples/run_squad.py) - Show how to fine-tune an instance of `BertForQuestionAnswering` on SQuAD v1.0 task.
+  - [`run_squad.py`](./examples/run_squad.py) - Show how to fine-tune an instance of `BertForQuestionAnswering` on SQuAD v1.0 and SQuAD v2.0 tasks.
   - [`run_swag.py`](./examples/run_swag.py) - Show how to fine-tune an instance of `BertForMultipleChoice` on Swag task.
   - [`run_lm_finetuning.py`](./examples/run_lm_finetuning.py) - Show how to fine-tune an instance of `BertForPretraining' on a target text corpus.  
-   
+
+- One example on how to use **OpenAI GPT** (in the [`examples` folder](./examples)):
+  - [`openai_gpt_train.py`](./examples/openai_gpt_train.py) - Show how to fine-tune an instance of `OpenAIGPTDoubleHeadsModel` on the RocStories task.
+
+- Two examples on how to use **Transformer-XL** (in the [`examples` folder](./examples)):
+  - [`transfo_xl_train.py`](./examples/transfo_xl_train.py) - Show how to train and evaluate an instance of `TransfoXLModel` on WikiText 103,
+  - [`transfo_xl_eval.py`](./examples/transfo_xl_eval.py) - Simply evaluate a pre-trained model of `TransfoXLModel` on WikiText 103.
+
   These examples are detailed in the [Examples](#examples) section of this readme.
 
 - Three notebooks that were used to check that the TensorFlow and PyTorch models behave identically (in the [`notebooks` folder](./notebooks)):
@@ -105,7 +122,7 @@ The repository further comprises:
 
   These notebooks are detailed in the [Notebooks](#notebooks) section of this readme.
 
-- A command-line interface to convert any TensorFlow checkpoint (BERT) and NumPy checkpoint (OpenAI) in a PyTorch dump:
+- A command-line interface to convert TensorFlow checkpoints (BERT, Transformer-XL) or NumPy checkpoint (OpenAI) in a PyTorch save of the associated PyTorch model:
 
   This CLI is detailed in the [Command-line interface](#Command-line-interface) section of this readme.
 
@@ -222,6 +239,56 @@ predicted_index = torch.argmax(predictions[0, masked_index]).item()
 predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
 ```
 
+### Transformer-XL
+
+Here is a quick-start example using `OpenAIGPTTokenizer`, `OpenAIGPTModel` and `OpenAIGPTLMHeadModel` class with OpenAI's pre-trained  model. See the [doc section](#doc) below for all the details on these classes.
+
+First let's prepare a tokenized input with `OpenAIGPTTokenizer`
+
+```python
+import torch
+from pytorch_pretrained_bert import OpenAIGPTTokenizer, OpenAIGPTModel, OpenAIGPTLMHeadModel
+
+# Load pre-trained model tokenizer (vocabulary)
+tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
+
+# Tokenized input
+text = "Who was Jim Henson ? Jim Henson was a puppeteer"
+tokenized_text = tokenizer.tokenize(text)
+
+# Convert token to vocabulary indices
+indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
+
+# Convert inputs to PyTorch tensors
+tokens_tensor = torch.tensor([indexed_tokens])
+```
+
+Let's see how to use `OpenAIGPTModel` to get hidden states
+
+```python
+# Load pre-trained model (weights)
+model = OpenAIGPTModel.from_pretrained('openai-gpt')
+model.eval()
+
+# Predict hidden states features for each layer
+hidden_states = model(tokens_tensor, segments_tensors)
+```
+
+And how to use `OpenAIGPTLMHeadModel`
+
+```python
+# Load pre-trained model (weights)
+model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
+model.eval()
+
+# Predict all tokens
+predictions = model(tokens_tensor)
+
+# get the predicted last token
+predicted_index = torch.argmax(predictions[0, masked_index]).item()
+predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
+```
+
 ## Doc
 
 Here is a detailed documentation of the classes in the package and how to use them:
diff --git a/pytorch_pretrained_bert/tokenization_openai.py b/pytorch_pretrained_bert/tokenization_openai.py
index 55ac2bc892..fcb8e13949 100644
--- a/pytorch_pretrained_bert/tokenization_openai.py
+++ b/pytorch_pretrained_bert/tokenization_openai.py
@@ -193,6 +193,7 @@ class OpenAIGPTTokenizer(object):
         return word
 
     def tokenize(self, text):
+        """ Tokenize a string. """
         split_tokens = []
         text = self.nlp(text_standardize(self.fix_text(text)))
         for token in text:
@@ -200,7 +201,7 @@ class OpenAIGPTTokenizer(object):
         return split_tokens
 
     def convert_tokens_to_ids(self, tokens):
-        """Converts a sequence of tokens into ids using the vocab."""
+        """ Converts a sequence of tokens into ids using the vocab. """
         ids = []
         if isinstance(tokens, str) or (sys.version_info[0] == 2 and isinstance(tokens, unicode)):
             if tokens in self.special_tokens:

From 009b581316bf0d09bb3a902c72a83f26f8292b33 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Thu, 7 Feb 2019 23:15:05 +0100
Subject: [PATCH 43/82] updated readme

---
 README.md | 121 ++++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 104 insertions(+), 17 deletions(-)

diff --git a/README.md b/README.md
index 0783a62776..4f387adc5a 100644
--- a/README.md
+++ b/README.md
@@ -36,7 +36,7 @@ This PyTorch implementation of Transformer-XL is an adaptation of the original [
 
 ## Installation
 
-This repo was tested on Python 3.5+ and PyTorch 0.4.1/1.0.0
+This repo was tested on Python 2.7 and 3.5+ (examples are tested only on python 3.5+) and PyTorch 0.4.1/1.0.0
 
 ### With pip
 
@@ -78,13 +78,20 @@ This package comprises the following classes that can be imported in Python and
   - [`OpenAIGPTLMHeadModel`](./pytorch_pretrained_bert/modeling_openai.py#L691) - OpenAI GPT Transformer with the tied language modeling head on top (**fully pre-trained**),
   - [`OpenAIGPTDoubleHeadsModel`](./pytorch_pretrained_bert/modeling_openai.py#L752) - OpenAI GPT Transformer with the tied language modeling head and a multiple choice classification head on top (OpenAI GPT Transformer is **pre-trained**, the multiple choice classification head **is only initialized and has to be trained**),
 
+- Two **Transformer-XL** PyTorch models (`torch.nn.Module`) with pre-trained weights (in the [`modeling_transfo_xl.py`](./pytorch_pretrained_bert/modeling_transfo_xl.py) file):
+  - [`TransfoXLModel`](./pytorch_pretrained_bert/modeling_transfo_xl.py#L974) - Transformer-XL model which outputs the last hidden state and memory cells (**fully pre-trained**),
+  - [`TransfoXLLMHeadModel`](./pytorch_pretrained_bert/modeling_transfo_xl.py#L1236) - Transformer-XL with the tied adaptive softmax head on top for language modeling which outputs the logits/loss and memory cells (**fully pre-trained**),
+
 - Tokenizers for **BERT** (using word-piece) (in the [`tokenization.py`](./pytorch_pretrained_bert/tokenization.py) file):
   - `BasicTokenizer` - basic tokenization (punctuation splitting, lower casing, etc.),
   - `WordpieceTokenizer` - WordPiece tokenization,
   - `BertTokenizer` - perform end-to-end tokenization, i.e. basic tokenization followed by WordPiece tokenization.
 
 - Tokenizer for **OpenAI GPT** (using Byte-Pair-Encoding) (in the [`tokenization_openai.py`](./pytorch_pretrained_bert/tokenization_openai.py) file):
-  - `OpenAIGPTTokenizer` - perform Byte-Pair-Encoding (BPE) tokenization,
+  - `OpenAIGPTTokenizer` - perform Byte-Pair-Encoding (BPE) tokenization.
+
+- Tokenizer for **Transformer-XL** (word tokens ordered by frequency for adaptive softmax) (in the [`tokenization_transfo_xl.py`](./pytorch_pretrained_bert/tokenization_transfo_xl.py) file):
+  - `TransfoXLTokenizer` - performs word tokenization and can order words by frequency in a corpus for use in an adaptive softmax.
 
 - Optimizer for **BERT** (in the [`optimization.py`](./pytorch_pretrained_bert/optimization.py) file):
   - `BertAdam` - Bert version of Adam algorithm with weight decay fix, warmup and linear decay of the learning rate.
@@ -221,7 +228,7 @@ model = OpenAIGPTModel.from_pretrained('openai-gpt')
 model.eval()
 
 # Predict hidden states features for each layer
-hidden_states = model(tokens_tensor, segments_tensors)
+hidden_states = model(tokens_tensor)
 ```
 
 And how to use `OpenAIGPTLMHeadModel`
@@ -247,31 +254,37 @@ First let's prepare a tokenized input with `OpenAIGPTTokenizer`
 
 ```python
 import torch
-from pytorch_pretrained_bert import OpenAIGPTTokenizer, OpenAIGPTModel, OpenAIGPTLMHeadModel
+from pytorch_pretrained_bert import TransfoXLTokenizer, TransfoXLModel, TransfoXLLMHeadModel
 
-# Load pre-trained model tokenizer (vocabulary)
-tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
+# Load pre-trained model tokenizer (vocabulary from wikitext 103)
+tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
 
 # Tokenized input
-text = "Who was Jim Henson ? Jim Henson was a puppeteer"
-tokenized_text = tokenizer.tokenize(text)
+text_1 = "Who was Jim Henson ?"
+text_2 = "Jim Henson was a puppeteer"
+tokenized_text_1 = tokenizer.tokenize(text_1)
+tokenized_text_2 = tokenizer.tokenize(text_2)
 
 # Convert token to vocabulary indices
-indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
+indexed_tokens_1 = tokenizer.convert_tokens_to_ids(tokenized_text_1)
+indexed_tokens_2 = tokenizer.convert_tokens_to_ids(tokenized_text_2)
 
 # Convert inputs to PyTorch tensors
-tokens_tensor = torch.tensor([indexed_tokens])
+tokens_tensor_1 = torch.tensor([indexed_tokens_1])
+tokens_tensor_2 = torch.tensor([indexed_tokens_2])
 ```
 
-Let's see how to use `OpenAIGPTModel` to get hidden states
+Let's see how to use `TransfoXLModel` to get hidden states
 
 ```python
 # Load pre-trained model (weights)
-model = OpenAIGPTModel.from_pretrained('openai-gpt')
+model = TransfoXLModel.from_pretrained('transfo-xl-wt103')
 model.eval()
 
 # Predict hidden states features for each layer
-hidden_states = model(tokens_tensor, segments_tensors)
+hidden_states_1, mems_1 = model(tokens_tensor_1)
+# We can re-use the memory cells in a subsequent call to attend a longer context
+hidden_states_2, mems_2 = model(tokens_tensor_2, mems_1)
 ```
 
 And how to use `OpenAIGPTLMHeadModel`
@@ -282,10 +295,12 @@ model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
 model.eval()
 
 # Predict all tokens
-predictions = model(tokens_tensor)
+predictions_1, mems_1 = model(tokens_tensor_1)
+# We can re-use the memory cells in a subsequent call to attend a longer context
+predictions_2, mems_2 = model(tokens_tensor_2, mems_1)
 
 # get the predicted last token
-predicted_index = torch.argmax(predictions[0, masked_index]).item()
+predicted_index = torch.argmax(predictions_1[0, masked_index]).item()
 predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
 ```
 
@@ -323,11 +338,12 @@ where
     - `bert-base-multilingual-cased`: **(New, recommended)** 104 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
     - `bert-base-chinese`: Chinese Simplified and Traditional, 12-layer, 768-hidden, 12-heads, 110M parameters
     - `openai-gpt`: OpenAI English model, 12-layer, 768-hidden, 12-heads, 110M parameters
+    - `transfo-xl-wt103`: Transformer-XL English model trained on wikitext-103, 18-layer, 1024-hidden, 16-heads, 257M parameters
 
   - a path or url to a pretrained model archive containing:
 
     - `bert_config.json` or `openai_gpt_config.json` a configuration file for the model, and
-    - `pytorch_model.bin` a PyTorch dump of a pre-trained instance of `BertForPreTraining` or `OpenAIGPTModel` (saved with the usual `torch.save()`)
+    - `pytorch_model.bin` a PyTorch dump of a pre-trained instance of `BertForPreTraining`, `OpenAIGPTModel` or `TransfoXLModel` (saved with the usual `torch.save()`)
 
   If `PRE_TRAINED_MODEL_NAME_OR_PATH` is a shortcut name, the pre-trained weights will be downloaded from AWS S3 (see the links [here](pytorch_pretrained_bert/modeling.py)) and stored in a cache folder to avoid future download (the cache folder can be found at `~/.pytorch_pretrained_bert/`).
 - `cache_dir` can be an optional path to a specific directory to download and cache the pre-trained model weights. This option is useful in particular when you are using distributed training: to avoid concurrent access to the same weights you can set for example `cache_dir='./pretrained_model_{}'.format(args.local_rank)` (see the section on distributed training for more information).
@@ -345,6 +361,10 @@ model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
 # OpenAI GPT
 tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
 model = OpenAIGPTModel.from_pretrained('openai-gpt')
+
+# Transformer-XL
+tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
+model = TransfoXLModel.from_pretrained('transfo-xl-wt103')
 ```
 
 ### PyTorch models
@@ -523,6 +543,37 @@ This model *outputs*:
   - `lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, num_choices, sequence_length, total_tokens_embeddings]
   - `multiple_choice_logits`: the multiple choice logits as a torch.FloatTensor of size [batch_size, num_choices]
 
+#### 12. `TransfoXLModel`
+
+The Transformer-XL model is described in "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context".
+
+Transformer-XL uses relative positioning with sinusoidal patterns and adaptive softmax inputs, which means that:
+
+- you don't need to specify positioning embeddings indices
+- the tokens in the vocabulary have to be sorted by decreasing frequency.
+
+This model takes as *inputs*:
+[`modeling_transfo_xl.py`](./pytorch_pretrained_bert/modeling_transfo_xl.py)
+- `input_ids`: a torch.LongTensor of shape [sequence_length, batch_size] with the token indices selected in the range [0, self.config.n_token[
+- `mems`: an optional memory of hidden states from previous forward passes as a list (num layers) of hidden states at the entry of each layer. Each hidden states has shape [self.config.mem_len, bsz, self.config.d_model]
+
+This model *outputs* a tuple of (last_hidden_state, new_mems)
+- `last_hidden_state`: the encoded-hidden-states at the top of the model as a torch.FloatTensor of size [sequence_length, batch_size, self.config.d_model]
+- `new_mems`: list (num layers) of updated mem states at the entry of each layer each mem state is a torch.FloatTensor of size [self.config.mem_len, batch_size, self.config.d_model]
+
+#### 13. `TransfoXLLMHeadModel`
+
+`TransfoXLLMHeadModel` includes the `TransfoXLModel` Transformer followed by an (adaptive) softmax head with weights tied to the input embeddings.
+
+*Inputs* are the same as the inputs of the [`TransfoXLModel`](#-12.-`TransfoXLModel`) class plus optional labels:
+- `target`: an optional torch.LongTensor of shape [sequence_length, batch_size] with the target token indices selected in the range [0, self.config.n_token[
+
+*Outputs* a tuple of (softmax_output, new_mems)
+- `softmax_output`: output of the (adaptive) softmax:
+  - if target is None: log probabilities of tokens, shape :: [len, bsz, n_tokens]
+  - else: Negative log likelihood of shape :: [len, bsz]
+- `new_mems`: list (num layers) of updated mem states at the entry of each layer each mem state is a torch.FloatTensor of size [self.config.mem_len, batch_size, self.config.d_model]
+
 
 ### Tokenizers:
 
@@ -547,7 +598,7 @@ Please refer to the doc strings and code in [`tokenization.py`](./pytorch_pretra
 
 `OpenAIGPTTokenizer` perform Byte-Pair-Encoding (BPE) tokenization.
 
-This class has one arguments:
+This class has two arguments:
 
 - `vocab_file`: path to a vocabulary file.
 - `merges_file`: path to a file containing the BPE merges.
@@ -560,6 +611,12 @@ and three methods:
 
 Please refer to the doc strings and code in [`tokenization_openai.py`](./pytorch_pretrained_bert/tokenization_openai.py) for the details of the `OpenAIGPTTokenizer`.
 
+#### `TransfoXLTokenizer`
+
+`TransfoXLTokenizer` perform word tokenization.
+
+Please refer to the doc strings and code in [`tokenization_transfo_xl.py`](./pytorch_pretrained_bert/tokenization_transfo_xl.py) for the details of the `TransfoXLTokenizer`.
+
 ### Optimizers:
 
 #### `BertAdam`
@@ -758,6 +815,36 @@ python run_lm_finetuning.py \
   --max_seq_length 128 \
 ```
 
+### OpenAI GPT and Transformer-XL: running the examples
+
+We provide two example scripts for OpenAI GPT and Transformer-XL based on (and extended from) the respective original implementations:
+
+#### Fine-tuning OpenAI GPT on the RocStories dataset
+
+This example code fine-tunes OpenAI GPT on the RocStories dataset.
+
+Before running this example you should download the
+[RocStories dataset](https://github.com/snigdhac/StoryComprehension_EMNLP/tree/master/Dataset/RoCStories) and unpack it to some directory `$ROC_STORIES_DIR`.
+
+```shell
+export ROC_STORIES_DIR=/path/to/RocStories
+
+python train_openai_gpt.py \
+  --task_name MRPC \
+  --do_train \
+  --do_eval \
+  --do_lower_case \
+  --data_dir $GLUE_DIR/MRPC/ \
+  --bert_model bert-base-uncased \
+  --max_seq_length 128 \
+  --train_batch_size 32 \
+  --learning_rate 2e-5 \
+  --num_train_epochs 3.0 \
+  --output_dir /tmp/mrpc_output/
+```
+
+Our test ran on a few seeds with [the original implementation hyper-parameters](https://github.com/google-research/bert#sentence-and-sentence-pair-classification-tasks) gave evaluation results between 84% and 88%.
+
 ## Fine-tuning BERT-large on GPUs
 
 The options we list above allow to fine-tune BERT-large rather easily on GPU(s) instead of the TPU used by the original implementation.

From e77721e4fe8f2665132bf11ab26c3ab352e2f2a2 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Thu, 7 Feb 2019 23:15:15 +0100
Subject: [PATCH 44/82] renamed examples

---
 examples/{openai_gpt_train.py => train_openai_gpt.py} | 0
 examples/{transfo_xl_train.py => train_transfo_xl.py} | 2 +-
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename examples/{openai_gpt_train.py => train_openai_gpt.py} (100%)
 rename examples/{transfo_xl_train.py => train_transfo_xl.py} (99%)

diff --git a/examples/openai_gpt_train.py b/examples/train_openai_gpt.py
similarity index 100%
rename from examples/openai_gpt_train.py
rename to examples/train_openai_gpt.py
diff --git a/examples/transfo_xl_train.py b/examples/train_transfo_xl.py
similarity index 99%
rename from examples/transfo_xl_train.py
rename to examples/train_transfo_xl.py
index 09d30aed28..6ea0920489 100644
--- a/examples/transfo_xl_train.py
+++ b/examples/train_transfo_xl.py
@@ -437,7 +437,7 @@ def evaluate(eval_iter):
             if args.max_eval_steps > 0 and i >= args.max_eval_steps:
                 break
             ret = model(data, target, *mems)
-            loss, mems = ret[0], ret[1:]
+            loss, mems = ret
             loss = loss.mean()
             total_loss += seq_len * loss.float().item()
             total_len += seq_len

From eb8fda51f416a1f621541cb132a644b3cea276ac Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Thu, 7 Feb 2019 23:15:20 +0100
Subject: [PATCH 45/82] update docstrings

---
 pytorch_pretrained_bert/modeling_transfo_xl.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/pytorch_pretrained_bert/modeling_transfo_xl.py b/pytorch_pretrained_bert/modeling_transfo_xl.py
index f3a3eb46fe..8f3ccb7283 100644
--- a/pytorch_pretrained_bert/modeling_transfo_xl.py
+++ b/pytorch_pretrained_bert/modeling_transfo_xl.py
@@ -984,7 +984,9 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
     Inputs:
         `input_ids`: a torch.LongTensor of shape [sequence_length, batch_size]
             with the token indices selected in the range [0, self.config.n_token[
-
+        `mems`: optional memory of hidden states from previous forward passes
+            as a list (num layers) of hidden states at the entry of each layer
+            each hidden states has shape [self.config.mem_len, bsz, self.config.d_model]
     Outputs:
         A tuple of (last_hidden_state, new_mems)
         `last_hidden_state`: the encoded-hidden-states at the top of the model
@@ -1220,6 +1222,9 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
     def forward(self, input_ids, mems=None):
         """ Params:
                 input_ids :: [len, bsz]
+                mems :: optional mems from previous forward passes (or init_mems)
+                    list (num layers) of mem states at the entry of each layer
+                        shape :: [self.config.mem_len, bsz, self.config.d_model]
             Returns:
                 tuple (last_hidden, new_mems) where:
                     new_mems: list (num layers) of mem states at the entry of each layer
@@ -1250,8 +1255,11 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
     Inputs:
         `input_ids`: a torch.LongTensor of shape [sequence_length, batch_size]
             with the token indices selected in the range [0, self.config.n_token[
-        `target`: a torch.LongTensor of shape [sequence_length, batch_size]
+        `target`: an optional torch.LongTensor of shape [sequence_length, batch_size]
             with the target token indices selected in the range [0, self.config.n_token[
+        `mems`: an optional memory of hidden states from previous forward passes
+            as a list (num layers) of hidden states at the entry of each layer
+            each hidden states has shape [self.config.mem_len, bsz, self.config.d_model]
 
     Outputs:
         A tuple of (last_hidden_state, new_mems)

From 6bc082da0aedff4c6128356fbef8e101fc23d635 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 8 Feb 2019 00:02:26 +0100
Subject: [PATCH 46/82] updating examples

---
 examples/run_openai_gpt.py   | 229 ++++++++++++++
 examples/run_transfo_xl.py   | 142 +++++++++
 examples/train_openai_gpt.py | 344 --------------------
 examples/train_transfo_xl.py | 595 -----------------------------------
 examples/transfo_xl_eval.py  | 139 --------
 5 files changed, 371 insertions(+), 1078 deletions(-)
 create mode 100644 examples/run_openai_gpt.py
 create mode 100644 examples/run_transfo_xl.py
 delete mode 100644 examples/train_openai_gpt.py
 delete mode 100644 examples/train_transfo_xl.py
 delete mode 100644 examples/transfo_xl_eval.py

diff --git a/examples/run_openai_gpt.py b/examples/run_openai_gpt.py
new file mode 100644
index 0000000000..4f76407958
--- /dev/null
+++ b/examples/run_openai_gpt.py
@@ -0,0 +1,229 @@
+# coding=utf-8
+# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HugginFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" OpenAI GPT model fine-tuning script.
+    Adapted from https://github.com/huggingface/pytorch-openai-transformer-lm/blob/master/train.py
+    Itself adapted from https://github.com/openai/finetune-transformer-lm/blob/master/train.py
+
+    This script with default values fine-tunes and evaluates a pretrained OpenAI GPT on the RocStories dataset
+"""
+import argparse
+import os
+import csv
+import random
+import logging
+from tqdm import tqdm, trange
+
+import numpy as np
+import torch
+from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
+                              TensorDataset)
+
+from pytorch_pretrained_bert import OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer, OpenAIAdam
+
+logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
+                    datefmt = '%m/%d/%Y %H:%M:%S',
+                    level = logging.INFO)
+logger = logging.getLogger(__name__)
+
+def accuracy(out, labels):
+    """ Count the rows of `out` whose arg-max along axis 1 equals the matching entry of `labels` """
+    return np.sum(np.argmax(out, axis=1) == labels)
+
+def load_rocstories_dataset(dataset_path):
+    """ Read the csv file at `dataset_path`, skipping its first row, and output a list of
+        tuples(story, 1st continuation, 2nd continuation, label)
+        story joins columns 1 to 4 with spaces; label is the last column converted to int minus 1
+    """
+    with open(dataset_path, encoding='utf_8') as csv_file:
+        rows = csv.reader(csv_file)
+        next(rows) # skip the first line
+        return [(' '.join(row[1:5]), row[5], row[6], int(row[-1])-1) for row in tqdm(rows)]
+
+def pre_process_datasets(encoded_datasets, max_len, start_token, delimiter_token, clf_token):
+    """ Pre-process datasets containing lists of
+        tuples(story, 1st continuation, 2nd continuation, label) of already encoded token ids
+
+        Each dataset becomes a tuple of int64 tensors (input_ids, mc_token_mask, lm_labels, mc_labels) where
+        input_ids[batch, alternative, :] = [start_token] + story[:max_len] + [delimiter_token] + cont[:max_len] + [clf_token]
+        is zero-padded to 2 * max_len + 3 and lm_labels is -1 wherever there is no next token to predict
+    """
+    input_len = 2 * max_len + 3  # longest possible sequence: both truncated parts plus the 3 special tokens
+    tensor_datasets = []
+    for dataset in encoded_datasets:
+        n_batch = len(dataset)
+        input_ids = np.zeros((n_batch, 2, input_len), dtype=np.int64)
+        mc_token_mask = np.zeros((n_batch, 2, input_len), dtype=np.int64)
+        lm_labels = np.full((n_batch, 2, input_len), -1, dtype=np.int64)
+        mc_labels = np.zeros((n_batch,), dtype=np.int64)
+        for i, (story, cont1, cont2, mc_label) in enumerate(dataset):
+            prefix = [start_token] + story[:max_len] + [delimiter_token]
+            for j, cont in enumerate((cont1, cont2)):
+                tokens = prefix + cont[:max_len] + [clf_token]
+                input_ids[i, j, :len(tokens)] = tokens
+                mc_token_mask[i, j, len(tokens) - 1] = 1  # flag the classification token of both alternatives
+                lm_labels[i, j, :len(tokens) - 1] = tokens[1:]  # target at position t is the token at t + 1
+            mc_labels[i] = mc_label
+        tensor_datasets.append(tuple(torch.tensor(t) for t in (input_ids, mc_token_mask, lm_labels, mc_labels)))
+    return tensor_datasets
+
+def main():
+    """ Fine-tune (--do_train) and/or evaluate (--do_eval) OpenAIGPTDoubleHeadsModel on RocStories """
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--model_name', type=str, default='openai-gpt', help='pretrained model name')
+    parser.add_argument('--do_train', action='store_true', help='run the fine-tuning loop')
+    parser.add_argument('--do_eval', action='store_true', help='run the evaluation on --eval_dataset')
+    parser.add_argument('--output_dir', type=str, required=True,
+                        help='directory where the model weights and eval_results.txt are written')
+    parser.add_argument('--train_dataset', type=str, default='cloze_test_val__spring2016 - cloze_test_ALL_val.tsv')
+    parser.add_argument('--eval_dataset', type=str, default='test_spring2016.tsv')
+    parser.add_argument('--seed', type=int, default=42)
+    parser.add_argument('--num_train_epochs', type=int, default=3)
+    parser.add_argument('--train_batch_size', type=int, default=8)
+    parser.add_argument('--eval_batch_size', type=int, default=16)
+    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
+    parser.add_argument('--warmup_proportion', type=float, default=0.002)
+    parser.add_argument('--max_grad_norm', type=float, default=1)
+    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
+    parser.add_argument('--weight_decay', type=float, default=0.01)
+    parser.add_argument('--lm_coef', type=float, default=0.5)
+    parser.add_argument('--n_valid', type=int, default=374)
+    args = parser.parse_args()
+    print(args)
+
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+    torch.cuda.manual_seed_all(args.seed)
+
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    n_gpu = torch.cuda.device_count()
+    logger.info("device: {}, n_gpu {}".format(device, n_gpu))
+    os.makedirs(args.output_dir, exist_ok=True)
+
+    # Load tokenizer and model
+    # These loading functions also add new tokens and embeddings called `special tokens`
+    # These new embeddings will be fine-tuned on the RocStories dataset
+    special_tokens = ['_start_', '_delimiter_', '_classify_']
+    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name, special_tokens=special_tokens)
+    special_tokens_ids = list(tokenizer.convert_tokens_to_ids(token) for token in special_tokens)
+    model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.model_name, num_special_tokens=len(special_tokens))
+    model.to(device)  # the batches are moved to `device` below, so the weights have to live there too
+
+    # Load and encode the datasets
+    logger.info("Encoding dataset...")
+    train_dataset = load_rocstories_dataset(args.train_dataset)
+    eval_dataset = load_rocstories_dataset(args.eval_dataset)
+    # Tokenize and encode the text fields only: the last field of an instance is its integer label, kept as is
+    encoded_datasets = tuple([[tokenizer.convert_tokens_to_ids(tokenizer.tokenize(x)) for x in instance[:-1]] + [instance[-1]]
+                              for instance in dataset] for dataset in (train_dataset, eval_dataset))
+
+    # Compute the max input length for the Transformer
+    max_input_length = max(len(story) + max(len(cont1), len(cont2)) + 3  \
+                           for dataset in encoded_datasets for story, cont1, cont2, _ in dataset)
+    max_input_length = min(max_input_length, model.config.n_positions)  # Max size of input for the pre-trained model
+    max_sub_part_length = max_input_length // 2 - 2
+
+    # Prepare inputs tensors and dataloaders
+    tensor_datasets = pre_process_datasets(encoded_datasets, max_sub_part_length, *special_tokens_ids)
+    train_tensor_dataset, eval_tensor_dataset = tensor_datasets[0], tensor_datasets[1]
+
+    train_data = TensorDataset(*train_tensor_dataset)
+    train_sampler = RandomSampler(train_data)
+    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
+
+    eval_data = TensorDataset(*eval_tensor_dataset)
+    eval_sampler = SequentialSampler(eval_data)
+    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
+
+    # Prepare optimizer
+    param_optimizer = list(model.named_parameters())
+    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
+    optimizer_grouped_parameters = [
+        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
+        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+        ]
+    num_train_optimization_steps = len(train_data) * args.num_train_epochs // args.train_batch_size  # all epochs
+    optimizer = OpenAIAdam(optimizer_grouped_parameters,
+                           lr=args.learning_rate,
+                           warmup=args.warmup_proportion,
+                           max_grad_norm=args.max_grad_norm,
+                           weight_decay=args.weight_decay,
+                           t_total=num_train_optimization_steps)
+
+    if args.do_train:
+        tr_loss, nb_tr_steps = 0, 0
+        model.train()
+        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
+            tr_loss = 0
+            nb_tr_examples, nb_tr_steps = 0, 0
+            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
+                batch = tuple(t.to(device) for t in batch)
+                input_ids, mc_token_mask, lm_labels, mc_labels = batch
+                losses = model(input_ids, mc_token_mask, lm_labels, mc_labels)
+                loss = args.lm_coef * losses[0] + losses[1]
+                loss.backward()
+                optimizer.step()
+                optimizer.zero_grad()  # otherwise the gradients of successive batches accumulate
+                tr_loss += loss.item()
+                nb_tr_examples += input_ids.size(0)
+                nb_tr_steps += 1
+
+    # Save a trained model
+    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
+    output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
+    if args.do_train:
+        torch.save(model_to_save.state_dict(), output_model_file)
+
+    # Load a trained model that you have fine-tuned
+    model_state_dict = torch.load(output_model_file)
+    model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.model_name, state_dict=model_state_dict,
+                                                      num_special_tokens=len(special_tokens))
+    model.to(device)
+
+    if args.do_eval:
+        model.eval()
+        eval_loss, eval_accuracy = 0, 0
+        nb_eval_steps, nb_eval_examples = 0, 0
+        for batch in tqdm(eval_dataloader, desc="Evaluating"):
+            batch = tuple(t.to(device) for t in batch)
+            input_ids, mc_token_mask, lm_labels, mc_labels = batch
+            with torch.no_grad():
+                _, mc_loss = model(input_ids, mc_token_mask, lm_labels, mc_labels)
+                _, mc_logits = model(input_ids, mc_token_mask)
+
+            mc_logits = mc_logits.detach().cpu().numpy()
+            mc_labels = mc_labels.to('cpu').numpy()
+            tmp_eval_accuracy = accuracy(mc_logits, mc_labels)
+
+            eval_loss += mc_loss.mean().item()
+            eval_accuracy += tmp_eval_accuracy
+            nb_eval_examples += input_ids.size(0)
+            nb_eval_steps += 1
+
+        eval_loss = eval_loss / nb_eval_steps
+        eval_accuracy = eval_accuracy / nb_eval_examples
+        train_loss = tr_loss/nb_tr_steps if args.do_train else None
+        result = {'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'train_loss': train_loss}
+
+        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
+        with open(output_eval_file, "w") as writer:
+            logger.info("***** Eval results *****")
+            for key in sorted(result.keys()):
+                logger.info("  %s = %s", key, str(result[key]))
+                writer.write("%s = %s\n" % (key, str(result[key])))
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/run_transfo_xl.py b/examples/run_transfo_xl.py
new file mode 100644
index 0000000000..1218a1f547
--- /dev/null
+++ b/examples/run_transfo_xl.py
@@ -0,0 +1,142 @@
+# coding=utf-8
+# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HugginFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch Transformer XL model evaluation script.
+    Adapted from https://github.com/kimiyoung/transformer-xl.
+    In particular https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/eval.py
+
+    This script with default values evaluates a pretrained Transformer-XL on WikiText 103
+"""
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import argparse
+import logging
+import time
+import math
+
+import torch
+
+from pytorch_pretrained_bert import TransfoXLModel, TransfoXLCorpus
+
+logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
+                    datefmt = '%m/%d/%Y %H:%M:%S',
+                    level = logging.INFO)
+logger = logging.getLogger(__name__)
+
+def main():
+    """ Evaluate a pretrained Transformer-XL on the split(s) selected with --split and log loss and perplexity """
+    parser = argparse.ArgumentParser(description='PyTorch Transformer Language Model')
+    parser.add_argument('--model_name', type=str, default='transfo-xl-wt103',
+                        help='pretrained model name')
+    parser.add_argument('--split', type=str, default='test',
+                        choices=['all', 'valid', 'test'],
+                        help='which split to evaluate')
+    parser.add_argument('--batch_size', type=int, default=10,
+                        help='batch size')
+    parser.add_argument('--tgt_len', type=int, default=128,
+                        help='number of tokens to predict')
+    parser.add_argument('--ext_len', type=int, default=0,
+                        help='length of the extended context')
+    parser.add_argument('--mem_len', type=int, default=1600,
+                        help='length of the retained previous heads')
+    parser.add_argument('--clamp_len', type=int, default=1000,
+                        help='max positional embedding index')
+    parser.add_argument('--cuda', action='store_true',
+                        help='use CUDA')
+    parser.add_argument('--work_dir', type=str, required=True,
+                        help='path to the work_dir')
+    parser.add_argument('--no_log', action='store_true',
+                        help='do not log the eval result')
+    parser.add_argument('--same_length', action='store_true',
+                        help='set same length attention with masking')
+    args = parser.parse_args()
+    assert args.ext_len >= 0, 'extended context length must be non-negative'
+
+    device = torch.device("cuda" if args.cuda else "cpu")
+
+    # Load a pre-processed dataset
+    # You can also build the corpus yourself using TransfoXLCorpus methods
+    # The pre-processing involves computing word frequencies to prepare the Adaptive input and SoftMax
+    # and tokenizing the dataset
+    # The pre-processed corpus is a conversion (using the conversion script)
+    corpus = TransfoXLCorpus.from_pretrained(args.model_name)
+
+    va_iter = corpus.get_iterator('valid', args.batch_size, args.tgt_len,
+        device=device, ext_len=args.ext_len)
+    te_iter = corpus.get_iterator('test', args.batch_size, args.tgt_len,
+        device=device, ext_len=args.ext_len)
+
+    # Load a pre-trained model
+    model = TransfoXLModel.from_pretrained(args.model_name).to(device)
+
+    logger.info('Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}'.format(
+        args.batch_size, args.tgt_len, args.ext_len, args.mem_len, args.clamp_len))
+
+    model.reset_length(args.tgt_len, args.ext_len, args.mem_len)
+    if args.clamp_len > 0:
+        model.clamp_len = args.clamp_len
+    if args.same_length:
+        model.same_length = True
+
+    # Evaluation code
+    def evaluate(eval_iter):
+        """ Return the seq_len-weighted average of the mean segment losses over `eval_iter` """
+        # Turn on evaluation mode which disables dropout.
+        model.eval()
+        total_len, total_loss, n_segments = 0, 0., 0
+        start_time = time.time()
+        with torch.no_grad():
+            mems = tuple()
+            for data, target, seq_len in eval_iter:
+                ret = model(data, target, *mems)
+                loss, mems = ret
+                loss = loss.mean()
+                total_loss += seq_len * loss.item()
+                total_len += seq_len
+                n_segments += 1
+            total_time = time.time() - start_time
+        if total_len == 0:  # empty iterator: fail clearly instead of dividing by zero below
+            raise ValueError('nothing to evaluate: the data iterator produced no tokens')
+        logger.info('Time : {:.2f}s, {:.2f}ms/segment'.format(
+                total_time, 1000 * total_time / n_segments))
+        return total_loss / total_len
+
+    # Run on the requested split(s).
+    if args.split == 'all':
+        test_loss = evaluate(te_iter)
+        valid_loss = evaluate(va_iter)
+    elif args.split == 'valid':
+        valid_loss = evaluate(va_iter)
+        test_loss = None
+    elif args.split == 'test':
+        test_loss = evaluate(te_iter)
+        valid_loss = None
+
+    def format_log(loss, split):
+        return '| {0} loss {1:5.2f} | {0} ppl {2:9.3f} '.format(
+            split, loss, math.exp(loss))
+
+    log_str = ''
+    if valid_loss is not None:
+        log_str += format_log(valid_loss, 'valid')
+    if test_loss is not None:
+        log_str += format_log(test_loss, 'test')
+
+    logger.info('=' * 100)
+    logger.info(log_str)
+    logger.info('=' * 100)
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/examples/train_openai_gpt.py b/examples/train_openai_gpt.py
deleted file mode 100644
index 7a3dd90988..0000000000
--- a/examples/train_openai_gpt.py
+++ /dev/null
@@ -1,344 +0,0 @@
-# coding=utf-8
-# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HugginFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" OpenAI GPT model fine-tuning script.
-    Adapted from https://github.com/huggingface/pytorch-openai-transformer-lm/blob/master/train.py
-    It self adapted from https://github.com/openai/finetune-transformer-lm/blob/master/train.py
-
-    This script with default values fine-tunes and evaluate a pretrained OpenAI GPT on the RocStories dataset
-"""
-import argparse
-import os
-import csv
-import random
-import logging
-from tqdm import tqdm
-
-import numpy as np
-import torch
-from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
-                              TensorDataset)
-from sklearn.metrics import accuracy_score
-from sklearn.utils import shuffle
-
-from pytorch_pretrained_bert import OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer, OpenAIAdam
-
-# from analysis import rocstories as rocstories_analysis
-# from datasets import rocstories
-# from model_pytorch import DoubleHeadModel, load_openai_pretrained_model
-# from opt import OpenAIAdam
-# from text_utils import TextEncoder
-# from utils import (encode_dataset, iter_data,
-#                    ResultLogger, make_path)
-# from loss import MultipleChoiceLossCompute
-
-logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
-                    datefmt = '%m/%d/%Y %H:%M:%S',
-                    level = logging.INFO)
-logger = logging.getLogger(__name__)
-
-def iter_apply(Xs, Ms, Ys):
-    # fns = [lambda x: np.concatenate(x, 0), lambda x: float(np.sum(x))]
-    logits = []
-    cost = 0
-    with torch.no_grad():
-        dh_model.eval()
-        for xmb, mmb, ymb in iter_data(Xs, Ms, Ys, n_batch=n_batch_train, truncate=False, verbose=True):
-            n = len(xmb)
-            XMB = torch.tensor(xmb, dtype=torch.long).to(device)
-            YMB = torch.tensor(ymb, dtype=torch.long).to(device)
-            MMB = torch.tensor(mmb).to(device)
-            _, clf_logits = dh_model(XMB)
-            clf_logits *= n
-            clf_losses = compute_loss_fct(XMB, YMB, MMB, clf_logits, only_return_losses=True)
-            clf_losses *= n
-            logits.append(clf_logits.to("cpu").numpy())
-            cost += clf_losses.sum().item()
-        logits = np.concatenate(logits, 0)
-    return logits, cost
-
-
-def iter_predict(Xs, Ms):
-    logits = []
-    with torch.no_grad():
-        dh_model.eval()
-        for xmb, mmb in iter_data(Xs, Ms, n_batch=n_batch_train, truncate=False, verbose=True):
-            n = len(xmb)
-            XMB = torch.tensor(xmb, dtype=torch.long).to(device)
-            MMB = torch.tensor(mmb).to(device)
-            _, clf_logits = dh_model(XMB)
-            logits.append(clf_logits.to("cpu").numpy())
-    logits = np.concatenate(logits, 0)
-    return logits
-
-
-def log(save_dir, desc):
-    global best_score
-    print("Logging")
-    tr_logits, tr_cost = iter_apply(trX[:n_valid], trM[:n_valid], trY[:n_valid])
-    va_logits, va_cost = iter_apply(vaX, vaM, vaY)
-    tr_cost = tr_cost / len(trY[:n_valid])
-    va_cost = va_cost / n_valid
-    tr_acc = accuracy_score(trY[:n_valid], np.argmax(tr_logits, 1)) * 100.
-    va_acc = accuracy_score(vaY, np.argmax(va_logits, 1)) * 100.
-    logger.log(n_epochs=n_epochs, n_updates=n_updates, tr_cost=tr_cost, va_cost=va_cost, tr_acc=tr_acc, va_acc=va_acc)
-    print('%d %d %.3f %.3f %.2f %.2f' % (n_epochs, n_updates, tr_cost, va_cost, tr_acc, va_acc))
-    if submit:
-        score = va_acc
-        if score > best_score:
-            best_score = score
-            path = os.path.join(save_dir, desc, 'best_params')
-            torch.save(dh_model.state_dict(), make_path(path))
-
-
-def predict(dataset, submission_dir):
-    filename = filenames[dataset]
-    pred_fn = pred_fns[dataset]
-    label_decoder = label_decoders[dataset]
-    predictions = pred_fn(iter_predict(teX, teM))
-    if label_decoder is not None:
-        predictions = [label_decoder[prediction] for prediction in predictions]
-    path = os.path.join(submission_dir, filename)
-    os.makedirs(os.path.dirname(path), exist_ok=True)
-    with open(path, 'w') as f:
-        f.write('{}\t{}\n'.format('index', 'prediction'))
-        for i, prediction in enumerate(predictions):
-            f.write('{}\t{}\n'.format(i, prediction))
-
-
-def run_epoch():
-    for xmb, mmb, ymb in iter_data(*shuffle(trX, trM, trYt, random_state=np.random),
-                                   n_batch=n_batch_train, truncate=True, verbose=True):
-        global n_updates
-        dh_model.train()
-        XMB = torch.tensor(xmb, dtype=torch.long).to(device)
-        YMB = torch.tensor(ymb, dtype=torch.long).to(device)
-        MMB = torch.tensor(mmb).to(device)
-        lm_logits, clf_logits = dh_model(XMB)
-        compute_loss_fct(XMB, YMB, MMB, clf_logits, lm_logits)
-        n_updates += 1
-        if n_updates in [1000, 2000, 4000, 8000, 16000, 32000] and n_epochs == 0:
-            log(save_dir, desc)
-
-
-def accuracy(out, labels):
-    outputs = np.argmax(out, axis=1)
-    return np.sum(outputs == labels)
-
-def load_rocstories_dataset(dataset_path):
-    """ Output a list of tuples(story, 1st continuation, 2nd continuation, label) """
-    with open(dataset_path, encoding='utf_8') as f:
-        f = csv.reader(f)
-        output = []
-        next(f) # skip the first line
-        for line in tqdm(f):
-            output.append((' '.join(line[1:5]), line[5], line[6], int(line[-1])-1))
-    return output
-
-def pre_process_dataset(encoded_dataset, max_len, start_token, delimiter_token, clf_token):
-    n_batch = len(dataset)
-    input_ids = np.zeros((n_batch, 2, max_len), dtype=np.int32)
-    mc_token_mask = np.zeros((n_batch, 2, max_len), dtype=np.int32)
-    lm_labels = np.full((n_batch, 2, max_len), -1, dtype=np.float32)
-    mc_labels = np.zeros((n_batch,), dtype=np.float32)
-    for i, (story, cont1, cont2, mc_label), in enumerate(encoded_dataset):
-        with_cont1 = [start_token] + story[:max_len] + [delimiter_token] + cont1[:max_len] + [clf_token]
-        with_cont2 = [start_token] + story[:max_len] + [delimiter_token] + cont2[:max_len] + [clf_token]
-        xmb[i, 0, :len(with_cont1)] = with_cont1
-        xmb[i, 1, :len(with_cont2)] = with_cont2
-        mc_token_mask[i, 0, len(with_cont1) - 1] = 1
-        lm_labels[i, 0, :len(with_cont1)-1] = with_cont1[1:]
-        lm_labels[i, 1, :len(with_cont2)-1] = with_cont2[1:]
-        mc_labels[i] = mc_label
-    all_inputs = (input_ids, mc_token_mask, lm_labels, mc_labels)
-    all_input_tensors = list(torch.tensor(t) for t in all_inputs)
-    return all_input_tensors
-
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--model_name', type=str, default='openai-gpt',
-                        help='pretrained model name')
-    parser.add_argument('--data_dir', type=str, default='data/')
-    parser.add_argument('--seed', type=int, default=42)
-    parser.add_argument('--num_train_epochs', type=int, default=3)
-    parser.add_argument('--train_batch_size', type=int, default=8)
-    parser.add_argument('--max_grad_norm', type=int, default=1)
-    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
-    parser.add_argument('--warmup_proportion', type=float, default=0.002)
-    parser.add_argument('--max_grad_norm', type=float, default=1)
-    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
-    parser.add_argument('--weight_decay', type=float, default=0.01)
-    parser.add_argument('--lm_coef', type=float, default=0.5)
-    parser.add_argument('--n_valid', type=int, default=374)
-    args = parser.parse_args()
-    print(args)
-
-    random.seed(args.seed)
-    np.random.seed(args.seed)
-    torch.manual_seed(args.seed)
-    torch.cuda.manual_seed_all(args.seed)
-
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    n_gpu = torch.cuda.device_count()
-    logger.info("device", device, "n_gpu", n_gpu)
-
-    # Load tokenizer and model
-    # This loading functions also add new tokens and embeddings called `special tokens`
-    # These new embeddings will be fine-tuned on the RocStories dataset
-    special_tokens = ['_start_', '_delimiter_', '_classify_']
-    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name, special_tokens=special_tokens)
-    special_tokens_ids = list(tokenizer.convert_tokens_to_ids(token) for token in special_tokens)
-    model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.model_name, num_special_tokens=len(special_tokens))
-
-    # Load the dataset and prepare the inputs
-    logger.info("Encoding dataset...")
-    dataset = load_rocstories_dataset(args.dataset_path)
-    tokenized_dataset = list(list(tokenizer.tokenize(x) for x in instance) for instance in dataset)
-    encoded_dataset = list(list(tokenizer.convert_tokens_to_ids(x) for x in instance) for instance in tokenized_dataset)
-
-    max_input_length = max(len(story)+max(len(cont1), len(cont2))+3 for story, cont1, cont2, _ in encoded_dataset)
-    max_input_length = min(max_input_length, model.config.n_positions)  # Max size of input for the pre-trained model
-    max_sub_part_length = max_input_length // 2 - 2
-
-    # Prepare dataloader
-    dataset_tensors = pre_process_dataset(encoded_dataset, max_sub_part_length, *special_tokens_ids)
-    train_data = TensorDataset(*dataset_tensors)
-    train_sampler = RandomSampler(train_data)
-    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
-
-    # Prepare optimizer
-    param_optimizer = list(model.named_parameters())
-    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
-    optimizer_grouped_parameters = [
-        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
-        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-        ]
-    num_train_optimization_steps = len(train_data) // args.train_batch_size
-    optimizer = OpenAIAdam(optimizer_grouped_parameters,
-                           lr=args.learning_rate,
-                           warmup=args.warmup_proportion,
-                           max_grad_norm=args.max_grad_norm,
-                           weight_decay=arsg.weight_decay,
-                           t_total=num_train_optimization_steps)
-
-    if args.do_train:
-        global_step = 0
-        nb_tr_steps = 0
-        tr_loss = 0
-        model.train()
-        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
-            tr_loss = 0
-            nb_tr_examples, nb_tr_steps = 0, 0
-            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
-                batch = tuple(t.to(device) for t in batch)
-                input_ids, mc_token_mask, lm_labels, mc_labels = batch
-                losses = model(input_ids, mc_token_mask, lm_labels, mc_labels)
-                loss = args.lm_coef * losses[0] + losses[1]
-                loss.backward()
-                tr_loss += loss.item()
-                nb_tr_examples += input_ids.size(0)
-                nb_tr_steps += 1
-
-    # Save a trained model
-    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
-    output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
-    if args.do_train:
-        torch.save(model_to_save.state_dict(), output_model_file)
-
-    # Load a trained model that you have fine-tuned
-    model_state_dict = torch.load(output_model_file)
-    model = OpenAIGPTDoubleHeadsModel(args.mode, state_dict=model_state_dict, num_labels=num_labels)
-    model.to(device)
-
-    if args.do_eval:
-        eval_examples = processor.get_dev_examples(args.data_dir)
-        eval_features = convert_examples_to_features(
-            eval_examples, label_list, args.max_seq_length, tokenizer)
-        logger.info("***** Running evaluation *****")
-        logger.info("  Num examples = %d", len(eval_examples))
-        logger.info("  Batch size = %d", args.eval_batch_size)
-        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
-        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
-        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
-        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
-        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
-        # Run prediction for full data
-        eval_sampler = SequentialSampler(eval_data)
-        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
-
-        model.eval()
-        eval_loss, eval_accuracy = 0, 0
-        nb_eval_steps, nb_eval_examples = 0, 0
- 
-        for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
-            input_ids = input_ids.to(device)
-            input_mask = input_mask.to(device)
-            segment_ids = segment_ids.to(device)
-            label_ids = label_ids.to(device)
-
-            with torch.no_grad():
-                tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids)
-                logits = model(input_ids, segment_ids, input_mask)
-
-            logits = logits.detach().cpu().numpy()
-            label_ids = label_ids.to('cpu').numpy()
-            tmp_eval_accuracy = accuracy(logits, label_ids)
-
-            eval_loss += tmp_eval_loss.mean().item()
-            eval_accuracy += tmp_eval_accuracy
-
-            nb_eval_examples += input_ids.size(0)
-            nb_eval_steps += 1
-
-        eval_loss = eval_loss / nb_eval_steps
-        eval_accuracy = eval_accuracy / nb_eval_examples
-        loss = tr_loss/nb_tr_steps if args.do_train else None
-        result = {'eval_loss': eval_loss,
-                  'eval_accuracy': eval_accuracy,
-                  'global_step': global_step,
-                  'loss': loss}
-
-        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
-        with open(output_eval_file, "w") as writer:
-            logger.info("***** Eval results *****")
-            for key in sorted(result.keys()):
-                logger.info("  %s = %s", key, str(result[key]))
-                writer.write("%s = %s\n" % (key, str(result[key])))
-
-if __name__ == "__main__":
-    main()
-
-    n_updates = 0
-    n_epochs = 0
-    if dataset != 'stsb':
-        trYt = trY
-    if submit:
-        path = os.path.join(save_dir, desc, 'best_params')
-        torch.save(dh_model.state_dict(), make_path(path))
-    best_score = 0
-    for i in range(args.n_iter):
-        print("running epoch", i)
-        run_epoch()
-        n_epochs += 1
-        log(save_dir, desc)
-    if submit:
-        path = os.path.join(save_dir, desc, 'best_params')
-        dh_model.load_state_dict(torch.load(path))
-        predict(dataset, args.submission_dir)
-        if args.analysis:
-            rocstories_analysis(data_dir, os.path.join(args.submission_dir, 'ROCStories.tsv'),
-                                os.path.join(log_dir, 'rocstories.jsonl'))
diff --git a/examples/train_transfo_xl.py b/examples/train_transfo_xl.py
deleted file mode 100644
index 6ea0920489..0000000000
--- a/examples/train_transfo_xl.py
+++ /dev/null
@@ -1,595 +0,0 @@
-# coding=utf-8
-# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HugginFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" PyTorch Transformer XL model training script.
-    Adapted from https://github.com/kimiyoung/transformer-xl.
-    In particular https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/eval.py
-
-    This script with default values train a Transformer-XL on WikiText 103
-"""
-from __future__ import absolute_import, division, print_function, unicode_literals
-
-import os
-import functools
-import argparse
-import logging
-import time
-import math
-import sys
-from io import open
-import itertools
-
-import numpy as np
-import torch
-import torch.nn as nn
-import torch.optim as optim
-
-from pytorch_pretrained_bert import TransfoXLModel, TransfoXLConfig
-from pytorch_pretrained_bert.tokenization_transfo_xl import get_lm_corpus
-
-logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
-                    datefmt = '%m/%d/%Y %H:%M:%S',
-                    level = logging.INFO)
-logger = logging.getLogger(__name__)
-
-
-parser = argparse.ArgumentParser(description='PyTorch Transformer Language Model')
-parser.add_argument('--data', type=str, default='../data/wikitext-103',
-                    help='location of the data corpus')
-parser.add_argument('--dataset', type=str, default='wt103',
-                    choices=['wt103', 'lm1b', 'enwik8', 'text8'],
-                    help='dataset name')
-parser.add_argument('--n_layer', type=int, default=12,
-                    help='number of total layers')
-parser.add_argument('--n_head', type=int, default=10,
-                    help='number of heads')
-parser.add_argument('--d_head', type=int, default=50,
-                    help='head dimension')
-parser.add_argument('--d_embed', type=int, default=-1,
-                    help='embedding dimension')
-parser.add_argument('--d_model', type=int, default=500,
-                    help='model dimension')
-parser.add_argument('--d_inner', type=int, default=1000,
-                    help='inner dimension in FF')
-parser.add_argument('--dropout', type=float, default=0.0,
-                    help='global dropout rate')
-parser.add_argument('--dropatt', type=float, default=0.0,
-                    help='attention probability dropout rate')
-parser.add_argument('--init', default='normal', type=str,
-                    help='parameter initializer to use.')
-parser.add_argument('--emb_init', default='normal', type=str,
-                    help='parameter initializer to use.')
-parser.add_argument('--init_range', type=float, default=0.1,
-                    help='parameters initialized by U(-init_range, init_range)')
-parser.add_argument('--emb_init_range', type=float, default=0.01,
-                    help='parameters initialized by U(-init_range, init_range)')
-parser.add_argument('--init_std', type=float, default=0.02,
-                    help='parameters initialized by N(0, init_std)')
-parser.add_argument('--proj_init_std', type=float, default=0.01,
-                    help='parameters initialized by N(0, init_std)')
-parser.add_argument('--optim', default='adam', type=str,
-                    choices=['adam', 'sgd', 'adagrad'],
-                    help='optimizer to use.')
-parser.add_argument('--lr', type=float, default=0.00025,
-                    help='initial learning rate (0.00025|5 for adam|sgd)')
-parser.add_argument('--mom', type=float, default=0.0,
-                    help='momentum for sgd')
-parser.add_argument('--scheduler', default='cosine', type=str,
-                    choices=['cosine', 'inv_sqrt', 'dev_perf', 'constant'],
-                    help='lr scheduler to use.')
-parser.add_argument('--warmup_step', type=int, default=0,
-                    help='upper epoch limit')
-parser.add_argument('--decay_rate', type=float, default=0.5,
-                    help='decay factor when ReduceLROnPlateau is used')
-parser.add_argument('--lr_min', type=float, default=0.0,
-                    help='minimum learning rate during annealing')
-parser.add_argument('--clip', type=float, default=0.25,
-                    help='gradient clipping')
-parser.add_argument('--clip_nonemb', action='store_true',
-                    help='only clip the gradient of non-embedding params')
-parser.add_argument('--max_step', type=int, default=100000,
-                    help='upper epoch limit')
-parser.add_argument('--batch_size', type=int, default=60,
-                    help='batch size')
-parser.add_argument('--batch_chunk', type=int, default=1,
-                    help='split batch into chunks to save memory')
-parser.add_argument('--tgt_len', type=int, default=70,
-                    help='number of tokens to predict')
-parser.add_argument('--eval_tgt_len', type=int, default=50,
-                    help='number of tokens to predict for evaluation')
-parser.add_argument('--ext_len', type=int, default=0,
-                    help='length of the extended context')
-parser.add_argument('--mem_len', type=int, default=0,
-                    help='length of the retained previous heads')
-parser.add_argument('--not_tied', action='store_true',
-                    help='do not tie the word embedding and softmax weights')
-parser.add_argument('--seed', type=int, default=1111,
-                    help='random seed')
-parser.add_argument('--cuda', action='store_true',
-                    help='use CUDA')
-parser.add_argument('--adaptive', action='store_true',
-                    help='use adaptive softmax')
-parser.add_argument('--div_val', type=int, default=1,
-                    help='divident value for adapative input and softmax')
-parser.add_argument('--pre_lnorm', action='store_true',
-                    help='apply LayerNorm to the input instead of the output')
-parser.add_argument('--varlen', action='store_true',
-                    help='use variable length')
-parser.add_argument('--multi_gpu', action='store_true',
-                    help='use multiple GPU')
-parser.add_argument('--log-interval', type=int, default=200,
-                    help='report interval')
-parser.add_argument('--eval-interval', type=int, default=4000,
-                    help='evaluation interval')
-parser.add_argument('--work_dir', default='LM-TFM', type=str,
-                    help='experiment directory.')
-parser.add_argument('--restart', action='store_true',
-                    help='restart training from the saved checkpoint')
-parser.add_argument('--restart_dir', type=str, default='',
-                    help='restart dir')
-parser.add_argument('--debug', action='store_true',
-                    help='run in debug mode (do not create exp dir)')
-parser.add_argument('--same_length', action='store_true',
-                    help='use the same attn length for all tokens')
-parser.add_argument('--attn_type', type=int, default=0,
-                    help='attention type. 0 for ours, 1 for Shaw et al,'
-                    '2 for Vaswani et al, 3 for Al Rfou et al.')
-parser.add_argument('--clamp_len', type=int, default=-1,
-                    help='use the same pos embeddings after clamp_len')
-parser.add_argument('--eta_min', type=float, default=0.0,
-                    help='min learning rate for cosine scheduler')
-parser.add_argument('--gpu0_bsz', type=int, default=-1,
-                    help='batch size on gpu 0')
-parser.add_argument('--max_eval_steps', type=int, default=-1,
-                    help='max eval steps')
-parser.add_argument('--sample_softmax', type=int, default=-1,
-                    help='number of samples in sampled softmax')
-parser.add_argument('--patience', type=int, default=0,
-                    help='patience')
-parser.add_argument('--finetune_v2', action='store_true',
-                    help='finetune v2')
-parser.add_argument('--finetune_v3', action='store_true',
-                    help='finetune v3')
-parser.add_argument('--fp16', action='store_true',
-                    help='Run in pseudo-fp16 mode (fp16 storage fp32 math).')
-parser.add_argument('--static-loss-scale', type=float, default=1,
-                    help='Static loss scale, positive power of 2 values can '
-                    'improve fp16 convergence.')
-parser.add_argument('--dynamic-loss-scale', action='store_true',
-                    help='Use dynamic loss scaling.  If supplied, this argument'
-                    ' supersedes --static-loss-scale.')
-args = parser.parse_args()
-args.tied = not args.not_tied
-
-if args.d_embed < 0:
-    args.d_embed = args.d_model
-
-assert args.ext_len >= 0, 'extended context length must be non-negative'
-assert args.batch_size % args.batch_chunk == 0
-
-args.work_dir = '{}-{}'.format(args.work_dir, args.dataset)
-args.work_dir = os.path.join(args.work_dir, time.strftime('%Y%m%d-%H%M%S'))
-# logging = create_exp_dir(args.work_dir,
-#     scripts_to_save=['train.py', 'mem_transformer.py'], debug=args.debug)
-
-# Set the random seed manually for reproducibility.
-np.random.seed(args.seed)
-torch.manual_seed(args.seed)
-if torch.cuda.is_available():
-    if not args.cuda:
-        print('WARNING: You have a CUDA device, so you should probably run with --cuda')
-    else:
-        torch.cuda.manual_seed_all(args.seed)
-
-# Validate `--fp16` option
-if args.fp16:
-    if not args.cuda:
-        print('WARNING: --fp16 requires --cuda, ignoring --fp16 option')
-        args.fp16 = False
-    else:
-        try:
-            from apex.fp16_utils import FP16_Optimizer
-        except ImportError:
-            print('WARNING: apex not installed, ignoring --fp16 option')
-            args.fp16 = False
-
-device = torch.device('cuda' if args.cuda else 'cpu')
-
-###############################################################################
-# Load data
-###############################################################################
-corpus = get_lm_corpus(args.data, args.dataset)
-ntokens = len(corpus.vocab)
-args.n_token = ntokens
-
-eval_batch_size = 10
-tr_iter = corpus.get_iterator('train', args.batch_size, args.tgt_len,
-    device=device, ext_len=args.ext_len)
-va_iter = corpus.get_iterator('valid', eval_batch_size, args.eval_tgt_len,
-    device=device, ext_len=args.ext_len)
-te_iter = corpus.get_iterator('test', eval_batch_size, args.eval_tgt_len,
-    device=device, ext_len=args.ext_len)
-
-# adaptive softmax / embedding
-cutoffs = []
-if args.adaptive:
-    assert args.dataset in ['wt103', 'lm1b']
-    if args.dataset == 'wt103':
-        cutoffs = [20000, 40000, 200000]
-        proj_share_all_but_first = True
-    elif args.dataset == 'lm1b':
-        cutoffs = [60000, 100000, 640000]
-        proj_share_all_but_first = False
-
-###############################################################################
-# Build the model
-###############################################################################
-def init_weight(weight):
-    if args.init == 'uniform':
-        nn.init.uniform_(weight, -args.init_range, args.init_range)
-    elif args.init == 'normal':
-        nn.init.normal_(weight, 0.0, args.init_std)
-
-def init_bias(bias):
-    nn.init.constant_(bias, 0.0)
-
-def weights_init(m):
-    classname = m.__class__.__name__
-    if classname.find('Linear') != -1:
-        if hasattr(m, 'weight') and m.weight is not None:
-            init_weight(m.weight)
-        if hasattr(m, 'bias') and m.bias is not None:
-            init_bias(m.bias)
-    elif classname.find('AdaptiveEmbedding') != -1:
-        if hasattr(m, 'emb_projs'):
-            for i in range(len(m.emb_projs)):
-                if m.emb_projs[i] is not None:
-                    nn.init.normal_(m.emb_projs[i], 0.0, args.proj_init_std)
-    elif classname.find('Embedding') != -1:
-        if hasattr(m, 'weight'):
-            init_weight(m.weight)
-    elif classname.find('ProjectedAdaptiveLogSoftmax') != -1:
-        if hasattr(m, 'cluster_weight') and m.cluster_weight is not None:
-            init_weight(m.cluster_weight)
-        if hasattr(m, 'cluster_bias') and m.cluster_bias is not None:
-            init_bias(m.cluster_bias)
-        if hasattr(m, 'out_projs'):
-            for i in range(len(m.out_projs)):
-                if m.out_projs[i] is not None:
-                    nn.init.normal_(m.out_projs[i], 0.0, args.proj_init_std)
-    elif classname.find('LayerNorm') != -1:
-        if hasattr(m, 'weight'):
-            nn.init.normal_(m.weight, 1.0, args.init_std)
-        if hasattr(m, 'bias') and m.bias is not None:
-            init_bias(m.bias)
-    elif classname.find('TransformerLM') != -1:
-        if hasattr(m, 'r_emb'):
-            init_weight(m.r_emb)
-        if hasattr(m, 'r_w_bias'):
-            init_weight(m.r_w_bias)
-        if hasattr(m, 'r_r_bias'):
-            init_weight(m.r_r_bias)
-        if hasattr(m, 'r_bias'):
-            init_bias(m.r_bias)
-
-def update_dropout(m):
-    classname = m.__class__.__name__
-    if classname.find('Dropout') != -1:
-        if hasattr(m, 'p'):
-            m.p = args.dropout
-
-def update_dropatt(m):
-    if hasattr(m, 'dropatt'):
-        m.dropatt.p = args.dropatt
-
-if args.restart:
-    with open(os.path.join(args.restart_dir, 'model.pt'), 'rb') as f:
-        model = torch.load(f)
-    if not args.fp16:
-        model = model.float()
-    model.apply(update_dropout)
-    model.apply(update_dropatt)
-else:
-    config = TransfoXLConfig(ntokens, n_layer=args.n_layer, n_head=args.n_head,
-        d_model=args.d_model, d_head=args.d_head, d_inner=args.d_inner,
-        dropout=args.dropout, dropatt=args.dropatt,
-        tie_weight=args.tied, d_embed=args.d_embed, div_val=args.div_val,
-        proj_share_all_but_first=proj_share_all_but_first,
-        pre_lnorm=args.pre_lnorm, tgt_len=args.tgt_len,
-        ext_len=args.ext_len, mem_len=args.mem_len, cutoffs=cutoffs,
-        same_length=args.same_length, attn_type=args.attn_type,
-        clamp_len=args.clamp_len, sample_softmax=args.sample_softmax)
-    model = TransfoXLModel(config)
-    model.apply(weights_init)
-    model.word_emb.apply(weights_init) # ensure embedding init is not overridden by out_layer in case of weight sharing
-args.n_all_param = sum([p.nelement() for p in model.parameters()])
-args.n_nonemb_param = sum([p.nelement() for p in model.layers.parameters()])
-
-if args.fp16:
-    model = model.half()
-
-if args.multi_gpu:
-    model = model.to(device)
-    if args.gpu0_bsz >= 0:
-        raise NotImplementedError
-        # para_model = BalancedDataParallel(args.gpu0_bsz // args.batch_chunk,
-        #                                   model, dim=1).to(device)
-    else:
-        para_model = nn.DataParallel(model, dim=1).to(device)
-else:
-    para_model = model.to(device)
-
-#### optimizer
-if args.optim.lower() == 'sgd':
-    if args.sample_softmax > 0:
-        dense_params, sparse_params = [], []
-        for param in model.parameters():
-            if param.size() == model.word_emb.weight.size():
-                sparse_params.append(param)
-            else:
-                dense_params.append(param)
-        optimizer_sparse = optim.SGD(sparse_params, lr=args.lr * 2)
-        optimizer = optim.SGD(dense_params, lr=args.lr, momentum=args.mom)
-    else:
-        optimizer = optim.SGD(model.parameters(), lr=args.lr,
-            momentum=args.mom)
-elif args.optim.lower() == 'adam':
-    if args.sample_softmax > 0:
-        dense_params, sparse_params = [], []
-        for param in model.parameters():
-            if param.size() == model.word_emb.weight.size():
-                sparse_params.append(param)
-            else:
-                dense_params.append(param)
-        optimizer_sparse = optim.SparseAdam(sparse_params, lr=args.lr)
-        optimizer = optim.Adam(dense_params, lr=args.lr)
-    else:
-        optimizer = optim.Adam(model.parameters(), lr=args.lr)
-elif args.optim.lower() == 'adagrad':
-    optimizer = optim.Adagrad(model.parameters(), lr=args.lr)
-
-#### scheduler
-if args.scheduler == 'cosine':
-    # here we do not set eta_min to lr_min to be backward compatible
-    # because in previous versions eta_min is default to 0
-    # rather than the default value of lr_min 1e-6
-    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
-        args.max_step, eta_min=args.eta_min) # should use eta_min arg
-    if args.sample_softmax > 0:
-        scheduler_sparse = optim.lr_scheduler.CosineAnnealingLR(optimizer_sparse,
-            args.max_step, eta_min=args.eta_min) # should use eta_min arg
-elif args.scheduler == 'inv_sqrt':
-    # originally used for Transformer (in Attention is all you need)
-    def lr_lambda(step):
-        # return a multiplier instead of a learning rate
-        if step == 0 and args.warmup_step == 0:
-            return 1.
-        else:
-            return 1. / (step ** 0.5) if step > args.warmup_step \
-                   else step / (args.warmup_step ** 1.5)
-    scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_lambda)
-elif args.scheduler == 'dev_perf':
-    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
-        factor=args.decay_rate, patience=args.patience, min_lr=args.lr_min)
-    if args.sample_softmax > 0:
-        scheduler_sparse = optim.lr_scheduler.ReduceLROnPlateau(optimizer_sparse,
-            factor=args.decay_rate, patience=args.patience, min_lr=args.lr_min)
-elif args.scheduler == 'constant':
-    pass
-
-if args.cuda and args.fp16:
-    # If args.dynamic_loss_scale is False, static_loss_scale will be used.
-    # If args.dynamic_loss_scale is True, it will take precedence over static_loss_scale.
-    optimizer = FP16_Optimizer(optimizer,
-                               static_loss_scale = args.static_loss_scale,
-                               dynamic_loss_scale = args.dynamic_loss_scale,
-                               dynamic_loss_args = {'init_scale': 2 ** 16})
-
-if args.restart:
-    if os.path.exists(os.path.join(args.restart_dir, 'optimizer.pt')):
-        with open(os.path.join(args.restart_dir, 'optimizer.pt'), 'rb') as f:
-            opt_state_dict = torch.load(f)
-            optimizer.load_state_dict(opt_state_dict)
-    else:
-        print('Optimizer was not saved. Start from scratch.')
-
-logger.info('=' * 100)
-for k, v in args.__dict__.items():
-    logger.info('    - {} : {}'.format(k, v))
-logger.info('=' * 100)
-logger.info('#params = {}'.format(args.n_all_param))
-logger.info('#non emb params = {}'.format(args.n_nonemb_param))
-
-###############################################################################
-# Training code
-###############################################################################
-
-def evaluate(eval_iter):
-    # Turn on evaluation mode which disables dropout.
-    model.eval()
-
-    # If the model does not use memory at all, make the ext_len longer.
-    # Otherwise, make the mem_len longer and keep the ext_len the same.
-    if args.mem_len == 0:
-        model.reset_length(args.eval_tgt_len,
-            args.ext_len+args.tgt_len-args.eval_tgt_len, args.mem_len)
-    else:
-        model.reset_length(args.eval_tgt_len,
-            args.ext_len, args.mem_len+args.tgt_len-args.eval_tgt_len)
-
-    # Evaluation
-    total_len, total_loss = 0, 0.
-    with torch.no_grad():
-        mems = tuple()
-        for i, (data, target, seq_len) in enumerate(eval_iter):
-            if args.max_eval_steps > 0 and i >= args.max_eval_steps:
-                break
-            ret = model(data, target, *mems)
-            loss, mems = ret
-            loss = loss.mean()
-            total_loss += seq_len * loss.float().item()
-            total_len += seq_len
-
-    # Switch back to the training mode
-    model.reset_length(args.tgt_len, args.ext_len, args.mem_len)
-    model.train()
-
-    return total_loss / total_len
-
-
-def train():
-    # Turn on training mode which enables dropout.
-    global train_step, train_loss, best_val_loss, eval_start_time, log_start_time
-    model.train()
-    if args.batch_chunk > 1:
-        mems = [tuple() for _ in range(args.batch_chunk)]
-    else:
-        mems = tuple()
-    train_iter = tr_iter.get_varlen_iter() if args.varlen else tr_iter
-    for batch, (data, target, seq_len) in enumerate(train_iter):
-        model.zero_grad()
-        if args.batch_chunk > 1:
-            data_chunks = torch.chunk(data, args.batch_chunk, 1)
-            target_chunks = torch.chunk(target, args.batch_chunk, 1)
-            for i in range(args.batch_chunk):
-                data_i = data_chunks[i].contiguous()
-                target_i = target_chunks[i].contiguous()
-                ret = para_model(data_i, target_i, *mems[i])
-                loss, mems[i] = ret[0], ret[1:]
-                loss = loss.float().mean().type_as(loss) / args.batch_chunk
-                if args.fp16:
-                    optimizer.backward(loss)
-                else:
-                    loss.backward()
-                train_loss += loss.float().item()
-        else:
-            ret = para_model(data, target, *mems)
-            loss, mems = ret[0], ret[1:]
-            loss = loss.float().mean().type_as(loss)
-            if args.fp16:
-                optimizer.backward(loss)
-            else:
-                loss.backward()
-            train_loss += loss.float().item()
-
-        if args.fp16:
-            optimizer.clip_master_grads(args.clip)
-        else:
-            torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
-
-        optimizer.step()
-        if args.sample_softmax > 0:
-            optimizer_sparse.step()
-
-        # step-wise learning rate annealing
-        train_step += 1
-        if args.scheduler in ['cosine', 'constant', 'dev_perf']:
-            # linear warmup stage
-            if train_step < args.warmup_step:
-                curr_lr = args.lr * train_step / args.warmup_step
-                optimizer.param_groups[0]['lr'] = curr_lr
-                if args.sample_softmax > 0:
-                    optimizer_sparse.param_groups[0]['lr'] = curr_lr * 2
-            else:
-                if args.scheduler == 'cosine':
-                    scheduler.step(train_step)
-                    if args.sample_softmax > 0:
-                        scheduler_sparse.step(train_step)
-        elif args.scheduler == 'inv_sqrt':
-            scheduler.step(train_step)
-
-        if train_step % args.log_interval == 0:
-            cur_loss = train_loss / args.log_interval
-            elapsed = time.time() - log_start_time
-            log_str = '| epoch {:3d} step {:>8d} | {:>6d} batches | lr {:.3g} ' \
-                      '| ms/batch {:5.2f} | loss {:5.2f}'.format(
-                epoch, train_step, batch+1, optimizer.param_groups[0]['lr'],
-                elapsed * 1000 / args.log_interval, cur_loss)
-            if args.dataset in ['enwik8', 'text8']:
-                log_str += ' | bpc {:9.5f}'.format(cur_loss / math.log(2))
-            else:
-                log_str += ' | ppl {:9.3f}'.format(math.exp(cur_loss))
-            logger.info(log_str)
-            train_loss = 0
-            log_start_time = time.time()
-
-        if train_step % args.eval_interval == 0:
-            val_loss = evaluate(va_iter)
-            logger.info('-' * 100)
-            log_str = '| Eval {:3d} at step {:>8d} | time: {:5.2f}s ' \
-                      '| valid loss {:5.2f}'.format(
-                train_step // args.eval_interval, train_step,
-                (time.time() - eval_start_time), val_loss)
-            if args.dataset in ['enwik8', 'text8']:
-                log_str += ' | bpc {:9.5f}'.format(val_loss / math.log(2))
-            else:
-                log_str += ' | valid ppl {:9.3f}'.format(math.exp(val_loss))
-            logger.info(log_str)
-            logger.info('-' * 100)
-            # Save the model if the validation loss is the best we've seen so far.
-            if not best_val_loss or val_loss < best_val_loss:
-                if not args.debug:
-                    with open(os.path.join(args.work_dir, 'model.pt'), 'wb') as f:
-                        torch.save(model, f)
-                    with open(os.path.join(args.work_dir, 'optimizer.pt'), 'wb') as f:
-                        torch.save(optimizer.state_dict(), f)
-                best_val_loss = val_loss
-
-            # dev-performance based learning rate annealing
-            if args.scheduler == 'dev_perf':
-                scheduler.step(val_loss)
-                if args.sample_softmax > 0:
-                    scheduler_sparse.step(val_loss)
-
-            eval_start_time = time.time()
-
-        if train_step == args.max_step:
-            break
-
-# Loop over epochs.
-train_step = 0
-train_loss = 0
-best_val_loss = None
-
-log_start_time = time.time()
-eval_start_time = time.time()
-
-# At any point you can hit Ctrl + C to break out of training early.
-try:
-    for epoch in itertools.count(start=1):
-        train()
-        if train_step == args.max_step:
-            logger.info('-' * 100)
-            logger.info('End of training')
-            break
-except KeyboardInterrupt:
-    logger.info('-' * 100)
-    logger.info('Exiting from training early')
-
-# Load the best saved model.
-with open(os.path.join(args.work_dir, 'model.pt'), 'rb') as f:
-    model = torch.load(f)
-para_model = model.to(device)
-
-# Run on test data.
-test_loss = evaluate(te_iter)
-logger.info('=' * 100)
-if args.dataset in ['enwik8', 'text8']:
-    logger.info('| End of training | test loss {:5.2f} | test bpc {:9.5f}'.format(
-        test_loss, test_loss / math.log(2)))
-else:
-    logger.info('| End of training | test loss {:5.2f} | test ppl {:9.3f}'.format(
-        test_loss, math.exp(test_loss)))
-logger.info('=' * 100)
diff --git a/examples/transfo_xl_eval.py b/examples/transfo_xl_eval.py
deleted file mode 100644
index 4f3606a97e..0000000000
--- a/examples/transfo_xl_eval.py
+++ /dev/null
@@ -1,139 +0,0 @@
-# coding=utf-8
-# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HugginFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" PyTorch Transformer XL model evaluation script.
-    Adapted from https://github.com/kimiyoung/transformer-xl.
-    In particular https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/eval.py
-
-    This script with default values evaluates a pretrained Transformer-XL on WikiText 103
-"""
-from __future__ import absolute_import, division, print_function, unicode_literals
-
-import argparse
-import logging
-import time
-import math
-
-import torch
-
-from pytorch_pretrained_bert import TransfoXLModel, TransfoXLCorpus
-
-logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
-                    datefmt = '%m/%d/%Y %H:%M:%S',
-                    level = logging.INFO)
-logger = logging.getLogger(__name__)
-
-
-parser = argparse.ArgumentParser(description='PyTorch Transformer Language Model')
-parser.add_argument('--model_name', type=str, default='transfo-xl-wt103',
-                    help='pretrained model name')
-parser.add_argument('--split', type=str, default='test',
-                    choices=['all', 'valid', 'test'],
-                    help='which split to evaluate')
-parser.add_argument('--batch_size', type=int, default=10,
-                    help='batch size')
-parser.add_argument('--tgt_len', type=int, default=128,
-                    help='number of tokens to predict')
-parser.add_argument('--ext_len', type=int, default=0,
-                    help='length of the extended context')
-parser.add_argument('--mem_len', type=int, default=1600,
-                    help='length of the retained previous heads')
-parser.add_argument('--clamp_len', type=int, default=1000,
-                    help='max positional embedding index')
-parser.add_argument('--cuda', action='store_true',
-                    help='use CUDA')
-parser.add_argument('--work_dir', type=str, required=True,
-                    help='path to the work_dir')
-parser.add_argument('--no_log', action='store_true',
-                    help='do not log the eval result')
-parser.add_argument('--same_length', action='store_true',
-                    help='set same length attention with masking')
-args = parser.parse_args()
-assert args.ext_len >= 0, 'extended context length must be non-negative'
-
-device = torch.device("cuda" if args.cuda else "cpu")
-
-# Load a pre-processed dataset
-# You can also build the corpus yourself using TransfoXLCorpus methods
-# The pre-processing involve computing word frequencies to prepare the Adaptive input and SoftMax
-# and tokenizing the dataset
-# The pre-processed corpus is a convertion (using the conversion script )
-corpus = TransfoXLCorpus.from_pretrained(args.model_name)
-ntokens = len(corpus.vocab)
-
-va_iter = corpus.get_iterator('valid', args.batch_size, args.tgt_len,
-    device=device, ext_len=args.ext_len)
-te_iter = corpus.get_iterator('test', args.batch_size, args.tgt_len,
-    device=device, ext_len=args.ext_len)
-
-# Load a pre-trained model
-model = TransfoXLModel.from_pretrained(args.model_name)
-model = model.to(device)
-
-logger.info('Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}'.format(
-       args.batch_size, args.tgt_len, args.ext_len, args.mem_len, args.clamp_len))
-
-model.reset_length(args.tgt_len, args.ext_len, args.mem_len)
-if args.clamp_len > 0:
-    model.clamp_len = args.clamp_len
-if args.same_length:
-    model.same_length = True
-
-###############################################################################
-# Evaluation code
-###############################################################################
-def evaluate(eval_iter):
-    # Turn on evaluation mode which disables dropout.
-    model.eval()
-    total_len, total_loss = 0, 0.
-    start_time = time.time()
-    with torch.no_grad():
-        mems = tuple()
-        for idx, (data, target, seq_len) in enumerate(eval_iter):
-            ret = model(data, target, *mems)
-            loss, mems = ret
-            loss = loss.mean()
-            total_loss += seq_len * loss.item()
-            total_len += seq_len
-        total_time = time.time() - start_time
-    logger.info('Time : {:.2f}s, {:.2f}ms/segment'.format(
-            total_time, 1000 * total_time / (idx+1)))
-    return total_loss / total_len
-
-# Run on test data.
-if args.split == 'all':
-    test_loss = evaluate(te_iter)
-    valid_loss = evaluate(va_iter)
-elif args.split == 'valid':
-    valid_loss = evaluate(va_iter)
-    test_loss = None
-elif args.split == 'test':
-    test_loss = evaluate(te_iter)
-    valid_loss = None
-
-def format_log(loss, split):
-    log_str = '| {0} loss {1:5.2f} | {0} ppl {2:9.3f} '.format(
-        split, loss, math.exp(loss))
-    return log_str
-
-log_str = ''
-if valid_loss is not None:
-    log_str += format_log(valid_loss, 'valid')
-if test_loss is not None:
-    log_str += format_log(test_loss, 'test')
-
-logger.info('=' * 100)
-logger.info(log_str)
-logger.info('=' * 100)

From edcb56fd96958984fc4bce93e931d5bac61d41c4 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 8 Feb 2019 09:54:49 +0100
Subject: [PATCH 47/82] more explicit variable name

---
 pytorch_pretrained_bert/modeling.py        | 12 ++++++------
 pytorch_pretrained_bert/modeling_openai.py | 12 ++++++------
 pytorch_pretrained_bert/tokenization.py    | 14 +++++++-------
 3 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py
index 0d68c2691c..e630765782 100644
--- a/pytorch_pretrained_bert/modeling.py
+++ b/pytorch_pretrained_bert/modeling.py
@@ -512,14 +512,14 @@ class BertPreTrainedModel(nn.Module):
             module.bias.data.zero_()
 
     @classmethod
-    def from_pretrained(cls, pretrained_model_name, state_dict=None, cache_dir=None,
+    def from_pretrained(cls, pretrained_model_name_or_path, state_dict=None, cache_dir=None,
                         from_tf=False, *inputs, **kwargs):
         """
         Instantiate a BertPreTrainedModel from a pre-trained model file or a pytorch state dict.
         Download and cache the pre-trained model file if needed.
 
         Params:
-            pretrained_model_name: either:
+            pretrained_model_name_or_path: either:
                 - a str with the name of a pre-trained model to load selected in the list of:
                     . `bert-base-uncased`
                     . `bert-large-uncased`
@@ -540,10 +540,10 @@ class BertPreTrainedModel(nn.Module):
             *inputs, **kwargs: additional input for the specific Bert class
                 (ex: num_labels for BertForSequenceClassification)
         """
-        if pretrained_model_name in PRETRAINED_MODEL_ARCHIVE_MAP:
-            archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name]
+        if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:
+            archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path]
         else:
-            archive_file = pretrained_model_name
+            archive_file = pretrained_model_name_or_path
         # redirect to the cache, if necessary
         try:
             resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
@@ -552,7 +552,7 @@ class BertPreTrainedModel(nn.Module):
                 "Model name '{}' was not found in model name list ({}). "
                 "We assumed '{}' was a path or url but couldn't find any file "
                 "associated to this path or url.".format(
-                    pretrained_model_name,
+                    pretrained_model_name_or_path,
                     ', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()),
                     archive_file))
             return None
diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py
index 7e4cd63bba..ac2fb03910 100644
--- a/pytorch_pretrained_bert/modeling_openai.py
+++ b/pytorch_pretrained_bert/modeling_openai.py
@@ -418,14 +418,14 @@ class OpenAIGPTPreTrainedModel(nn.Module):
 
     @classmethod
     def from_pretrained(
-        cls, pretrained_model_name, num_special_tokens=None, state_dict=None, cache_dir=None, from_tf=False, *inputs, **kwargs
+        cls, pretrained_model_name_or_path, num_special_tokens=None, state_dict=None, cache_dir=None, from_tf=False, *inputs, **kwargs
     ):
         """
         Instantiate a OpenAIGPTPreTrainedModel from a pre-trained model file or a pytorch state dict.
         Download and cache the pre-trained model file if needed.
 
         Params:
-            pretrained_model_name: either:
+            pretrained_model_name_or_path: either:
                 - a str with the name of a pre-trained model to load selected in the list of:
                     . `openai-gpt`
                 - a path or url to a pretrained model archive containing:
@@ -440,11 +440,11 @@ class OpenAIGPTPreTrainedModel(nn.Module):
             *inputs, **kwargs: additional input for the specific Bert class
                 (ex: num_labels for BertForSequenceClassification)
         """
-        if pretrained_model_name in PRETRAINED_MODEL_ARCHIVE_MAP:
-            archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name]
+        if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:
+            archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path]
             config_file = PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path]
         else:
-            archive_file = pretrained_model_name
+            archive_file = pretrained_model_name_or_path
             config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
         # redirect to the cache, if necessary
         try:
@@ -455,7 +455,7 @@ class OpenAIGPTPreTrainedModel(nn.Module):
                 "Model name '{}' was not found in model name list ({}). "
                 "We assumed '{}' was a path or url but couldn't find files {} and {} "
                 "at this path or url.".format(
-                    pretrained_model_name, ", ".join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), pretrained_model_name_or_path,
+                    pretrained_model_name_or_path, ", ".join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), pretrained_model_name_or_path,
                     archive_file, config_file
                 )
             )
diff --git a/pytorch_pretrained_bert/tokenization.py b/pytorch_pretrained_bert/tokenization.py
index 65c1b56d38..d64d3512f3 100644
--- a/pytorch_pretrained_bert/tokenization.py
+++ b/pytorch_pretrained_bert/tokenization.py
@@ -116,15 +116,15 @@ class BertTokenizer(object):
         return tokens
 
     @classmethod
-    def from_pretrained(cls, pretrained_model_name, cache_dir=None, *inputs, **kwargs):
+    def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
         """
         Instantiate a PreTrainedBertModel from a pre-trained model file.
         Download and cache the pre-trained model file if needed.
         """
-        if pretrained_model_name in PRETRAINED_VOCAB_ARCHIVE_MAP:
-            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name]
+        if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
+            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
         else:
-            vocab_file = pretrained_model_name
+            vocab_file = pretrained_model_name_or_path
         if os.path.isdir(vocab_file):
             vocab_file = os.path.join(vocab_file, VOCAB_NAME)
         # redirect to the cache, if necessary
@@ -135,7 +135,7 @@ class BertTokenizer(object):
                 "Model name '{}' was not found in model name list ({}). "
                 "We assumed '{}' was a path or url but couldn't find any file "
                 "associated to this path or url.".format(
-                    pretrained_model_name,
+                    pretrained_model_name_or_path,
                     ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
                     vocab_file))
             return None
@@ -144,10 +144,10 @@ class BertTokenizer(object):
         else:
             logger.info("loading vocabulary file {} from cache at {}".format(
                 vocab_file, resolved_vocab_file))
-        if pretrained_model_name in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
+        if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
             # if we're using a pretrained model, ensure the tokenizer wont index sequences longer
             # than the number of positional embeddings
-            max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name]
+            max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
             kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
         # Instantiate tokenizer.
         tokenizer = cls(resolved_vocab_file, *inputs, **kwargs)

From 777459b471f4b6b97b633a4ca6de21d5dce96202 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 8 Feb 2019 10:33:14 +0100
Subject: [PATCH 48/82] run openai example running

---
 examples/run_openai_gpt.py                    | 61 +++++++++++--------
 pytorch_pretrained_bert/modeling_openai.py    |  6 +-
 .../modeling_transfo_xl.py                    |  4 +-
 3 files changed, 42 insertions(+), 29 deletions(-)

diff --git a/examples/run_openai_gpt.py b/examples/run_openai_gpt.py
index 4f76407958..44400702f0 100644
--- a/examples/run_openai_gpt.py
+++ b/examples/run_openai_gpt.py
@@ -52,30 +52,29 @@ def load_rocstories_dataset(dataset_path):
             output.append((' '.join(line[1:5]), line[5], line[6], int(line[-1])-1))
     return output
 
-def pre_process_datasets(encoded_datasets, max_len, start_token, delimiter_token, clf_token):
-    """ Pre-process datasets containing lists of
-        tuples(story, 1st continuation, 2nd continuation, label)
-        
-        In Transformer inputs of shape (n_batch, n_alternative, length) comprising for each batch, continuation:
-        input_ids[batch, alternative, :] = [start_token] + story[:max_len] + [delimiter_token] + cont1[:max_len] + [clf_token]
+def pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, delimiter_token, clf_token):
+    """ Pre-process datasets containing lists of tuples(story, 1st continuation, 2nd continuation, label)
+
+        To Transformer inputs of shape (n_batch, n_alternative, length) comprising for each batch, continuation:
+        input_ids[batch, alternative, :] = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token]
     """
     tensor_datasets = []
     for dataset in encoded_datasets:
         n_batch = len(dataset)
-        input_ids = np.zeros((n_batch, 2, max_len), dtype=np.int32)
-        mc_token_mask = np.zeros((n_batch, 2, max_len), dtype=np.int32)
-        lm_labels = np.full((n_batch, 2, max_len), -1, dtype=np.float32)
-        mc_labels = np.zeros((n_batch,), dtype=np.float32)
+        input_ids = np.zeros((n_batch, 2, input_len), dtype=np.int64)
+        mc_token_mask = np.zeros((n_batch, 2, input_len), dtype=np.int64)
+        lm_labels = np.full((n_batch, 2, input_len), -1, dtype=np.int64)
+        mc_labels = np.zeros((n_batch,), dtype=np.int64)
         for i, (story, cont1, cont2, mc_label), in enumerate(dataset):
-            with_cont1 = [start_token] + story[:max_len] + [delimiter_token] + cont1[:max_len] + [clf_token]
-            with_cont2 = [start_token] + story[:max_len] + [delimiter_token] + cont2[:max_len] + [clf_token]
+            with_cont1 = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token]
+            with_cont2 = [start_token] + story[:cap_length] + [delimiter_token] + cont2[:cap_length] + [clf_token]
             input_ids[i, 0, :len(with_cont1)] = with_cont1
             input_ids[i, 1, :len(with_cont2)] = with_cont2
             mc_token_mask[i, 0, len(with_cont1) - 1] = 1
             lm_labels[i, 0, :len(with_cont1)-1] = with_cont1[1:]
             lm_labels[i, 1, :len(with_cont2)-1] = with_cont2[1:]
             mc_labels[i] = mc_label
-        all_inputs = tuple(input_ids, mc_token_mask, lm_labels, mc_labels)
+        all_inputs = (input_ids, mc_token_mask, lm_labels, mc_labels)
         tensor_datasets.append(tuple(torch.tensor(t) for t in all_inputs))
     return tensor_datasets
 
@@ -83,6 +82,10 @@ def main():
     parser = argparse.ArgumentParser()
     parser.add_argument('--model_name', type=str, default='openai-gpt',
                         help='pretrained model name')
+    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
+    parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.")
+    parser.add_argument("--output_dir", default=None, type=str, required=True,
+                        help="The output directory where the model predictions and checkpoints will be written.")
     parser.add_argument('--train_dataset', type=str, default='cloze_test_val__spring2016 - cloze_test_ALL_val.tsv')
     parser.add_argument('--eval_dataset', type=str, default='test_spring2016.tsv')
     parser.add_argument('--seed', type=int, default=42)
@@ -92,7 +95,6 @@ def main():
     parser.add_argument('--max_grad_norm', type=int, default=1)
     parser.add_argument('--learning_rate', type=float, default=6.25e-5)
     parser.add_argument('--warmup_proportion', type=float, default=0.002)
-    parser.add_argument('--max_grad_norm', type=float, default=1)
     parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
     parser.add_argument('--weight_decay', type=float, default=0.01)
     parser.add_argument('--lm_coef', type=float, default=0.5)
@@ -109,6 +111,12 @@ def main():
     n_gpu = torch.cuda.device_count()
     logger.info("device: {}, n_gpu {}".format(device, n_gpu))
 
+    if not args.do_train and not args.do_eval:
+        raise ValueError("At least one of `do_train` or `do_eval` must be True.")
+
+    if not os.path.exists(args.output_dir):
+        os.makedirs(args.output_dir)
+
     # Load tokenizer and model
     # This loading functions also add new tokens and embeddings called `special tokens`
     # These new embeddings will be fine-tuned on the RocStories dataset
@@ -118,23 +126,28 @@ def main():
     model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.model_name, num_special_tokens=len(special_tokens))
 
     # Load and encode the datasets
+    def tokenize_and_encode(obj):
+        """ Tokenize and encode a nested object """
+        if isinstance(obj, str):
+            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
+        elif isinstance(obj, int):
+            return obj
+        return list(tokenize_and_encode(o) for o in obj)
+
     logger.info("Encoding dataset...")
     train_dataset = load_rocstories_dataset(args.train_dataset)
-    eval_datset = load_rocstories_dataset(args.eval_datset)
-    datasets = (train_dataset, eval_datset)
-    tokenized_datasets = tuple(list(list(tokenizer.tokenize(x) for x in instance)
-                                         for instance in dataset) for dataset in datasets)
-    encoded_datasets = tuple(list(list(tokenizer.convert_tokens_to_ids(x) for x in instance)
-                                       for instance in dataset) for dataset in tokenized_datasets)
+    eval_dataset = load_rocstories_dataset(args.eval_dataset)
+    datasets = (train_dataset, eval_dataset)
+    encoded_datasets = tokenize_and_encode(datasets)
 
     # Compute the mex input length for the Transformer
-    max_input_length = max(len(story) + max(len(cont1), len(cont2)) + 3  \
+    input_length = max(len(story) + max(len(cont1), len(cont2)) + 3  \
                            for dataset in encoded_datasets for story, cont1, cont2, _ in dataset)
-    max_input_length = min(max_input_length, model.config.n_positions)  # Max size of input for the pre-trained model
-    max_sub_part_length = max_input_length // 2 - 2
+    input_length = min(input_length, model.config.n_positions)  # Max size of input for the pre-trained model
+    max_sub_part_length = input_length // 2 - 2
 
     # Prepare inputs tensors and dataloaders
-    tensor_datasets = pre_process_datasets(encoded_datasets, max_sub_part_length, *special_tokens_ids)
+    tensor_datasets = pre_process_datasets(encoded_datasets, input_length, max_sub_part_length, *special_tokens_ids)
     train_tensor_dataset, eval_tensor_dataset = tensor_datasets[0], tensor_datasets[1]
 
     train_data = TensorDataset(*train_tensor_dataset)
diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py
index ac2fb03910..7100905a3a 100644
--- a/pytorch_pretrained_bert/modeling_openai.py
+++ b/pytorch_pretrained_bert/modeling_openai.py
@@ -38,9 +38,9 @@ from .modeling import BertLayerNorm as LayerNorm
 logger = logging.getLogger(__name__)
 
 PRETRAINED_MODEL_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-pytorch_model.bin"}
-PRETRAINED_CONFIG_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-openai_gpt_config.json"}
+PRETRAINED_CONFIG_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-config.json"}
 
-CONFIG_NAME = "openai_gpt_config.json"
+CONFIG_NAME = "config.json"
 WEIGHTS_NAME = "pytorch_model.bin"
 
 def load_tf_weights_in_openai_gpt(model, openai_checkpoint_folder_path):
@@ -444,7 +444,7 @@ class OpenAIGPTPreTrainedModel(nn.Module):
             archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path]
             config_file = PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path]
         else:
-            archive_file = pretrained_model_name_or_path
+            archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)
             config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
         # redirect to the cache, if necessary
         try:
diff --git a/pytorch_pretrained_bert/modeling_transfo_xl.py b/pytorch_pretrained_bert/modeling_transfo_xl.py
index 8f3ccb7283..4b1b0d157b 100644
--- a/pytorch_pretrained_bert/modeling_transfo_xl.py
+++ b/pytorch_pretrained_bert/modeling_transfo_xl.py
@@ -46,9 +46,9 @@ PRETRAINED_MODEL_ARCHIVE_MAP = {
     'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-pytorch_model.bin",
 }
 PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-transfo_xl_config.json",
+    'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-config.json",
 }
-CONFIG_NAME = 'transfo_xl_config.json'
+CONFIG_NAME = 'config.json'
 WEIGHTS_NAME = 'pytorch_model.bin'
 TF_WEIGHTS_NAME = 'model.ckpt'
 

From 5ee4f1723434ff0453be7c22e39c949503f7f3ad Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 8 Feb 2019 10:37:40 +0100
Subject: [PATCH 49/82] adding option to load on cpu

---
 pytorch_pretrained_bert/modeling.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py
index e630765782..05f61b4c76 100644
--- a/pytorch_pretrained_bert/modeling.py
+++ b/pytorch_pretrained_bert/modeling.py
@@ -580,7 +580,7 @@ class BertPreTrainedModel(nn.Module):
         model = cls(config, *inputs, **kwargs)
         if state_dict is None and not from_tf:
             weights_path = os.path.join(serialization_dir, WEIGHTS_NAME)
-            state_dict = torch.load(weights_path)
+            state_dict = torch.load(weights_path, map_location='cpu' if not torch.cuda.is_available() else None)
         if tempdir:
             # Clean up temp dir
             shutil.rmtree(tempdir)

From 5adc20723bbff4fea628f526b256a60ccd98dbf5 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 8 Feb 2019 11:03:59 +0100
Subject: [PATCH 50/82] add distant debugging

---
 examples/run_openai_gpt.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/examples/run_openai_gpt.py b/examples/run_openai_gpt.py
index 44400702f0..edee22af2b 100644
--- a/examples/run_openai_gpt.py
+++ b/examples/run_openai_gpt.py
@@ -99,9 +99,21 @@ def main():
     parser.add_argument('--weight_decay', type=float, default=0.01)
     parser.add_argument('--lm_coef', type=float, default=0.5)
     parser.add_argument('--n_valid', type=int, default=374)
+
+    parser.add_argument('--server_ip', type=str, default='')
+    parser.add_argument('--server_port', type=str, default='')
     args = parser.parse_args()
     print(args)
 
+    # Some distant debugging
+    # See https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
+    import ptvsd
+    print(sys.argv)
+    print("Waiting for debugger attach")
+    ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
+    ptvsd.wait_for_attach()
+
+
     random.seed(args.seed)
     np.random.seed(args.seed)
     torch.manual_seed(args.seed)

From eccb2f01634c571304ccafa9672d8ad9cb8b7946 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 8 Feb 2019 11:05:20 +0100
Subject: [PATCH 51/82] hot fix

---
 examples/run_openai_gpt.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/examples/run_openai_gpt.py b/examples/run_openai_gpt.py
index edee22af2b..9b6383fc08 100644
--- a/examples/run_openai_gpt.py
+++ b/examples/run_openai_gpt.py
@@ -108,7 +108,6 @@ def main():
     # Some distant debugging
     # See https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
     import ptvsd
-    print(sys.argv)
     print("Waiting for debugger attach")
     ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
     ptvsd.wait_for_attach()

From 5d7e84571250b95ba689aae46315e495ebdb918b Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 8 Feb 2019 11:08:43 +0100
Subject: [PATCH 52/82] fix model on cuda

---
 examples/run_openai_gpt.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/run_openai_gpt.py b/examples/run_openai_gpt.py
index 9b6383fc08..b3410cb425 100644
--- a/examples/run_openai_gpt.py
+++ b/examples/run_openai_gpt.py
@@ -135,6 +135,7 @@ def main():
     tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name, special_tokens=special_tokens)
     special_tokens_ids = list(tokenizer.convert_tokens_to_ids(token) for token in special_tokens)
     model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.model_name, num_special_tokens=len(special_tokens))
+    model.to(device)
 
     # Load and encode the datasets
     def tokenize_and_encode(obj):

From 4bbb9f2d680f7f84ef53e18b1d0f448bcf94546f Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 8 Feb 2019 11:14:29 +0100
Subject: [PATCH 53/82] log loss - helpers

---
 examples/run_openai_gpt.py | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/examples/run_openai_gpt.py b/examples/run_openai_gpt.py
index b3410cb425..1c944d8285 100644
--- a/examples/run_openai_gpt.py
+++ b/examples/run_openai_gpt.py
@@ -100,18 +100,17 @@ def main():
     parser.add_argument('--lm_coef', type=float, default=0.5)
     parser.add_argument('--n_valid', type=int, default=374)
 
-    parser.add_argument('--server_ip', type=str, default='')
-    parser.add_argument('--server_port', type=str, default='')
+    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
+    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
     args = parser.parse_args()
     print(args)
 
-    # Some distant debugging
-    # See https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
-    import ptvsd
-    print("Waiting for debugger attach")
-    ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
-    ptvsd.wait_for_attach()
-
+    if args.server_ip and args.server_port:
+        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
+        import ptvsd
+        print("Waiting for debugger attach")
+        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
+        ptvsd.wait_for_attach()
 
     random.seed(args.seed)
     np.random.seed(args.seed)
@@ -192,7 +191,8 @@ def main():
         for _ in trange(int(args.num_train_epochs), desc="Epoch"):
             tr_loss = 0
             nb_tr_examples, nb_tr_steps = 0, 0
-            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
+            tqdm_bar = tqdm(train_dataloader, desc="Training")
+            for step, batch in enumerate(tqdm_bar):
                 batch = tuple(t.to(device) for t in batch)
                 input_ids, mc_token_mask, lm_labels, mc_labels = batch
                 losses = model(input_ids, mc_token_mask, lm_labels, mc_labels)
@@ -202,6 +202,7 @@ def main():
                 tr_loss += loss.item()
                 nb_tr_examples += input_ids.size(0)
                 nb_tr_steps += 1
+                tqdm_bar.desc = "Training loss: {:e.2}".format(tr_loss/nb_tr_steps)
 
     # Save a trained model
     model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self

From 7b4b0cf966df3dc790dc8914afe4862195c87af7 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 8 Feb 2019 11:16:29 +0100
Subject: [PATCH 54/82] logging

---
 examples/run_openai_gpt.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/run_openai_gpt.py b/examples/run_openai_gpt.py
index 1c944d8285..638d4bf123 100644
--- a/examples/run_openai_gpt.py
+++ b/examples/run_openai_gpt.py
@@ -202,7 +202,7 @@ def main():
                 tr_loss += loss.item()
                 nb_tr_examples += input_ids.size(0)
                 nb_tr_steps += 1
-                tqdm_bar.desc = "Training loss: {:e.2}".format(tr_loss/nb_tr_steps)
+                tqdm_bar.desc = "Training loss: {:.2e}".format(tr_loss/nb_tr_steps)
 
     # Save a trained model
     model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self

From 80607874c1f82e137ceb2cff3397c6a91d6aa963 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 8 Feb 2019 21:49:05 +0100
Subject: [PATCH 55/82] fix layer norm epsilon in OpenAI GPT

---
 pytorch_pretrained_bert/modeling_openai.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py
index 7100905a3a..e6f3fc4efe 100644
--- a/pytorch_pretrained_bert/modeling_openai.py
+++ b/pytorch_pretrained_bert/modeling_openai.py
@@ -141,6 +141,7 @@ class OpenAIGPTConfig(object):
         resid_pdrop=0.1,
         embd_pdrop=0.1,
         attn_pdrop=0.1,
+        layer_norm_epsilon=1e-5,
         initializer_range=0.02,
     ):
         """Constructs OpenAIGPTConfig.
@@ -161,6 +162,7 @@ class OpenAIGPTConfig(object):
             attn_pdrop: The dropout ratio for the attention
                 probabilities.
             embd_pdrop: The dropout ratio for the embeddings.
+            layer_norm_epsilon: epsilon to use in the layer norm layers
             initializer_range: The sttdev of the truncated_normal_initializer for
                 initializing all weight matrices.
         """
@@ -182,6 +184,7 @@ class OpenAIGPTConfig(object):
             self.resid_pdrop = resid_pdrop
             self.embd_pdrop = embd_pdrop
             self.attn_pdrop = attn_pdrop
+            self.layer_norm_epsilon = layer_norm_epsilon
             self.initializer_range = initializer_range
         else:
             raise ValueError(
@@ -318,9 +321,9 @@ class Block(nn.Module):
         super(Block, self).__init__()
         nx = config.n_embd
         self.attn = Attention(nx, n_ctx, config, scale)
-        self.ln_1 = LayerNorm(nx)
+        self.ln_1 = LayerNorm(nx, eps=config.layer_norm_epsilon)
         self.mlp = MLP(4 * nx, config)
-        self.ln_2 = LayerNorm(nx)
+        self.ln_2 = LayerNorm(nx, eps=config.layer_norm_epsilon)
 
     def forward(self, x):
         a = self.attn(x)

From b80684b23f3c1fb3216cc4e9daed1c1917cce087 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 8 Feb 2019 22:31:32 +0100
Subject: [PATCH 56/82] fixing run openai gpt example

---
 examples/run_openai_gpt.py | 48 +++++++++++++++++++++-----------------
 1 file changed, 26 insertions(+), 22 deletions(-)

diff --git a/examples/run_openai_gpt.py b/examples/run_openai_gpt.py
index 638d4bf123..6e0a0abf0c 100644
--- a/examples/run_openai_gpt.py
+++ b/examples/run_openai_gpt.py
@@ -31,7 +31,9 @@ import torch
 from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                               TensorDataset)
 
-from pytorch_pretrained_bert import OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer, OpenAIAdam
+from pytorch_pretrained_bert import OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer, OpenAIAdam, cached_path
+
+ROCSTORIES_URL = "https://s3.amazonaws.com/datasets.huggingface.co/ROCStories.tar.gz"
 
 logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                     datefmt = '%m/%d/%Y %H:%M:%S',
@@ -63,7 +65,7 @@ def pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, d
         n_batch = len(dataset)
         input_ids = np.zeros((n_batch, 2, input_len), dtype=np.int64)
         mc_token_mask = np.zeros((n_batch, 2, input_len), dtype=np.int64)
-        lm_labels = np.full((n_batch, 2, input_len), -1, dtype=np.int64)
+        lm_labels = np.full((n_batch, 2, input_len), fill_value=-1, dtype=np.int64)
         mc_labels = np.zeros((n_batch,), dtype=np.int64)
         for i, (story, cont1, cont2, mc_label), in enumerate(dataset):
             with_cont1 = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token]
@@ -71,6 +73,7 @@ def pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, d
             input_ids[i, 0, :len(with_cont1)] = with_cont1
             input_ids[i, 1, :len(with_cont2)] = with_cont2
             mc_token_mask[i, 0, len(with_cont1) - 1] = 1
+            mc_token_mask[i, 1, len(with_cont2) - 1] = 1
             lm_labels[i, 0, :len(with_cont1)-1] = with_cont1[1:]
             lm_labels[i, 1, :len(with_cont2)-1] = with_cont2[1:]
             mc_labels[i] = mc_label
@@ -86,8 +89,8 @@ def main():
     parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.")
     parser.add_argument("--output_dir", default=None, type=str, required=True,
                         help="The output directory where the model predictions and checkpoints will be written.")
-    parser.add_argument('--train_dataset', type=str, default='cloze_test_val__spring2016 - cloze_test_ALL_val.tsv')
-    parser.add_argument('--eval_dataset', type=str, default='test_spring2016.tsv')
+    parser.add_argument('--train_dataset', type=str, default='')
+    parser.add_argument('--eval_dataset', type=str, default='')
     parser.add_argument('--seed', type=int, default=42)
     parser.add_argument('--num_train_epochs', type=int, default=3)
     parser.add_argument('--train_batch_size', type=int, default=8)
@@ -97,7 +100,7 @@ def main():
     parser.add_argument('--warmup_proportion', type=float, default=0.002)
     parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
     parser.add_argument('--weight_decay', type=float, default=0.01)
-    parser.add_argument('--lm_coef', type=float, default=0.5)
+    parser.add_argument('--lm_coef', type=float, default=0.9)
     parser.add_argument('--n_valid', type=int, default=374)
 
     parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
@@ -137,6 +140,8 @@ def main():
     model.to(device)
 
     # Load and encode the datasets
+    if not args.train_dataset and not args.eval_dataset:
+        roc_stories = cached_path(ROCSTORIES_URL)
     def tokenize_and_encode(obj):
         """ Tokenize and encode a nested object """
         if isinstance(obj, str):
@@ -144,7 +149,6 @@ def main():
         elif isinstance(obj, int):
             return obj
         return list(tokenize_and_encode(o) for o in obj)
-
     logger.info("Encoding dataset...")
     train_dataset = load_rocstories_dataset(args.train_dataset)
     eval_dataset = load_rocstories_dataset(args.eval_dataset)
@@ -152,13 +156,13 @@ def main():
     encoded_datasets = tokenize_and_encode(datasets)
 
     # Compute the mex input length for the Transformer
-    input_length = max(len(story) + max(len(cont1), len(cont2)) + 3  \
+    max_length = model.config.n_positions // 2 - 2
+    input_length = max(len(story[:max_length]) + max(len(cont1[:max_length]), len(cont2[:max_length])) + 3  \
                            for dataset in encoded_datasets for story, cont1, cont2, _ in dataset)
     input_length = min(input_length, model.config.n_positions)  # Max size of input for the pre-trained model
-    max_sub_part_length = input_length // 2 - 2
 
     # Prepare inputs tensors and dataloaders
-    tensor_datasets = pre_process_datasets(encoded_datasets, input_length, max_sub_part_length, *special_tokens_ids)
+    tensor_datasets = pre_process_datasets(encoded_datasets, input_length, max_length, *special_tokens_ids)
     train_tensor_dataset, eval_tensor_dataset = tensor_datasets[0], tensor_datasets[1]
 
     train_data = TensorDataset(*train_tensor_dataset)
@@ -176,7 +180,7 @@ def main():
         {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
         {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
         ]
-    num_train_optimization_steps = len(train_data) // args.train_batch_size
+    num_train_optimization_steps = len(train_data) * args.num_train_epochs // args.train_batch_size
     optimizer = OpenAIAdam(optimizer_grouped_parameters,
                            lr=args.learning_rate,
                            warmup=args.warmup_proportion,
@@ -185,12 +189,11 @@ def main():
                            t_total=num_train_optimization_steps)
 
     if args.do_train:
-        nb_tr_steps = 0
-        tr_loss = 0
+        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
         model.train()
         for _ in trange(int(args.num_train_epochs), desc="Epoch"):
             tr_loss = 0
-            nb_tr_examples, nb_tr_steps = 0, 0
+            nb_tr_steps = 0
             tqdm_bar = tqdm(train_dataloader, desc="Training")
             for step, batch in enumerate(tqdm_bar):
                 batch = tuple(t.to(device) for t in batch)
@@ -200,21 +203,22 @@ def main():
                 loss.backward()
                 optimizer.step()
                 tr_loss += loss.item()
-                nb_tr_examples += input_ids.size(0)
+                exp_average_loss = loss.item() if exp_average_loss is None else 0.7*exp_average_loss+0.3*loss.item()
                 nb_tr_steps += 1
-                tqdm_bar.desc = "Training loss: {:.2e}".format(tr_loss/nb_tr_steps)
+                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(exp_average_loss, optimizer.get_lr()[0])
 
     # Save a trained model
-    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
-    output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
     if args.do_train:
+        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
+        output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
+        config = model.config
         torch.save(model_to_save.state_dict(), output_model_file)
 
-    # Load a trained model that you have fine-tuned
-    model_state_dict = torch.load(output_model_file)
-    model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.model_name, state_dict=model_state_dict,
-                                                      num_special_tokens=len(special_tokens))
-    model.to(device)
+        # Load a trained model that you have fine-tuned
+        model_state_dict = torch.load(output_model_file)
+        model = OpenAIGPTDoubleHeadsModel(config)
+        model.load_state_dict(model_state_dict)
+        model.to(device)
 
     if args.do_eval:
         model.eval()

From 102c6b238c633aa2b07c37deb083802c6e6193bf Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 8 Feb 2019 22:31:46 +0100
Subject: [PATCH 57/82] adding file cache to __init__

---
 pytorch_pretrained_bert/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_pretrained_bert/__init__.py b/pytorch_pretrained_bert/__init__.py
index 761af86b6d..d15a926def 100644
--- a/pytorch_pretrained_bert/__init__.py
+++ b/pytorch_pretrained_bert/__init__.py
@@ -17,4 +17,4 @@ from .modeling_transfo_xl import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHe
 from .optimization import BertAdam
 from .optimization_openai import OpenAIAdam
 
-from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE
+from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE, cached_path

From dadd0c1b13da29a54f090ce90d0d7627bc9fa26b Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 8 Feb 2019 22:31:57 +0100
Subject: [PATCH 58/82] updating __main__

---
 pytorch_pretrained_bert/__main__.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/pytorch_pretrained_bert/__main__.py b/pytorch_pretrained_bert/__main__.py
index d3db22db60..a5a6557366 100644
--- a/pytorch_pretrained_bert/__main__.py
+++ b/pytorch_pretrained_bert/__main__.py
@@ -7,14 +7,14 @@ def main():
         "convert_transfo_xl_checkpoint"
     ]:
         print(
-        "Should be used as"
-        "`pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`, \n"
-        "`pytorch_pretrained_bert convert_openai_checkpoint OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]` or \n"
-        "`pytorch_pretrained_bert convert_transfo_xl_checkpoint TF_CHECKPOINT_OR_DATASET PYTORCH_DUMP_OUTPUT [TF_CONFIG]`")
+        "Should be used as one of: \n"
+        ">> `pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`, \n"
+        ">> `pytorch_pretrained_bert convert_openai_checkpoint OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]` or \n"
+        ">> `pytorch_pretrained_bert convert_transfo_xl_checkpoint TF_CHECKPOINT_OR_DATASET PYTORCH_DUMP_OUTPUT [TF_CONFIG]`")
     else:
         if sys.argv[1] == "convert_tf_checkpoint_to_pytorch":
             try:
-                import tensorflow as tf
+                from .convert_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch
             except ImportError:
                 print("pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, "
                     "In that case, it requires TensorFlow to be installed. Please see "
@@ -42,7 +42,7 @@ def main():
                                                  PYTORCH_DUMP_OUTPUT)
         else:
             try:
-                import tensorflow as tf
+                from .convert_transfo_xl_checkpoint_to_pytorch import convert_transfo_xl_checkpoint_to_pytorch
             except ImportError:
                 print("pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, "
                     "In that case, it requires TensorFlow to be installed. Please see "

From 1756b5e9568c3f5d754d0fe6d7a39134e3753ab5 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 8 Feb 2019 22:32:17 +0100
Subject: [PATCH 59/82] fix loading from Transfo-XL LM model

---
 .../modeling_transfo_xl.py                    | 50 ++++++++++---------
 1 file changed, 27 insertions(+), 23 deletions(-)

diff --git a/pytorch_pretrained_bert/modeling_transfo_xl.py b/pytorch_pretrained_bert/modeling_transfo_xl.py
index 4b1b0d157b..1077102af8 100644
--- a/pytorch_pretrained_bert/modeling_transfo_xl.py
+++ b/pytorch_pretrained_bert/modeling_transfo_xl.py
@@ -57,6 +57,33 @@ def build_tf_to_pytorch_map(model, config):
         This time I use a map to keep the PyTorch model as identical to the original PyTorch model as possible.
     """
     tf_to_pt_map = {}
+
+    if hasattr(model, 'transformer'):
+        # We are loading in a TransfoXLLMHeadModel => we will load also the Adaptive Softmax
+        tf_to_pt_map.update({
+            "transformer/adaptive_softmax/cutoff_0/cluster_W": model.crit.cluster_weight,
+            "transformer/adaptive_softmax/cutoff_0/cluster_b": model.crit.cluster_bias})
+        for i, (out_l, proj_l, tie_proj) in enumerate(zip(
+                                model.crit.out_layers,
+                                model.crit.out_projs,
+                                config.tie_projs)):
+            layer_str = "transformer/adaptive_softmax/cutoff_%d/" % i
+            if config.tie_weight:
+                tf_to_pt_map.update({
+                    layer_str + 'b': out_l.bias})
+            else:
+                raise NotImplementedError
+                # I don't think this is implemented in the TF code
+                tf_to_pt_map.update({
+                    layer_str + 'lookup_table': out_l.weight,
+                    layer_str + 'b': out_l.bias})
+            if not tie_proj:
+                tf_to_pt_map.update({
+                    layer_str + 'proj': proj_l
+                    })
+        # Now load the rest of the transformer
+        model = model.transformer
+
     # Embeddings
     for i, (embed_l, proj_l) in enumerate(zip(model.word_emb.emb_layers, model.word_emb.emb_projs)):
         layer_str = "transformer/adaptive_embed/cutoff_%d/" % i
@@ -82,29 +109,6 @@ def build_tf_to_pytorch_map(model, config):
             layer_str + "ff/layer_2/bias": b.pos_ff.CoreNet[3].bias,
         })
 
-    # Adaptive Softmax
-    tf_to_pt_map.update({
-        "transformer/adaptive_softmax/cutoff_0/cluster_W": model.crit.cluster_weight,
-        "transformer/adaptive_softmax/cutoff_0/cluster_b": model.crit.cluster_bias})
-    for i, (out_l, proj_l, tie_proj) in enumerate(zip(
-                            model.crit.out_layers,
-                            model.crit.out_projs,
-                            config.tie_projs)):
-        layer_str = "transformer/adaptive_softmax/cutoff_%d/" % i
-        if config.tie_weight:
-            tf_to_pt_map.update({
-                layer_str + 'b': out_l.bias})
-        else:
-            raise NotImplementedError
-            # I don't think this is implemented in the TF code
-            tf_to_pt_map.update({
-                layer_str + 'lookup_table': out_l.weight,
-                layer_str + 'b': out_l.bias})
-        if not tie_proj:
-            tf_to_pt_map.update({
-                layer_str + 'proj': proj_l
-                })
-
     # Relative positioning biases
     if config.untie_r:
         r_r_list = []

From 0c1a6f9b1d95e639724fb1be6300c7bbf9e242ac Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 8 Feb 2019 22:32:25 +0100
Subject: [PATCH 60/82] update readme

---
 README.md | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 4f387adc5a..3601ba5210 100644
--- a/README.md
+++ b/README.md
@@ -943,7 +943,20 @@ You can download Google's pre-trained models for the conversion [here](https://g
 
 ### OpenAI GPT
 
-Here is an example of the conversion process for a pre-trained OpenAI GPT model, assuming that your NumPy checkpoit save as the same format than OpenAI pretrained model (see [here](https://github.com/openai/finetune-transformer-lm))
+Here is an example of the conversion process for a pre-trained OpenAI GPT model, assuming that your NumPy checkpoint is saved in the same format as the OpenAI pretrained model (see [here](https://github.com/openai/finetune-transformer-lm))
+
+```shell
+export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights
+
+pytorch_pretrained_bert convert_openai_checkpoint \
+  $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \
+  $PYTORCH_DUMP_OUTPUT \
+  [OPENAI_GPT_CONFIG]
+```
+
+### Transformer-XL
+
+Here is an example of the conversion process for a pre-trained Transformer-XL model (see [here](https://github.com/kimiyoung/transformer-xl/tree/master/tf#obtain-and-evaluate-pretrained-sota-models))
 
 ```shell
 export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12

From cfcb95417ced7dd26031d5e0dc283bdbb667d087 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 8 Feb 2019 23:08:53 +0100
Subject: [PATCH 61/82] fix hasattr

---
 pytorch_pretrained_bert/modeling_transfo_xl.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_pretrained_bert/modeling_transfo_xl.py b/pytorch_pretrained_bert/modeling_transfo_xl.py
index 1077102af8..f3498944f5 100644
--- a/pytorch_pretrained_bert/modeling_transfo_xl.py
+++ b/pytorch_pretrained_bert/modeling_transfo_xl.py
@@ -970,7 +970,7 @@ class TransfoXLPreTrainedModel(nn.Module):
             raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
                                model.__class__.__name__, "\n\t".join(error_msgs)))
         # Make sure we are still sharing the input and output embeddings
-        if model.hasattr('tie_weights'):
+        if hasattr(model, 'tie_weights'):
             model.tie_weights()
         return model
 

From 43b9af0cac55c2bcde11ddea8658ada6f1bb1872 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Sat, 9 Feb 2019 16:12:19 +0100
Subject: [PATCH 62/82] mems initialized to None in run_transfo

---
 examples/run_transfo_xl.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/run_transfo_xl.py b/examples/run_transfo_xl.py
index 1218a1f547..3a0a70e68e 100644
--- a/examples/run_transfo_xl.py
+++ b/examples/run_transfo_xl.py
@@ -100,7 +100,7 @@ def main():
         total_len, total_loss = 0, 0.
         start_time = time.time()
         with torch.no_grad():
-            mems = tuple()
+            mems = None
             for idx, (data, target, seq_len) in enumerate(eval_iter):
                 ret = model(data, target, *mems)
                 loss, mems = ret

From f4a07a392c769dbc66dada3d3b6ac9e1761f89da Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Sat, 9 Feb 2019 16:14:31 +0100
Subject: [PATCH 63/82] mems not splitted

---
 examples/run_transfo_xl.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/run_transfo_xl.py b/examples/run_transfo_xl.py
index 3a0a70e68e..b8000a2080 100644
--- a/examples/run_transfo_xl.py
+++ b/examples/run_transfo_xl.py
@@ -102,7 +102,7 @@ def main():
         with torch.no_grad():
             mems = None
             for idx, (data, target, seq_len) in enumerate(eval_iter):
-                ret = model(data, target, *mems)
+                ret = model(data, target, mems)
                 loss, mems = ret
                 loss = loss.mean()
                 total_loss += seq_len * loss.item()

From 1320e4ec0c6fe36bf0f5f9869a85fca37514941d Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Sat, 9 Feb 2019 16:58:53 +0100
Subject: [PATCH 64/82] mc_token_mask => mc_token_ids

---
 examples/run_openai_gpt.py                 | 18 +++++-----
 pytorch_pretrained_bert/modeling_openai.py | 41 ++++++++++------------
 tests/modeling_openai_test.py              | 16 ++++-----
 3 files changed, 35 insertions(+), 40 deletions(-)

diff --git a/examples/run_openai_gpt.py b/examples/run_openai_gpt.py
index 6e0a0abf0c..7a434ceaca 100644
--- a/examples/run_openai_gpt.py
+++ b/examples/run_openai_gpt.py
@@ -64,7 +64,7 @@ def pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, d
     for dataset in encoded_datasets:
         n_batch = len(dataset)
         input_ids = np.zeros((n_batch, 2, input_len), dtype=np.int64)
-        mc_token_mask = np.zeros((n_batch, 2, input_len), dtype=np.int64)
+        mc_token_ids = np.zeros((n_batch, 2), dtype=np.int64)
         lm_labels = np.full((n_batch, 2, input_len), fill_value=-1, dtype=np.int64)
         mc_labels = np.zeros((n_batch,), dtype=np.int64)
         for i, (story, cont1, cont2, mc_label), in enumerate(dataset):
@@ -72,12 +72,12 @@ def pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, d
             with_cont2 = [start_token] + story[:cap_length] + [delimiter_token] + cont2[:cap_length] + [clf_token]
             input_ids[i, 0, :len(with_cont1)] = with_cont1
             input_ids[i, 1, :len(with_cont2)] = with_cont2
-            mc_token_mask[i, 0, len(with_cont1) - 1] = 1
-            mc_token_mask[i, 1, len(with_cont2) - 1] = 1
+            mc_token_ids[i, 0] = len(with_cont1) - 1
+            mc_token_ids[i, 1] = len(with_cont2) - 1
             lm_labels[i, 0, :len(with_cont1)-1] = with_cont1[1:]
             lm_labels[i, 1, :len(with_cont2)-1] = with_cont2[1:]
             mc_labels[i] = mc_label
-        all_inputs = (input_ids, mc_token_mask, lm_labels, mc_labels)
+        all_inputs = (input_ids, mc_token_ids, lm_labels, mc_labels)
         tensor_datasets.append(tuple(torch.tensor(t) for t in all_inputs))
     return tensor_datasets
 
@@ -197,8 +197,8 @@ def main():
             tqdm_bar = tqdm(train_dataloader, desc="Training")
             for step, batch in enumerate(tqdm_bar):
                 batch = tuple(t.to(device) for t in batch)
-                input_ids, mc_token_mask, lm_labels, mc_labels = batch
-                losses = model(input_ids, mc_token_mask, lm_labels, mc_labels)
+                input_ids, mc_token_ids, lm_labels, mc_labels = batch
+                losses = model(input_ids, mc_token_ids, lm_labels, mc_labels)
                 loss = args.lm_coef * losses[0] + losses[1]
                 loss.backward()
                 optimizer.step()
@@ -226,10 +226,10 @@ def main():
         nb_eval_steps, nb_eval_examples = 0, 0
         for batch in tqdm(eval_dataloader, desc="Evaluating"):
             batch = tuple(t.to(device) for t in batch)
-            input_ids, mc_token_mask, lm_labels, mc_labels = batch
+            input_ids, mc_token_ids, lm_labels, mc_labels = batch
             with torch.no_grad():
-                _, mc_loss = model(input_ids, mc_token_mask, lm_labels, mc_labels)
-                _, mc_logits = model(input_ids, mc_token_mask)
+                _, mc_loss = model(input_ids, mc_token_ids, lm_labels, mc_labels)
+                _, mc_logits = model(input_ids, mc_token_ids)
 
             mc_logits = mc_logits.detach().cpu().numpy()
             mc_labels = mc_labels.to('cpu').numpy()
diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py
index e6f3fc4efe..60bf546c8c 100644
--- a/pytorch_pretrained_bert/modeling_openai.py
+++ b/pytorch_pretrained_bert/modeling_openai.py
@@ -366,23 +366,16 @@ class OpenAIGPTMultipleChoiceHead(nn.Module):
         nn.init.normal_(self.linear.weight, std=0.02)
         nn.init.normal_(self.linear.bias, 0)
 
-    def forward(self, hidden_states, mc_token_mask):
+    def forward(self, hidden_states, mc_token_ids):
         # Classification logits
-        # hidden_states = hidden_states.view(-1, self.n_embd)
-        # mc_token_mask = mc_token_mask.view(-1, 1).expand_as(hidden_states)
-        mc_token_mask = mc_token_mask.float()
-        multiple_choice_h = hidden_states * mc_token_mask.unsqueeze(-1)
-        multiple_choice_h = multiple_choice_h.sum(dim=-2)
-        # flat = x[..., 0].contiguous().view(-1)
-        # multiple_choice_h = multiple_choice_h[flat == self.multiple_choice_token, :]
-        # multiple_choice_h = multiple_choice_h.view(-1, x.size(1), self.n_embd, 1)
-        # # This double transposition is there to replicate the behavior
-        # # of the noise_shape argument in the tensorflow
-        # # implementation.  For more details, see
-        # # https://github.com/huggingface/pytorch-openai-transformer-lm/issues/11
-        # multiple_choice_h = self.dropout(multiple_choice_h.transpose(1, 2)).transpose(1, 2)
-        # multiple_choice_h = multiple_choice_h.contiguous().view(-1, self.n_embd)
+        # hidden_states (bsz, num_choices, seq_length, hidden_size)
+        # mc_token_ids (bsz, num_choices)
+        mc_token_ids = mc_token_ids.unsqueeze(-1).unsqueeze(-1).expand(-1, -1, -1, hidden_states.size(-1))
+        # (bsz, num_choices, 1, hidden_size)
+        multiple_choice_h = hidden_states.gather(2, mc_token_ids).squeeze(2)
+        # (bsz, num_choices, hidden_size)
         multiple_choice_logits = self.linear(multiple_choice_h).squeeze(-1)
+        # (bsz, num_choices)
         return multiple_choice_logits
 
 
@@ -727,7 +720,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
 
 
 class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
-    """OpenAI GPT model with a Language Modeling and a Multiple Choice heads ("Improving Language Understanding by Generative Pre-Training").
+    """OpenAI GPT model with a Language Modeling and a Multiple Choice head ("Improving Language Understanding by Generative Pre-Training").
 
     OpenAI GPT use a single embedding matrix to store the word and special embeddings.
     Special tokens embeddings are additional tokens that are not pre-trained: [SEP], [CLS]...
@@ -750,8 +743,10 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
         config: a OpenAIGPTConfig class instance with the configuration to build a new model
 
     Inputs:
-        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
-            were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, total_tokens_embeddings[
+        `input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length] with the BPE token
+            indices selected in the range [0, total_tokens_embeddings[
+        `mc_token_ids`: a torch.LongTensor of shape [batch_size, num_choices] with the index of the token from
+            which we should take the hidden state to feed the multiple choice classifier (usually last token of the sequence)
         `position_ids`: an optional torch.LongTensor with the same shape as input_ids
             with the position indices (selected in the range [0, config.n_positions - 1[.
         `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
@@ -775,13 +770,13 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
     Example usage:
     ```python
     # Already been converted into BPE token ids
-    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-    mc_token_mask = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+    input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]]])  # (bsz, number of choice, seq length)
+    mc_token_ids = torch.LongTensor([[2, 1]])  # (bsz, number of choice)
 
     config = modeling_openai.OpenAIGPTConfig()
 
     model = modeling_openai.OpenAIGPTLMHeadModel(config)
-    lm_logits, multiple_choice_logits = model(input_ids, mc_token_mask)
+    lm_logits, multiple_choice_logits = model(input_ids, mc_token_ids)
     ```
     """
 
@@ -799,10 +794,10 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
         self.transformer.set_num_special_tokens(num_special_tokens)
         self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight)
 
-    def forward(self, input_ids, mc_token_mask, lm_labels=None, mc_labels=None, token_type_ids=None, position_ids=None):
+    def forward(self, input_ids, mc_token_ids, lm_labels=None, mc_labels=None, token_type_ids=None, position_ids=None):
         hidden_states = self.transformer(input_ids, position_ids, token_type_ids)
         lm_logits = self.lm_head(hidden_states)
-        mc_logits = self.multiple_choice_head(hidden_states, mc_token_mask)
+        mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids)
         losses = []
         if lm_labels is not None:
             loss_fct = CrossEntropyLoss(ignore_index=-1)
diff --git a/tests/modeling_openai_test.py b/tests/modeling_openai_test.py
index 81892a981a..6baaaf677a 100644
--- a/tests/modeling_openai_test.py
+++ b/tests/modeling_openai_test.py
@@ -89,11 +89,11 @@ class OpenAIGPTModelTest(unittest.TestCase):
 
             mc_labels = None
             lm_labels = None
-            mc_token_mask = None
+            mc_token_ids = None
             if self.use_labels:
                 mc_labels = OpenAIGPTModelTest.ids_tensor([self.batch_size], self.type_sequence_label_size)
                 lm_labels = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.num_labels)
-                mc_token_mask = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], 2).float()
+                mc_token_ids = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices], self.seq_length).float()
 
             config = OpenAIGPTConfig(
                 vocab_size_or_config_json_file=self.vocab_size,
@@ -109,10 +109,10 @@ class OpenAIGPTModelTest(unittest.TestCase):
                 initializer_range=self.initializer_range)
 
             return (config, input_ids, token_type_ids, position_ids,
-                    mc_labels, lm_labels, mc_token_mask)
+                    mc_labels, lm_labels, mc_token_ids)
 
         def create_openai_model(self, config, input_ids, token_type_ids, position_ids,
-                                mc_labels, lm_labels, mc_token_mask):
+                                mc_labels, lm_labels, mc_token_ids):
             model = OpenAIGPTModel(config)
             model.eval()
             hidden_states = model(input_ids, position_ids, token_type_ids)
@@ -128,7 +128,7 @@ class OpenAIGPTModelTest(unittest.TestCase):
 
 
         def create_openai_lm_head(self, config, input_ids, token_type_ids, position_ids,
-                                       mc_labels, lm_labels, mc_token_mask):
+                                       mc_labels, lm_labels, mc_token_ids):
             model = OpenAIGPTLMHeadModel(config)
             model.eval()
             loss = model(input_ids, position_ids, token_type_ids, lm_labels)
@@ -151,13 +151,13 @@ class OpenAIGPTModelTest(unittest.TestCase):
                 [])
 
         def create_openai_double_heads(self, config, input_ids, token_type_ids, position_ids,
-                                       mc_labels, lm_labels, mc_token_mask):
+                                       mc_labels, lm_labels, mc_token_ids):
             model = OpenAIGPTDoubleHeadsModel(config)
             model.eval()
-            loss = model(input_ids, mc_token_mask,
+            loss = model(input_ids, mc_token_ids,
                          lm_labels=lm_labels, mc_labels=mc_labels,
                          token_type_ids=token_type_ids, position_ids=position_ids)
-            lm_logits, mc_logits = model(input_ids, mc_token_mask, position_ids=position_ids, token_type_ids=token_type_ids)
+            lm_logits, mc_logits = model(input_ids, mc_token_ids, position_ids=position_ids, token_type_ids=token_type_ids)
             outputs = {
                 "loss": loss,
                 "lm_logits": lm_logits,

From 6cd769957e5eadd59963bb02efd3b746e8d120af Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Sat, 9 Feb 2019 16:59:17 +0100
Subject: [PATCH 65/82] update transfo xl example

---
 examples/run_transfo_xl.py                               | 6 +++---
 pytorch_pretrained_bert/modeling_transfo_xl_utilities.py | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/run_transfo_xl.py b/examples/run_transfo_xl.py
index b8000a2080..bf0d1a3d38 100644
--- a/examples/run_transfo_xl.py
+++ b/examples/run_transfo_xl.py
@@ -28,7 +28,7 @@ import math
 
 import torch
 
-from pytorch_pretrained_bert import TransfoXLModel, TransfoXLCorpus
+from pytorch_pretrained_bert import TransfoXLLMHeadModel, TransfoXLCorpus
 
 logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                     datefmt = '%m/%d/%Y %H:%M:%S',
@@ -79,7 +79,7 @@ def main():
         device=device, ext_len=args.ext_len)
 
     # Load a pre-trained model
-    model = TransfoXLModel.from_pretrained(args.model_name)
+    model = TransfoXLLMHeadModel.from_pretrained(args.model_name)
     model = model.to(device)
 
     logger.info('Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}'.format(
@@ -139,4 +139,4 @@ def main():
     logger.info('=' * 100)
 
 if __name__ == '__main__':
-    main()
\ No newline at end of file
+    main()
diff --git a/pytorch_pretrained_bert/modeling_transfo_xl_utilities.py b/pytorch_pretrained_bert/modeling_transfo_xl_utilities.py
index 37c38d3776..0a65371c61 100644
--- a/pytorch_pretrained_bert/modeling_transfo_xl_utilities.py
+++ b/pytorch_pretrained_bert/modeling_transfo_xl_utilities.py
@@ -169,7 +169,7 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
 
                 if i == 0:
                     if target is not None:
-                        logprob_i = head_logprob_i.gather(1, target_i[:,None]).squeeze(1)
+                        logprob_i = head_logprob_i.gather(1, target_i[:, None]).squeeze(1)
                     else:
                         out[:, :self.cutoffs[0]] = head_logprob[:, :self.cutoffs[0]]
                 else:

From 9f9909ea2f1ab36ed8f881011feb454b542590df Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Sat, 9 Feb 2019 16:59:21 +0100
Subject: [PATCH 66/82] update readme

---
 README.md | 34 ++++++++++++++++++++++------------
 1 file changed, 22 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index 3601ba5210..df8fbccb2b 100644
--- a/README.md
+++ b/README.md
@@ -817,7 +817,10 @@ python run_lm_finetuning.py \
 
 ### OpenAI GPT and Transformer-XL: running the examples
 
-We provied two examples of scripts for OpenAI GPT and Transformer-XL based on (and extended from) the respective original implementations:
+We provide two examples of scripts for OpenAI GPT and Transformer-XL based on (and extended from) the respective original implementations:
+
+- fine-tuning OpenAI GPT on the ROCStories dataset
+- evaluating Transformer-XL on Wikitext 103
 
 #### Fine-tuning OpenAI GPT on the RocStories dataset
 
@@ -829,21 +832,28 @@ Before running this example you should download the
 ```shell
 export ROC_STORIES_DIR=/path/to/RocStories
 
-python train_openai_gpt.py \
-  --task_name MRPC \
+python run_openai_gpt.py \
+  --model_name openai-gpt \
   --do_train \
   --do_eval \
-  --do_lower_case \
-  --data_dir $GLUE_DIR/MRPC/ \
-  --bert_model bert-base-uncased \
-  --max_seq_length 128 \
-  --train_batch_size 32 \
-  --learning_rate 2e-5 \
-  --num_train_epochs 3.0 \
-  --output_dir /tmp/mrpc_output/
+  --train_dataset $ROC_STORIES_DIR/cloze_test_val__spring2016\ -\ cloze_test_ALL_val.csv \
+  --eval_dataset $ROC_STORIES_DIR/cloze_test_test__spring2016\ -\ cloze_test_ALL_test.csv \
+  --output_dir ../log \
+  --train_batch_size 16 \
 ```
 
-Our test ran on a few seeds with [the original implementation hyper-parameters](https://github.com/google-research/bert#sentence-and-sentence-pair-classification-tasks) gave evaluation results between 84% and 88%.
+This command runs in about 10 min on a single K-80 and gives an evaluation accuracy of 86.42% (the authors report a median accuracy with the TensorFlow code of 85.8% and the OpenAI GPT paper reports a best single run accuracy of 86.5%).
+
+#### Evaluating the pre-trained Transformer-XL on the WikiText 103 dataset
+
+This example code evaluates the pre-trained Transformer-XL on the WikiText 103 dataset.
+This command will download a pre-processed version of the WikiText 103 dataset in which the vocabulary has been computed.
+
+```shell
+python run_transfo_xl.py --work_dir ../log
+```
+
+This command runs in about 10 min on a single K-80 and gives an evaluation accuracy of 86.42% (the authors report a median accuracy with the TensorFlow code of 85.8% and the OpenAI GPT paper reports a best single run accuracy of 86.5%).
 
 ## Fine-tuning BERT-large on GPUs
 

From f0bf81e141e0063125d4fa24ca4a4065a4466c66 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Sat, 9 Feb 2019 17:05:23 +0100
Subject: [PATCH 67/82] back compatibility with Path inputs in file_utils

---
 pytorch_pretrained_bert/file_utils.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/pytorch_pretrained_bert/file_utils.py b/pytorch_pretrained_bert/file_utils.py
index 0b5fc2c217..6954bec0e1 100644
--- a/pytorch_pretrained_bert/file_utils.py
+++ b/pytorch_pretrained_bert/file_utils.py
@@ -12,6 +12,7 @@ import shutil
 import tempfile
 from functools import wraps
 from hashlib import sha256
+import sys
 from io import open
 
 import boto3
@@ -60,6 +61,8 @@ def filename_to_url(filename, cache_dir=None):
     """
     if cache_dir is None:
         cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
+    if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
+        cache_dir = str(cache_dir)
 
     cache_path = os.path.join(cache_dir, filename)
     if not os.path.exists(cache_path):
@@ -86,6 +89,10 @@ def cached_path(url_or_filename, cache_dir=None):
     """
     if cache_dir is None:
         cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
+    if sys.version_info[0] == 3 and isinstance(url_or_filename, Path):
+        url_or_filename = str(url_or_filename)
+    if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
+        cache_dir = str(cache_dir)
 
     parsed = urlparse(url_or_filename)
 
@@ -171,6 +178,8 @@ def get_from_cache(url, cache_dir=None):
     """
     if cache_dir is None:
         cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
+    if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
+        cache_dir = str(cache_dir)
 
     if not os.path.exists(cache_dir):
         os.makedirs(cache_dir)

From 9bdcba53fd01309050c0bba7d8803f96c35b343c Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Sat, 9 Feb 2019 17:07:12 +0100
Subject: [PATCH 68/82] fix tests

---
 tests/modeling_openai_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/modeling_openai_test.py b/tests/modeling_openai_test.py
index 6baaaf677a..1cc8b7d5dc 100644
--- a/tests/modeling_openai_test.py
+++ b/tests/modeling_openai_test.py
@@ -93,7 +93,7 @@ class OpenAIGPTModelTest(unittest.TestCase):
             if self.use_labels:
                 mc_labels = OpenAIGPTModelTest.ids_tensor([self.batch_size], self.type_sequence_label_size)
                 lm_labels = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.num_labels)
-                mc_token_ids = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices], self.seq_length).float()
+                mc_token_ids = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices], self.seq_length)
 
             config = OpenAIGPTConfig(
                 vocab_size_or_config_json_file=self.vocab_size,

From b514a60c360194b9f78f7dbee9dd8fbdf54ff688 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 11 Feb 2019 10:17:16 +0100
Subject: [PATCH 69/82] added tests for OpenAI GPT and Transformer-XL
 tokenizers

---
 README.md                                     |   8 +-
 .../tokenization_openai.py                    |   9 +-
 .../tokenization_transfo_xl.py                | 147 +++++++++++++++---
 tests/tokenization_openai_test.py             |  57 +++++++
 tests/tokenization_transfo_xl_test.py         |  90 +++++++++++
 5 files changed, 286 insertions(+), 25 deletions(-)
 create mode 100644 tests/tokenization_openai_test.py
 create mode 100644 tests/tokenization_transfo_xl_test.py

diff --git a/README.md b/README.md
index df8fbccb2b..607ab3b689 100644
--- a/README.md
+++ b/README.md
@@ -529,10 +529,10 @@ This model *outputs*:
 
 `OpenAIGPTDoubleHeadsModel` includes the `OpenAIGPTModel` Transformer followed by two heads:
 - a language modeling head with weights tied to the input embeddings (no additional parameters) and:
-- a multiple choice classifier (linear layer).
+- a multiple choice classifier (linear layer that takes as input a hidden state in a sequence to compute a score, see details in paper).
 
 *Inputs* are the same as the inputs of the [`OpenAIGPTModel`](#-9.-`OpenAIGPTModel`) class plus a classification mask and two optional labels:
-- `multiple_choice_token_mask`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length] with a value of 1 were the last hidden state is (usually the [CLS] token) and 0 otherwise.
+- `multiple_choice_token_ids`: a torch.LongTensor of shape [batch_size, num_choices] with the index of the token whose hidden state should be used as input for the multiple choice classifier (usually the [CLS] token for each choice).
 - `lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss is only computed for the labels set in [0, ..., vocab_size].
 - `multiple_choice_labels`: optional multiple choice labels: torch.LongTensor of shape [batch_size] with indices selected in [0, ..., num_choices].
 
@@ -613,9 +613,9 @@ Please refer to the doc strings and code in [`tokenization_openai.py`](./pytorch
 
 #### `TransfoXLTokenizer`
 
-`TransfoXLTokenizer` perform word tokenization.
+`TransfoXLTokenizer` performs word tokenization. This tokenizer can be used for adaptive softmax and has utilities for counting tokens in a corpus to create a vocabulary ordered by token frequency (for adaptive softmax). See the adaptive softmax paper ([Efficient softmax approximation for GPUs](http://arxiv.org/abs/1609.04309)) for more details.
 
-Please refer to the doc strings and code in [`tokenization_transfo_xl.py`](./pytorch_pretrained_bert/tokenization_transfo_xl.py) for the details of the `TransfoXLTokenizer`.
+Please refer to the doc strings and code in [`tokenization_transfo_xl.py`](./pytorch_pretrained_bert/tokenization_transfo_xl.py) for the details of these additional methods in `TransfoXLTokenizer`.
 
 ### Optimizers:
 
diff --git a/pytorch_pretrained_bert/tokenization_openai.py b/pytorch_pretrained_bert/tokenization_openai.py
index fcb8e13949..77ba922856 100644
--- a/pytorch_pretrained_bert/tokenization_openai.py
+++ b/pytorch_pretrained_bert/tokenization_openai.py
@@ -70,7 +70,10 @@ def text_standardize(text):
 
 class OpenAIGPTTokenizer(object):
     """
-    mostly a wrapper for a public python bpe tokenizer
+    BPE tokenizer. Peculiarities:
+        - lower case all inputs
+        - uses SpaCy tokenizer
+        - special tokens: additional symbols (ex: "__classify__") to add to a vocabulary.
     """
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
@@ -150,7 +153,7 @@ class OpenAIGPTTokenizer(object):
         logger.info("Special tokens {}".format(self.special_tokens))
 
     def bpe(self, token):
-        word = tuple(token[:-1]) + ( token[-1] + '</w>',)
+        word = tuple(token[:-1]) + (token[-1] + '</w>',)
         if token in self.cache:
             return self.cache[token]
         pairs = get_pairs(word)
@@ -159,7 +162,7 @@ class OpenAIGPTTokenizer(object):
             return token+'</w>'
 
         while True:
-            bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
+            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
             if bigram not in self.bpe_ranks:
                 break
             first, second = bigram
diff --git a/pytorch_pretrained_bert/tokenization_transfo_xl.py b/pytorch_pretrained_bert/tokenization_transfo_xl.py
index 860b274f19..698deae21c 100644
--- a/pytorch_pretrained_bert/tokenization_transfo_xl.py
+++ b/pytorch_pretrained_bert/tokenization_transfo_xl.py
@@ -25,6 +25,7 @@ import os
 import sys
 from collections import Counter, OrderedDict
 from io import open
+import unicodedata
 
 import torch
 import numpy as np
@@ -89,8 +90,8 @@ class TransfoXLTokenizer(object):
             tokenizer.__dict__[key] = value
         return tokenizer
 
-    def __init__(self, special=[], min_freq=0, max_size=None, lower_case=True,
-                 delimiter=None, vocab_file=None):
+    def __init__(self, special=[], min_freq=0, max_size=None, lower_case=False,
+                 delimiter=None, vocab_file=None, never_split=("<unk>", "<eos>", "<formula>")):
         self.counter = Counter()
         self.special = special
         self.min_freq = min_freq
@@ -98,6 +99,7 @@ class TransfoXLTokenizer(object):
         self.lower_case = lower_case
         self.delimiter = delimiter
         self.vocab_file = vocab_file
+        self.never_split = never_split
 
     def count_file(self, path, verbose=False, add_eos=False):
         if verbose: print('counting file {} ...'.format(path))
@@ -132,7 +134,12 @@ class TransfoXLTokenizer(object):
             for line in f:
                 symb = line.strip().split()[0]
                 self.add_symbol(symb)
-        self.unk_idx = self.sym2idx['<UNK>']
+        if '<UNK>' in self.sym2idx:  # accept either spelling of the unknown-token symbol
+            self.unk_idx = self.sym2idx['<UNK>']
+        elif '<unk>' in self.sym2idx:
+            self.unk_idx = self.sym2idx['<unk>']
+        else:
+            raise ValueError('No <unknown> token in vocabulary')
 
     def build_vocab(self):
         if self.vocab_file:
@@ -198,7 +205,7 @@ class TransfoXLTokenizer(object):
             self.sym2idx[sym] = len(self.idx2sym) - 1
 
     def get_sym(self, idx):
-        assert 0 <= idx < len(self), 'Index {} out of range'.format(idx)
+        assert 0 <= idx < len(self), 'Index {} out of vocabulary range'.format(idx)
         return self.idx2sym[idx]
 
     def get_idx(self, sym):
@@ -206,9 +213,16 @@ class TransfoXLTokenizer(object):
             return self.sym2idx[sym]
         else:
             # print('encounter unk {}'.format(sym))
-            assert '<eos>' not in sym
-            assert hasattr(self, 'unk_idx')
-            return self.sym2idx.get(sym, self.unk_idx)
+            # assert '<eos>' not in sym
+            if hasattr(self, 'unk_idx'):
+                return self.sym2idx.get(sym, self.unk_idx)
+            # Backward compatibility with pre-trained models
+            elif '<unk>' in self.sym2idx:
+                return self.sym2idx['<unk>']
+            elif '<UNK>' in self.sym2idx:
+                return self.sym2idx['<UNK>']
+            else:
+                raise ValueError('Token not in vocabulary and no <unk> token in vocabulary for replacement')
 
     def convert_ids_to_tokens(self, indices):
         """Converts a sequence of indices in symbols using the vocab."""
@@ -231,24 +245,82 @@ class TransfoXLTokenizer(object):
     def __len__(self):
         return len(self.idx2sym)
 
-    def tokenize(self, line, add_eos=False, add_double_eos=False):
-        line = line.strip()
-        # convert to lower case
-        if self.lower_case:
-            line = line.lower()
+    def _run_split_on_punc(self, text):
+        """Splits punctuation on a piece of text."""
+        if text in self.never_split:
+            return [text]
+        chars = list(text)
+        i = 0
+        start_new_word = True
+        output = []
+        while i < len(chars):
+            char = chars[i]
+            if _is_punctuation(char):
+                output.append([char])
+                start_new_word = True
+            else:
+                if start_new_word:
+                    output.append([])
+                start_new_word = False
+                output[-1].append(char)
+            i += 1
 
-        # empty delimiter '' will evaluate False
+        return ["".join(x) for x in output]
+
+    def _run_strip_accents(self, text):
+        """Strips accents from a piece of text."""
+        text = unicodedata.normalize("NFD", text)
+        output = []
+        for char in text:
+            cat = unicodedata.category(char)
+            if cat == "Mn":
+                continue
+            output.append(char)
+        return "".join(output)
+
+    def _clean_text(self, text):
+        """Performs invalid character removal and whitespace cleanup on text."""
+        output = []
+        for char in text:
+            cp = ord(char)
+            if cp == 0 or cp == 0xfffd or _is_control(char):
+                continue
+            if _is_whitespace(char):
+                output.append(" ")
+            else:
+                output.append(char)
+        return "".join(output)
+
+    def whitespace_tokenize(self, text):
+        """Runs basic whitespace cleaning and splitting on a piece of text."""
+        text = text.strip()
+        if not text:
+            return []
         if self.delimiter == '':
-            symbols = line
+            tokens = text
         else:
-            symbols = line.split(self.delimiter)
+            tokens = text.split(self.delimiter)
+        return tokens
+
+    def tokenize(self, line, add_eos=False, add_double_eos=False):
+        line = self._clean_text(line)
+        line = line.strip()
+
+        symbols = self.whitespace_tokenize(line)
+
+        split_symbols = []
+        for symbol in symbols:
+            if self.lower_case and symbol not in self.never_split:
+                symbol = symbol.lower()
+                symbol = self._run_strip_accents(symbol)
+            split_symbols.extend(self._run_split_on_punc(symbol))
 
         if add_double_eos: # lm1b
-            return ['<S>'] + symbols + ['<S>']
+            return ['<S>'] + split_symbols + ['<S>']
         elif add_eos:
-            return symbols + ['<eos>']
+            return split_symbols + ['<eos>']
         else:
-            return symbols
+            return split_symbols
 
 
 class LMOrderedIterator(object):
@@ -556,3 +628,42 @@ def get_lm_corpus(datadir, dataset):
         torch.save(corpus, fn)
 
     return corpus
+
+def _is_whitespace(char):
+    """Returns True if `char` is a space, tab, newline, carriage return or in Unicode category Zs."""
+    # \t, \n, and \r are technically control characters but we treat them
+    # as whitespace since they are generally considered as such.
+    if char == " " or char == "\t" or char == "\n" or char == "\r":
+        return True
+    cat = unicodedata.category(char)
+    if cat == "Zs":  # Unicode "Separator, space" general category
+        return True
+    return False
+
+
+def _is_control(char):
+    """Returns True if `char` is in a Unicode "C*" category, except tab, newline and carriage return."""
+    # These are technically control characters but we count them as whitespace
+    # characters.
+    if char == "\t" or char == "\n" or char == "\r":
+        return False
+    cat = unicodedata.category(char)
+    if cat.startswith("C"):  # any general category whose code begins with "C"
+        return True
+    return False
+
+
+def _is_punctuation(char):
+    """Returns True if `char` is non-alphanumeric printable ASCII or in a Unicode "P*" category."""
+    cp = ord(char)
+    # We treat all non-letter/number ASCII as punctuation.
+    # Characters such as "^", "$", and "`" are not in the Unicode
+    # Punctuation class but we treat them as punctuation anyways, for
+    # consistency.
+    if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
+            (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
+        return True
+    cat = unicodedata.category(char)
+    if cat.startswith("P"):  # any general category whose code begins with "P"
+        return True
+    return False
diff --git a/tests/tokenization_openai_test.py b/tests/tokenization_openai_test.py
new file mode 100644
index 0000000000..dadcd9699a
--- /dev/null
+++ b/tests/tokenization_openai_test.py
@@ -0,0 +1,57 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import os
+import unittest
+import json
+from io import open
+
+from pytorch_pretrained_bert.tokenization_openai import OpenAIGPTTokenizer
+
+
+class OpenAIGPTTokenizationTest(unittest.TestCase):
+
+    def test_full_tokenizer(self):
+        """ Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt """
+        vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
+                 "w</w>", "r</w>", "t</w>",
+                 "lo", "low", "er</w>",
+                 "low</w>", "lowest</w>", "newer</w>", "wider</w>"]
+        vocab_tokens = dict(zip(vocab, range(len(vocab))))
+        merges = ["#version: 0.2", "l o", "lo w", "e r</w>", ""]
+        with open("/tmp/openai_tokenizer_vocab_test.json", "w", encoding='utf-8') as fp:
+            json.dump(vocab_tokens, fp)
+            vocab_file = fp.name
+        with open("/tmp/openai_tokenizer_merges_test.txt", "w", encoding='utf-8') as fp:
+            fp.write("\n".join(merges))
+            merges_file = fp.name
+
+        tokenizer = OpenAIGPTTokenizer(vocab_file, merges_file, special_tokens=["<unk>"])
+        os.remove(vocab_file)
+        os.remove(merges_file)
+
+        text = "lower"
+        bpe_tokens = ["low", "er</w>"]
+        tokens = tokenizer.tokenize(text)
+        self.assertListEqual(tokens, bpe_tokens)
+
+        input_tokens = tokens + ["<unk>"]
+        input_bpe_tokens = [14, 15, 20]
+        self.assertListEqual(
+            tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/tokenization_transfo_xl_test.py b/tests/tokenization_transfo_xl_test.py
new file mode 100644
index 0000000000..9ff04f5f34
--- /dev/null
+++ b/tests/tokenization_transfo_xl_test.py
@@ -0,0 +1,90 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import os
+import unittest
+from io import open
+
+from pytorch_pretrained_bert.tokenization_transfo_xl import (TransfoXLTokenizer,
+                                                  _is_control, _is_punctuation,
+                                                  _is_whitespace)
+
+
+class TransfoXLTokenizationTest(unittest.TestCase):
+
+    def test_full_tokenizer(self):
+        vocab_tokens = [
+            "<unk>", "[CLS]", "[SEP]", "want", "unwanted", "wa", "un", "running", ","
+        ]
+        with open("/tmp/transfo_xl_tokenizer_test.txt", "w", encoding='utf-8') as vocab_writer:
+            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
+            vocab_file = vocab_writer.name
+
+        tokenizer = TransfoXLTokenizer(vocab_file=vocab_file, lower_case=True)
+        tokenizer.build_vocab()
+        os.remove(vocab_file)
+
+        tokens = tokenizer.tokenize(u"<unk> UNwant\u00E9d,running")
+        self.assertListEqual(tokens, ["<unk>", "unwanted", ",", "running"])
+
+        self.assertListEqual(
+            tokenizer.convert_tokens_to_ids(tokens), [0, 4, 8, 7])
+
+    def test_full_tokenizer_lower(self):
+        tokenizer = TransfoXLTokenizer(lower_case=True)
+
+        self.assertListEqual(
+            tokenizer.tokenize(u" \tHeLLo!how  \n Are yoU?  "),
+            ["hello", "!", "how", "are", "you", "?"])
+        self.assertListEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"])
+
+    def test_full_tokenizer_no_lower(self):
+        tokenizer = TransfoXLTokenizer(lower_case=False)
+
+        self.assertListEqual(
+            tokenizer.tokenize(u" \tHeLLo!how  \n Are yoU?  "),
+            ["HeLLo", "!", "how", "Are", "yoU", "?"])
+
+    def test_is_whitespace(self):
+        self.assertTrue(_is_whitespace(u" "))
+        self.assertTrue(_is_whitespace(u"\t"))
+        self.assertTrue(_is_whitespace(u"\r"))
+        self.assertTrue(_is_whitespace(u"\n"))
+        self.assertTrue(_is_whitespace(u"\u00A0"))
+
+        self.assertFalse(_is_whitespace(u"A"))
+        self.assertFalse(_is_whitespace(u"-"))
+
+    def test_is_control(self):
+        self.assertTrue(_is_control(u"\u0005"))
+
+        self.assertFalse(_is_control(u"A"))
+        self.assertFalse(_is_control(u" "))
+        self.assertFalse(_is_control(u"\t"))
+        self.assertFalse(_is_control(u"\r"))
+
+    def test_is_punctuation(self):
+        self.assertTrue(_is_punctuation(u"-"))
+        self.assertTrue(_is_punctuation(u"$"))
+        self.assertTrue(_is_punctuation(u"`"))
+        self.assertTrue(_is_punctuation(u"."))
+
+        self.assertFalse(_is_punctuation(u"A"))
+        self.assertFalse(_is_punctuation(u" "))
+
+
+if __name__ == '__main__':
+    unittest.main()

From 525eba68ab0d3c638a542ccc48150bbef0fb38af Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 11 Feb 2019 10:19:25 +0100
Subject: [PATCH 70/82] update Circle CI

---
 .circleci/config.yml | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 0efb5f4b0b..9970258d01 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -7,7 +7,8 @@ jobs:
         steps:
             - checkout
             - run: sudo pip install --progress-bar off .
-            - run: sudo pip install pytest
+            - run: sudo pip install pytest ftfy spacy
+            - run: python -m spacy download en
             - run: python -m pytest -sv tests/
     build_py2:
         working_directory: ~/pytorch-pretrained-BERT
@@ -16,7 +17,8 @@ jobs:
         steps:
             - checkout
             - run: sudo pip install --progress-bar off .
-            - run: sudo pip install pytest
+            - run: sudo pip install pytest ftfy spacy
+            - run: python -m spacy download en
             - run: python -m pytest -sv tests/
 workflows:
   version: 2

From 8197eb9f1049f25e650291ed6e141a9fa9c62df3 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 11 Feb 2019 10:22:10 +0100
Subject: [PATCH 71/82] update Circle CI config

---
 .circleci/config.yml | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 9970258d01..b57b478030 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -8,7 +8,7 @@ jobs:
             - checkout
             - run: sudo pip install --progress-bar off .
             - run: sudo pip install pytest ftfy spacy
-            - run: python -m spacy download en
+            - run: sudo python -m spacy download en
             - run: python -m pytest -sv tests/
     build_py2:
         working_directory: ~/pytorch-pretrained-BERT
@@ -17,8 +17,9 @@ jobs:
         steps:
             - checkout
             - run: sudo pip install --progress-bar off .
-            - run: sudo pip install pytest ftfy spacy
-            - run: python -m spacy download en
+            - run: sudo pip install pytest spacy
+            - run: sudo pip install ftfy==4.4.3
+            - run: sudo python -m spacy download en
             - run: python -m pytest -sv tests/
 workflows:
   version: 2

From 2071a9b86e7bc533a52f4fa03f89f8adc2a25bc2 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 11 Feb 2019 10:35:36 +0100
Subject: [PATCH 72/82] fix python 2.7 imports

---
 README.md                             | 13 +++++++++++++
 pytorch_pretrained_bert/file_utils.py |  2 +-
 tests/tokenization_openai_test.py     |  2 +-
 3 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 607ab3b689..4549c6ffd0 100644
--- a/README.md
+++ b/README.md
@@ -45,6 +45,12 @@ PyTorch pretrained bert can be installed by pip as follows:
 pip install pytorch-pretrained-bert
 ```
 
+If you want to use the tokenizer associated with the `OpenAI GPT` model, you will need to install `ftfy` (if you are using Python 2, version 4.4.3 is the last version working for you) and `SpaCy`:
+```bash
+pip install spacy ftfy==4.4.3
+python -m spacy download en
+```
+
 ### From source
 
 Clone the repository and run:
@@ -52,6 +58,13 @@ Clone the repository and run:
 pip install [--editable] .
 ```
 
+Here also, if you want to use `OpenAIGPT` tokenizer, you will need to install `ftfy` (limit to version 4.4.3 if you are using Python 2) and `SpaCy` :
+```bash
+pip install spacy ftfy==4.4.3
+python -m spacy download en
+```
+
+
 A series of tests is included in the [tests folder](https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/tests) and can be run using `pytest` (install pytest if needed: `pip install pytest`).
 
 You can run the tests with the command:
diff --git a/pytorch_pretrained_bert/file_utils.py b/pytorch_pretrained_bert/file_utils.py
index 6954bec0e1..b475d450f6 100644
--- a/pytorch_pretrained_bert/file_utils.py
+++ b/pytorch_pretrained_bert/file_utils.py
@@ -29,7 +29,7 @@ try:
     from pathlib import Path
     PYTORCH_PRETRAINED_BERT_CACHE = Path(os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
                                                    Path.home() / '.pytorch_pretrained_bert'))
-except ImportError:
+except (AttributeError, ImportError):  # ImportError: no pathlib; AttributeError: e.g. Path.home unavailable
     PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
                                               os.path.join(os.path.expanduser("~"), '.pytorch_pretrained_bert'))
 
 
diff --git a/tests/tokenization_openai_test.py b/tests/tokenization_openai_test.py
index dadcd9699a..8a67015ffd 100644
--- a/tests/tokenization_openai_test.py
+++ b/tests/tokenization_openai_test.py
@@ -32,7 +32,7 @@ class OpenAIGPTTokenizationTest(unittest.TestCase):
                  "low</w>", "lowest</w>", "newer</w>", "wider</w>"]
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
         merges = ["#version: 0.2", "l o", "lo w", "e r</w>", ""]
-        with open("/tmp/openai_tokenizer_vocab_test.json", "w", encoding='utf-8') as fp:
+        with open("/tmp/openai_tokenizer_vocab_test.json", "wb") as fp:
             json.dump(vocab_tokens, fp)
             vocab_file = fp.name
         with open("/tmp/openai_tokenizer_merges_test.txt", "w", encoding='utf-8') as fp:

From 0a9860daa73d5c6c026b15e4e60ec0d77851726e Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 11 Feb 2019 10:47:52 +0100
Subject: [PATCH 73/82] tests pass on python 2 and 3

---
 tests/tokenization_openai_test.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tests/tokenization_openai_test.py b/tests/tokenization_openai_test.py
index 8a67015ffd..6213eb1b03 100644
--- a/tests/tokenization_openai_test.py
+++ b/tests/tokenization_openai_test.py
@@ -17,7 +17,6 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 import os
 import unittest
 import json
-from io import open
 
 from pytorch_pretrained_bert.tokenization_openai import OpenAIGPTTokenizer
 
@@ -32,10 +31,10 @@ class OpenAIGPTTokenizationTest(unittest.TestCase):
                  "low</w>", "lowest</w>", "newer</w>", "wider</w>"]
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
         merges = ["#version: 0.2", "l o", "lo w", "e r</w>", ""]
-        with open("/tmp/openai_tokenizer_vocab_test.json", "wb") as fp:
+        with open("/tmp/openai_tokenizer_vocab_test.json", "w") as fp:
             json.dump(vocab_tokens, fp)
             vocab_file = fp.name
-        with open("/tmp/openai_tokenizer_merges_test.txt", "w", encoding='utf-8') as fp:
+        with open("/tmp/openai_tokenizer_merges_test.txt", "w") as fp:
             fp.write("\n".join(merges))
             merges_file = fp.name
 

From b31ba239132fc89a5ec076827abdbdc84d138c51 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 11 Feb 2019 12:15:43 +0100
Subject: [PATCH 74/82] cuda on in the examples by default

---
 README.md                  | 72 ++++++++++++++++++++++++++++----------
 examples/run_transfo_xl.py |  7 ++--
 2 files changed, 58 insertions(+), 21 deletions(-)

diff --git a/README.md b/README.md
index 4549c6ffd0..6a1831ac98 100644
--- a/README.md
+++ b/README.md
@@ -187,8 +187,14 @@ Let's see how to use `BertModel` to get hidden states
 model = BertModel.from_pretrained('bert-base-uncased')
 model.eval()
 
+# If you have a GPU, put everything on cuda
+tokens_tensor = tokens_tensor.to('cuda')
+segments_tensors = segments_tensors.to('cuda')
+model.to('cuda')
+
 # Predict hidden states features for each layer
-encoded_layers, _ = model(tokens_tensor, segments_tensors)
+with torch.no_grad():
+    encoded_layers, _ = model(tokens_tensor, segments_tensors)
 # We have a hidden states for each of the 12 layers in model bert-base-uncased
 assert len(encoded_layers) == 12
 ```
@@ -200,8 +206,14 @@ And how to use `BertForMaskedLM`
 model = BertForMaskedLM.from_pretrained('bert-base-uncased')
 model.eval()
 
+# If you have a GPU, put everything on cuda
+tokens_tensor = tokens_tensor.to('cuda')
+segments_tensors = segments_tensors.to('cuda')
+model.to('cuda')
+
 # Predict all tokens
-predictions = model(tokens_tensor, segments_tensors)
+with torch.no_grad():
+    predictions = model(tokens_tensor, segments_tensors)
 
 # confirm we were able to predict 'henson'
 predicted_index = torch.argmax(predictions[0, masked_index]).item()
@@ -240,8 +252,13 @@ Let's see how to use `OpenAIGPTModel` to get hidden states
 model = OpenAIGPTModel.from_pretrained('openai-gpt')
 model.eval()
 
+# If you have a GPU, put everything on cuda
+tokens_tensor = tokens_tensor.to('cuda')
+model.to('cuda')
+
 # Predict hidden states features for each layer
-hidden_states = model(tokens_tensor)
+with torch.no_grad():
+    hidden_states = model(tokens_tensor)
 ```
 
 And how to use `OpenAIGPTLMHeadModel`
@@ -251,19 +268,25 @@ And how to use `OpenAIGPTLMHeadModel`
 model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
 model.eval()
 
+# If you have a GPU, put everything on cuda
+tokens_tensor = tokens_tensor.to('cuda')
+model.to('cuda')
+
 # Predict all tokens
-predictions = model(tokens_tensor)
+with torch.no_grad():
+    predictions = model(tokens_tensor)
 
 # get the predicted last token
-predicted_index = torch.argmax(predictions[0, masked_index]).item()
+predicted_index = torch.argmax(predictions[0, -1, :]).item()
 predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
+assert predicted_token == '.</w>'
 ```
 
 ### Transformer-XL
 
-Here is a quick-start example using `OpenAIGPTTokenizer`, `OpenAIGPTModel` and `OpenAIGPTLMHeadModel` class with OpenAI's pre-trained  model. See the [doc section](#doc) below for all the details on these classes.
+Here is a quick-start example using the `TransfoXLTokenizer`, `TransfoXLModel` and `TransfoXLLMHeadModel` classes with the Transformer-XL model pre-trained on WikiText-103. See the [doc section](#doc) below for all the details on these classes.
 
-First let's prepare a tokenized input with `OpenAIGPTTokenizer`
+First let's prepare a tokenized input with `TransfoXLTokenizer`
 
 ```python
 import torch
@@ -294,27 +317,40 @@ Let's see how to use `TransfoXLModel` to get hidden states
 model = TransfoXLModel.from_pretrained('transfo-xl-wt103')
 model.eval()
 
-# Predict hidden states features for each layer
-hidden_states_1, mems_1 = model(tokens_tensor_1)
-# We can re-use the memory cells in a subsequent call to attend a longer context
-hidden_states_2, mems_2 = model(tokens_tensor_2, mems_1)
+# If you have a GPU, put everything on cuda
+tokens_tensor_1 = tokens_tensor_1.to('cuda')
+tokens_tensor_2 = tokens_tensor_2.to('cuda')
+model.to('cuda')
+
+with torch.no_grad():
+    # Predict hidden states features for each layer
+    hidden_states_1, mems_1 = model(tokens_tensor_1)
+    # We can re-use the memory cells in a subsequent call to attend a longer context
+    hidden_states_2, mems_2 = model(tokens_tensor_2, mems=mems_1)
 ```
 
-And how to use `OpenAIGPTLMHeadModel`
+And how to use `TransfoXLLMHeadModel`
 
 ```python
 # Load pre-trained model (weights)
-model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
+model = TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
 model.eval()
 
-# Predict all tokens
-predictions_1, mems_1 = model(tokens_tensor_1)
-# We can re-use the memory cells in a subsequent call to attend a longer context
-predictions_2, mems_2 = model(tokens_tensor_2, mems_1)
+# If you have a GPU, put everything on cuda
+tokens_tensor_1 = tokens_tensor_1.to('cuda')
+tokens_tensor_2 = tokens_tensor_2.to('cuda')
+model.to('cuda')
+
+with torch.no_grad():
+    # Predict all tokens
+    predictions_1, mems_1 = model(tokens_tensor_1)
+    # We can re-use the memory cells in a subsequent call to attend a longer context
+    predictions_2, mems_2 = model(tokens_tensor_2, mems=mems_1)
 
 # get the predicted last token
-predicted_index = torch.argmax(predictions_1[0, masked_index]).item()
+predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
 predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
+assert predicted_token == '.</w>'
 ```
 
 ## Doc
diff --git a/examples/run_transfo_xl.py b/examples/run_transfo_xl.py
index bf0d1a3d38..97c61777a4 100644
--- a/examples/run_transfo_xl.py
+++ b/examples/run_transfo_xl.py
@@ -52,8 +52,8 @@ def main():
                         help='length of the retained previous heads')
     parser.add_argument('--clamp_len', type=int, default=1000,
                         help='max positional embedding index')
-    parser.add_argument('--cuda', action='store_true',
-                        help='use CUDA')
+    parser.add_argument('--no_cuda', action='store_true',
+                        help='Do not use CUDA even though CUA is available')
     parser.add_argument('--work_dir', type=str, required=True,
                         help='path to the work_dir')
     parser.add_argument('--no_log', action='store_true',
@@ -63,7 +63,8 @@ def main():
     args = parser.parse_args()
     assert args.ext_len >= 0, 'extended context length must be non-negative'
 
-    device = torch.device("cuda" if args.cuda else "cpu")
+    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+    logger.info("device: {}".format(device))
 
     # Load a pre-processed dataset
     # You can also build the corpus yourself using TransfoXLCorpus methods

From 32fea876bb9389a92791c8a633f811c297d4a77d Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 11 Feb 2019 12:53:32 +0100
Subject: [PATCH 75/82] add distant debugging to run_transfo_xl

---
 README.md                  | 4 ++--
 examples/run_transfo_xl.py | 9 +++++++++
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 6a1831ac98..e866506c6d 100644
--- a/README.md
+++ b/README.md
@@ -891,7 +891,7 @@ python run_openai_gpt.py \
   --train_batch_size 16 \
 ```
 
-This command run in about 10 min on a single K-80 an gives an evaluation accuracy of 86.42% (the authors reports a median accuracy with the TensorFlow code of 85.8% and the OpenAI GPT paper reports a best single run accuracy of 86.5%).
+This command runs in about 10 min on a single K-80 and gives an evaluation accuracy of about 86.4% (the authors report a median accuracy with the TensorFlow code of 85.8% and the OpenAI GPT paper reports a best single run accuracy of 86.5%).
 
 #### Evaluating the pre-trained Transformer-XL on the WikiText 103 dataset
 
@@ -902,7 +902,7 @@ This command will download a pre-processed version of the WikiText 103 dataset i
 python run_transfo_xl.py --work_dir ../log
 ```
 
-This command run in about 10 min on a single K-80 an gives an evaluation accuracy of 86.42% (the authors reports a median accuracy with the TensorFlow code of 85.8% and the OpenAI GPT paper reports a best single run accuracy of 86.5%).
+This command runs in about 1 min on a V100 and gives an evaluation perplexity of 18.22 on WikiText-103 (the authors report a perplexity of about 18.3 on this dataset with the TensorFlow code).
 
 ## Fine-tuning BERT-large on GPUs
 
diff --git a/examples/run_transfo_xl.py b/examples/run_transfo_xl.py
index 97c61777a4..06d37a719f 100644
--- a/examples/run_transfo_xl.py
+++ b/examples/run_transfo_xl.py
@@ -60,9 +60,18 @@ def main():
                         help='do not log the eval result')
     parser.add_argument('--same_length', action='store_true',
                         help='set same length attention with masking')
+    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
+    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
     args = parser.parse_args()
     assert args.ext_len >= 0, 'extended context length must be non-negative'
 
+    if args.server_ip and args.server_port:
+        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
+        import ptvsd
+        print("Waiting for debugger attach")
+        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
+        ptvsd.wait_for_attach()
+
     device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
     logger.info("device: {}".format(device))
 

From 884ca81d879a51fa48756aa61c2ac25a2035ee3c Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 11 Feb 2019 13:19:59 +0100
Subject: [PATCH 76/82] transposing the inputs of Transformer-XL to have a
 unified interface

---
 README.md                                     | 16 +++---
 .../modeling_transfo_xl.py                    | 53 ++++++++++++-------
 .../tokenization_transfo_xl.py                |  2 +-
 tests/modeling_transfo_xl_test.py             | 30 +++++------
 4 files changed, 57 insertions(+), 44 deletions(-)

diff --git a/README.md b/README.md
index e866506c6d..8265c5d246 100644
--- a/README.md
+++ b/README.md
@@ -603,25 +603,25 @@ Transformer XL use a relative positioning with sinusiodal patterns and adaptive
 
 This model takes as *inputs*:
 [`modeling_transfo_xl.py`](./pytorch_pretrained_bert/modeling_transfo_xl.py)
-- `input_ids`: a torch.LongTensor of shape [sequence_length, batch_size] with the token indices selected in the range [0, self.config.n_token[
-- `mems`: an optional memory of hidden states from previous forward passes as a list (num layers) of hidden states at the entry of each layer. Each hidden states has shape [self.config.mem_len, bsz, self.config.d_model]
+- `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] with the token indices selected in the range [0, self.config.n_token[
+- `mems`: an optional memory of hidden states from previous forward passes as a list (num layers) of hidden states at the entry of each layer. Each hidden states has shape [self.config.mem_len, bsz, self.config.d_model]. Note that the first two dimensions are transposed in `mems` with regards to `input_ids`.
 
 This model *outputs* a tuple of (last_hidden_state, new_mems)
-- `last_hidden_state`: the encoded-hidden-states at the top of the model as a torch.FloatTensor of size [sequence_length, batch_size, self.config.d_model]
-- `new_mems`: list (num layers) of updated mem states at the entry of each layer each mem state is a torch.FloatTensor of size [self.config.mem_len, batch_size, self.config.d_model]
+- `last_hidden_state`: the encoded-hidden-states at the top of the model as a torch.FloatTensor of size [batch_size, sequence_length, self.config.d_model]
+- `new_mems`: list (num layers) of updated mem states at the entry of each layer each mem state is a torch.FloatTensor of size [self.config.mem_len, batch_size, self.config.d_model]. Note that the first two dimensions are transposed in `mems` with regards to `input_ids`.
 
 #### 13. `TransfoXLLMHeadModel`
 
 `TransfoXLLMHeadModel` includes the `TransfoXLModel` Transformer followed by an (adaptive) softmax head with weights tied to the input embeddings.
 
 *Inputs* are the same as the inputs of the [`TransfoXLModel`](#-12.-`TransfoXLModel`) class plus optional labels:
-- `target`: an optional torch.LongTensor of shape [sequence_length, batch_size] with the target token indices selected in the range [0, self.config.n_token[
+- `target`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the target token indices selected in the range [0, self.config.n_token[
 
 *Outputs* a tuple of (last_hidden_state, new_mems)
 - `softmax_output`: output of the (adaptive) softmax:
-  - if target is None: Negative log likelihood of shape :: [len, bsz]
-  - else: log probabilities of tokens, shape :: [len, bsz, n_tokens]
-- `new_mems`: list (num layers) of updated mem states at the entry of each layer each mem state is a torch.FloatTensor of size [self.config.mem_len, batch_size, self.config.d_model]
+  - if target is None: log probabilities of tokens, shape [batch_size, sequence_length, n_tokens]
+  - else: Negative log likelihood of the target tokens, shape [batch_size, sequence_length]
+- `new_mems`: list (num layers) of updated mem states at the entry of each layer each mem state is a torch.FloatTensor of size [self.config.mem_len, batch_size, self.config.d_model]. Note that the first two dimensions are transposed in `mems` with regards to `input_ids`.
 
 
 ### Tokenizers:
diff --git a/pytorch_pretrained_bert/modeling_transfo_xl.py b/pytorch_pretrained_bert/modeling_transfo_xl.py
index f3498944f5..714a9d9846 100644
--- a/pytorch_pretrained_bert/modeling_transfo_xl.py
+++ b/pytorch_pretrained_bert/modeling_transfo_xl.py
@@ -986,17 +986,19 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
         config: a TransfoXLConfig class instance with the configuration to build a new model
 
     Inputs:
-        `input_ids`: a torch.LongTensor of shape [sequence_length, batch_size]
+        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
             with the token indices selected in the range [0, self.config.n_token[
         `mems`: optional memomry of hidden states from previous forward passes
             as a list (num layers) of hidden states at the entry of each layer
             each hidden states has shape [self.config.mem_len, bsz, self.config.d_model]
+            Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `target`
     Outputs:
         A tuple of (last_hidden_state, new_mems)
         `last_hidden_state`: the encoded-hidden-states at the top of the model
-            as a torch.FloatTensor of size [sequence_length, batch_size, self.config.d_model]
+            as a torch.FloatTensor of size [batch_size, sequence_length, self.config.d_model]
         `new_mems`: list (num layers) of updated mem states at the entry of each layer
             each mem state is a torch.FloatTensor of size [self.config.mem_len, batch_size, self.config.d_model]
+            Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `target`
 
     Example usage:
     ```python
@@ -1225,20 +1227,28 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
 
     def forward(self, input_ids, mems=None):
         """ Params:
-                input_ids :: [len, bsz]
+                input_ids :: [bsz, len]
                 mems :: optional mems from previous forwar passes (or init_mems)
                     list (num layers) of mem states at the entry of each layer
                         shape :: [self.config.mem_len, bsz, self.config.d_model]
+                    Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `target`
             Returns:
                 tuple (last_hidden, new_mems) where:
                     new_mems: list (num layers) of mem states at the entry of each layer
                         shape :: [self.config.mem_len, bsz, self.config.d_model]
                     last_hidden: output of the last layer:
-                        shape :: [len, bsz, self.config.d_model]
+                        shape :: [bsz, len, self.config.d_model]
         """
+        # the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library
+        # so we transpose here from shape [bsz, len] to shape [len, bsz]
+        input_ids = input_ids.transpose(0, 1).contiguous()
+
         if mems is None:
             mems = self.init_mems(input_ids)
         last_hidden, new_mems = self._forward(input_ids, mems=mems)
+
+        # We transpose back here to shape [bsz, len, hidden_dim]
+        last_hidden = last_hidden.transpose(0, 1).contiguous()
         return (last_hidden, new_mems)
 
 
@@ -1257,23 +1267,25 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
         config: a TransfoXLConfig class instance with the configuration to build a new model
 
     Inputs:
-        `input_ids`: a torch.LongTensor of shape [sequence_length, batch_size]
+        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
             with the token indices selected in the range [0, self.config.n_token[
-        `target`: an optional torch.LongTensor of shape [sequence_length, batch_size]
+        `target`: an optional torch.LongTensor of shape [batch_size, sequence_length]
             with the target token indices selected in the range [0, self.config.n_token[
         `mems`: an optional memory of hidden states from previous forward passes
             as a list (num layers) of hidden states at the entry of each layer
             each hidden states has shape [self.config.mem_len, bsz, self.config.d_model]
+            Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `target`
 
     Outputs:
         A tuple of (last_hidden_state, new_mems)
         `softmax_output`: output of the (adaptive) softmax:
             if target is None:
-                Negative log likelihood of shape :: [len, bsz] 
+                log probabilities of tokens, shape [batch_size, sequence_length, n_tokens]
             else:
-                log probabilities of tokens, shape :: [len, bsz, n_tokens]
+                Negative log likelihood of shape [batch_size, sequence_length]
         `new_mems`: list (num layers) of updated mem states at the entry of each layer
             each mem state is a torch.FloatTensor of size [self.config.mem_len, batch_size, self.config.d_model]
+            Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `target`
 
     Example usage:
     ```python
@@ -1287,7 +1299,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
     last_hidden_state, new_mems = model(input_ids)
 
     # Another time on input_ids_next using the memory:
-    last_hidden_state, new_mems = model(input_ids_next, new_mems)
+    last_hidden_state, new_mems = model(input_ids_next, mems=new_mems)
     ```
     """
     def __init__(self, config):
@@ -1331,33 +1343,34 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
 
     def forward(self, input_ids, target=None, mems=None):
         """ Params:
-                input_ids :: [len, bsz]
-                target :: [len, bsz]
+                input_ids :: [bsz, len]
+                target :: [bsz, len]
             Returns:
                 tuple(softmax_output, new_mems) where:
                     new_mems: list (num layers) of hidden states at the entry of each layer
-                        shape :: [mem_len, bsz, self.config.d_model]
+                        shape :: [mem_len, bsz, self.config.d_model] :: Warning: shapes are transposed here with regard to input_ids
                     softmax_output: output of the (adaptive) softmax:
                         if target is None:
-                            Negative log likelihood of shape :: [len, bsz] 
+                            log probabilities of tokens, shape :: [bsz, len, n_tokens]
                         else:
-                            log probabilities of tokens, shape :: [len, bsz, n_tokens]
+                            Negative log likelihood of shape :: [bsz, len]
         """
-        bsz = input_ids.size(1)
-        tgt_len = input_ids.size(0)
+        bsz = input_ids.size(0)
+        tgt_len = input_ids.size(1)
 
         last_hidden, new_mems = self.transformer(input_ids, mems)
 
-        pred_hid = last_hidden[-tgt_len:]
+        pred_hid = last_hidden[:, -tgt_len:]
         if self.sample_softmax > 0 and self.training:
             assert self.config.tie_weight
             logit = sample_logits(self.transformer.word_emb, self.out_layer.bias, target, pred_hid, self.sampler)
-            loss = -F.log_softmax(logit, -1)[:, :, 0]
+            softmax_output = -F.log_softmax(logit, -1)[:, :, 0]
         else:
             softmax_output = self.crit(pred_hid.view(-1, pred_hid.size(-1)), target)
             if target is None:
-                softmax_output = softmax_output.view(tgt_len, bsz, -1)
+                softmax_output = softmax_output.view(bsz, tgt_len, -1)
             else:
-                softmax_output = softmax_output.view(tgt_len, bsz)
+                softmax_output = softmax_output.view(bsz, tgt_len)
 
+        # The outputs reshaped above are already batch-first ([bsz, len, ...]); no transpose is done here
         return (softmax_output, new_mems)
diff --git a/pytorch_pretrained_bert/tokenization_transfo_xl.py b/pytorch_pretrained_bert/tokenization_transfo_xl.py
index 698deae21c..585a815923 100644
--- a/pytorch_pretrained_bert/tokenization_transfo_xl.py
+++ b/pytorch_pretrained_bert/tokenization_transfo_xl.py
@@ -507,7 +507,7 @@ class TransfoXLCorpus(object):
             resolved_corpus_file = cached_path(corpus_file, cache_dir=cache_dir)
         except EnvironmentError:
             logger.error(
-                "Model name '{}' was not found in model name list ({}). "
+                "Corpus '{}' was not found in corpus list ({}). "
                 "We assumed '{}' was a path or url but couldn't find files {} "
                 "at this path or url.".format(
                     pretrained_model_name_or_path,
diff --git a/tests/modeling_transfo_xl_test.py b/tests/modeling_transfo_xl_test.py
index 0bc16daf4c..291d5d9d2a 100644
--- a/tests/modeling_transfo_xl_test.py
+++ b/tests/modeling_transfo_xl_test.py
@@ -67,12 +67,12 @@ class TransfoXLModelTest(unittest.TestCase):
             self.seed = seed
 
         def prepare_config_and_inputs(self):
-            input_ids_1 = TransfoXLModelTest.ids_tensor([self.seq_length, self.batch_size], self.vocab_size)
-            input_ids_2 = TransfoXLModelTest.ids_tensor([self.seq_length, self.batch_size], self.vocab_size)
+            input_ids_1 = TransfoXLModelTest.ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+            input_ids_2 = TransfoXLModelTest.ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
 
             lm_labels = None
             if self.use_labels:
-                lm_labels = TransfoXLModelTest.ids_tensor([self.seq_length, self.batch_size], self.vocab_size)
+                lm_labels = TransfoXLModelTest.ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
 
             config = TransfoXLConfig(
                 vocab_size_or_config_json_file=self.vocab_size,
@@ -110,13 +110,13 @@ class TransfoXLModelTest(unittest.TestCase):
         def check_transfo_xl_model_output(self, result):
             self.parent.assertListEqual(
                 list(result["hidden_states_1"].size()),
-                [self.seq_length, self.batch_size, self.d_model])
+                [self.batch_size, self.seq_length, self.d_model])
+            self.parent.assertListEqual(
+                list(result["hidden_states_2"].size()),
+                [self.batch_size, self.seq_length, self.d_model])
             self.parent.assertListEqual(
                 list(list(mem.size()) for mem in result["mems_1"]),
                 [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer)
-            self.parent.assertListEqual(
-                list(result["hidden_states_2"].size()),
-                [self.seq_length, self.batch_size, self.d_model])
             self.parent.assertListEqual(
                 list(list(mem.size()) for mem in result["mems_2"]),
                 [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer)
@@ -147,13 +147,13 @@ class TransfoXLModelTest(unittest.TestCase):
         def check_transfo_xl_lm_head_output(self, result):
             self.parent.assertListEqual(
                 list(result["loss_1"].size()),
-                [self.seq_length, self.batch_size])
+                [self.batch_size, self.seq_length])
+            self.parent.assertListEqual(
+                list(result["lm_logits_1"].size()),
+                [self.batch_size, self.seq_length, self.vocab_size])
             self.parent.assertListEqual(
                 list(list(mem.size()) for mem in result["mems_1a"]),
                 [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer)
-            self.parent.assertListEqual(
-                list(result["lm_logits_1"].size()),
-                [self.seq_length, self.batch_size, self.vocab_size])
             self.parent.assertListEqual(
                 list(list(mem.size()) for mem in result["mems_1b"]),
                 [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer)
@@ -163,13 +163,13 @@ class TransfoXLModelTest(unittest.TestCase):
 
             self.parent.assertListEqual(
                 list(result["loss_2"].size()),
-                [self.seq_length, self.batch_size])
+                [self.batch_size, self.seq_length])
+            self.parent.assertListEqual(
+                list(result["lm_logits_2"].size()),
+                [self.batch_size, self.seq_length, self.vocab_size])
             self.parent.assertListEqual(
                 list(list(mem.size()) for mem in result["mems_2a"]),
                 [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer)
-            self.parent.assertListEqual(
-                list(result["lm_logits_2"].size()),
-                [self.seq_length, self.batch_size, self.vocab_size])
             self.parent.assertListEqual(
                 list(list(mem.size()) for mem in result["mems_2b"]),
                 [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer)

From e8fe6b7140a3b48c72a5ef528099d2518856124d Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 11 Feb 2019 13:30:04 +0100
Subject: [PATCH 77/82] adapting transfo tokenizer to transposed inputs

---
 pytorch_pretrained_bert/tokenization_transfo_xl.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/pytorch_pretrained_bert/tokenization_transfo_xl.py b/pytorch_pretrained_bert/tokenization_transfo_xl.py
index 585a815923..3f74726f6f 100644
--- a/pytorch_pretrained_bert/tokenization_transfo_xl.py
+++ b/pytorch_pretrained_bert/tokenization_transfo_xl.py
@@ -356,7 +356,10 @@ class LMOrderedIterator(object):
         data = self.data[beg_idx:end_idx]
         target = self.data[i+1:i+1+seq_len]
 
-        return data, target, seq_len
+        data_out = data.transpose(0, 1).contiguous().to(self.device)
+        target_out = target.transpose(0, 1).contiguous().to(self.device)
+
+        return data_out, target_out, seq_len
 
     def get_fixlen_iter(self, start=0):
         for i in range(start, self.data.size(0) - 1, self.bptt):
@@ -440,10 +443,10 @@ class LMShuffledIterator(object):
             if not valid_batch:
                 return
 
-            data = data.to(self.device)
-            target = target.to(self.device)
+            data_out = data.transpose(0, 1).contiguous().to(self.device)
+            target_out = target.transpose(0, 1).contiguous().to(self.device)
 
-            yield data, target, self.bptt
+            yield data_out, target_out, self.bptt
 
             n_retain = min(data.size(0), self.ext_len)
             if n_retain > 0:

From 81c7e3ec9f26e774902276769d32140bc699c631 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 11 Feb 2019 13:37:12 +0100
Subject: [PATCH 78/82] fix typo in readme

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 8265c5d246..4bce27db47 100644
--- a/README.md
+++ b/README.md
@@ -350,7 +350,7 @@ with torch.no_grad():
 # get the predicted last token
 predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
 predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
-assert predicted_token == '.</w>'
+assert predicted_token == 'who'
 ```
 
 ## Doc

From eebc8abbe2e563fc334fd1dadfd31819fd1286b6 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 11 Feb 2019 14:04:19 +0100
Subject: [PATCH 79/82] clarify and unify model saving logic in examples

---
 README.md                  |  3 ++-
 examples/run_classifier.py | 40 +++++++++++++++++++++++++++-----------
 examples/run_squad.py      | 19 +++++++++++-------
 examples/run_swag.py       | 25 +++++++++++++++---------
 4 files changed, 59 insertions(+), 28 deletions(-)

diff --git a/README.md b/README.md
index 4bce27db47..8efd64e5d2 100644
--- a/README.md
+++ b/README.md
@@ -779,7 +779,8 @@ python run_classifier.py \
   --train_batch_size 32 \
   --learning_rate 2e-5 \
   --num_train_epochs 3.0 \
-  --output_dir /tmp/mrpc_output/
+  --output_dir /tmp/mrpc_output/ \
+  --fp16
 ```
 
 #### SQuAD
diff --git a/examples/run_classifier.py b/examples/run_classifier.py
index a30d7982b0..83f0683a48 100644
--- a/examples/run_classifier.py
+++ b/examples/run_classifier.py
@@ -23,7 +23,6 @@ import logging
 import os
 import random
 import sys
-from io import open
 
 import numpy as np
 import torch
@@ -33,7 +32,7 @@ from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm, trange
 
 from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
-from pytorch_pretrained_bert.modeling import BertForSequenceClassification
+from pytorch_pretrained_bert.modeling import BertForSequenceClassification, BertConfig, WEIGHTS_NAME, CONFIG_NAME
 from pytorch_pretrained_bert.tokenization import BertTokenizer
 from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear
 
@@ -92,7 +91,7 @@ class DataProcessor(object):
     @classmethod
     def _read_tsv(cls, input_file, quotechar=None):
         """Reads a tab separated value file."""
-        with open(input_file, "rb") as f:
+        with open(input_file, "r") as f:
             reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
             lines = []
             for line in reader:
@@ -324,6 +323,10 @@ def main():
                         help="The output directory where the model predictions and checkpoints will be written.")
 
     ## Other parameters
+    parser.add_argument("--cache_dir",
+                        default="",
+                        type=str,
+                        help="Where do you want to store the pre-trained models downloaded from s3")
     parser.add_argument("--max_seq_length",
                         default=128,
                         type=int,
@@ -383,9 +386,17 @@ def main():
                         help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                              "0 (default value): dynamic loss scaling.\n"
                              "Positive power of 2: static loss scaling value.\n")
-
+    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
+    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
     args = parser.parse_args()
 
+    if args.server_ip and args.server_port:
+        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
+        import ptvsd
+        print("Waiting for debugger attach")
+        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
+        ptvsd.wait_for_attach()
+
     processors = {
         "cola": ColaProcessor,
         "mnli": MnliProcessor,
@@ -451,8 +462,9 @@ def main():
             num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
 
     # Prepare model
+    cache_dir = args.cache_dir if args.cache_dir else os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(args.local_rank))
     model = BertForSequenceClassification.from_pretrained(args.bert_model,
-              cache_dir=os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(args.local_rank)),
+              cache_dir=cache_dir,
               num_labels = num_labels)
     if args.fp16:
         model.half()
@@ -549,15 +561,21 @@ def main():
                     optimizer.zero_grad()
                     global_step += 1
 
-    # Save a trained model
-    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
-    output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
     if args.do_train:
+        # Save a trained model and the associated configuration
+        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
+        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
         torch.save(model_to_save.state_dict(), output_model_file)
+        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
+        with open(output_config_file, 'w') as f:
+            f.write(model_to_save.config.to_json_string())
 
-    # Load a trained model that you have fine-tuned
-    model_state_dict = torch.load(output_model_file)
-    model = BertForSequenceClassification.from_pretrained(args.bert_model, state_dict=model_state_dict, num_labels=num_labels)
+        # Load a trained model and config that you have fine-tuned
+        config = BertConfig(output_config_file)
+        model = BertForSequenceClassification(config, num_labels=num_labels)
+        model.load_state_dict(torch.load(output_model_file))
+    else:
+        model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels)
     model.to(device)
 
     if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
diff --git a/examples/run_squad.py b/examples/run_squad.py
index 9c2035701d..1d7c49c326 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -35,7 +35,7 @@ from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm, trange
 
 from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
-from pytorch_pretrained_bert.modeling import BertForQuestionAnswering
+from pytorch_pretrained_bert.modeling import BertForQuestionAnswering, BertConfig, WEIGHTS_NAME, CONFIG_NAME
 from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear
 from pytorch_pretrained_bert.tokenization import (BasicTokenizer,
                                                   BertTokenizer,
@@ -1001,14 +1001,19 @@ def main():
                     optimizer.zero_grad()
                     global_step += 1
 
-    # Save a trained model
-    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
-    output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
     if args.do_train:
+        # Save a trained model and the associated configuration
+        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
+        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
         torch.save(model_to_save.state_dict(), output_model_file)
-        # Load a trained model that you have fine-tuned
-        model_state_dict = torch.load(output_model_file)
-        model = BertForQuestionAnswering.from_pretrained(args.bert_model, state_dict=model_state_dict)
+        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
+        with open(output_config_file, 'w') as f:
+            f.write(model_to_save.config.to_json_string())
+
+        # Load a trained model and config that you have fine-tuned
+        config = BertConfig(output_config_file)
+        model = BertForQuestionAnswering(config)
+        model.load_state_dict(torch.load(output_model_file))
     else:
         model = BertForQuestionAnswering.from_pretrained(args.bert_model)
 
diff --git a/examples/run_swag.py b/examples/run_swag.py
index 52bcdcbd31..3ecea63046 100644
--- a/examples/run_swag.py
+++ b/examples/run_swag.py
@@ -469,18 +469,25 @@ def main():
                     optimizer.zero_grad()
                     global_step += 1
 
-    # Save a trained model
-    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
-    output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
-    torch.save(model_to_save.state_dict(), output_model_file)
 
-    # Load a trained model that you have fine-tuned
-    model_state_dict = torch.load(output_model_file)
-    model = BertForMultipleChoice.from_pretrained(args.bert_model,
-        state_dict=model_state_dict,
-        num_choices=4)
+    if args.do_train:
+        # Save a trained model and the associated configuration
+        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
+        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
+        torch.save(model_to_save.state_dict(), output_model_file)
+        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
+        with open(output_config_file, 'w') as f:
+            f.write(model_to_save.config.to_json_string())
+
+        # Load a trained model and config that you have fine-tuned
+        config = BertConfig(output_config_file)
+        model = BertForMultipleChoice(config, num_choices=4)
+        model.load_state_dict(torch.load(output_model_file))
+    else:
+        model = BertForMultipleChoice.from_pretrained(args.bert_model, num_choices=4)
     model.to(device)
 
+
     if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
         eval_examples = read_swag_examples(os.path.join(args.data_dir, 'val.csv'), is_training = True)
         eval_features = convert_examples_to_features(

From af62cc5f20da128980639f31a54e68bff399a11c Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 11 Feb 2019 14:06:32 +0100
Subject: [PATCH 80/82] fix run_squad example

---
 examples/run_squad.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/run_squad.py b/examples/run_squad.py
index 1d7c49c326..0e9aec81a1 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -881,7 +881,7 @@ def main():
         train_examples = read_squad_examples(
             input_file=args.train_file, is_training=True, version_2_with_negative=args.version_2_with_negative)
         num_train_optimization_steps = int(
-            len(train_dataset) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
+            len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
         if args.local_rank != -1:
             num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
 

From d38caba1690c82943ab312e7b783784726f282d9 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 11 Feb 2019 14:10:27 +0100
Subject: [PATCH 81/82] typo in run_squad

---
 examples/run_squad.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/run_squad.py b/examples/run_squad.py
index 0e9aec81a1..e8ed71cd55 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -917,7 +917,7 @@ def main():
 
     if args.fp16:
         try:
-            from apex.optimizer import FP16_Optimizer
+            from apex.optimizers import FP16_Optimizer
             from apex.optimizers import FusedAdam
         except ImportError:
             raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

From 1e71f11dec30ded5d173d286a37c62d9000975e3 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 11 Feb 2019 14:16:27 +0100
Subject: [PATCH 82/82] Release: 0.5.0

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 8efd64e5d2..a0bbead609 100644
--- a/README.md
+++ b/README.md
@@ -16,10 +16,10 @@ Here are some information on these models:
 This PyTorch implementation of BERT is provided with [Google's pre-trained models](https://github.com/google-research/bert), examples, notebooks and a command-line interface to load any pre-trained TensorFlow checkpoint for BERT is also provided.
 
 **OpenAI GPT** was released together with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
-This PyTorch implementation of OpenAI GPT is provided with [OpenAI's pre-trained model](https://github.com/openai/finetune-transformer-lm) and a command-line interface that was used to convert the pre-trained NumPy checkpoint in PyTorch.
+This PyTorch implementation of OpenAI GPT is an adaptation of the [PyTorch implementation by HuggingFace](https://github.com/huggingface/pytorch-openai-transformer-lm) and is provided with [OpenAI's pre-trained model](https://github.com/openai/finetune-transformer-lm) and a command-line interface that was used to convert the pre-trained NumPy checkpoint in PyTorch.
 
 **Google/CMU's Transformer-XL** was released together with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](http://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
-This PyTorch implementation of Transformer-XL is an adaptation of the original [PyTorch implementation](https://github.com/kimiyoung/transformer-xl) which has been slightly modifier to match the performances of the TensforFlow implementation and allow to re-use the pretrained weights. A command-line interface is provided to convert TensorFlow checkpoints in PyTorch models.
+This PyTorch implementation of Transformer-XL is an adaptation of the original [PyTorch implementation](https://github.com/kimiyoung/transformer-xl) which has been slightly modified to match the performances of the TensorFlow implementation and allow to re-use the pretrained weights. A command-line interface is provided to convert TensorFlow checkpoints in PyTorch models.
 
 ## Content