From 98c96fb1a71d2b4ef026ddf55f5700e79a9ed482 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 29 Jan 2019 10:31:42 +0100
Subject: [PATCH] splitting position and tokens embeddings in OpenAI GPT -
 updating tf imports - tests

---
 pytorch_pretrained_bert/__main__.py           |  4 +-
 .../convert_openai_checkpoint_to_pytorch.py   |  9 +--
 .../convert_tf_checkpoint_to_pytorch.py       |  2 +-
 pytorch_pretrained_bert/modeling.py           |  8 +++
 pytorch_pretrained_bert/modeling_openai.py    | 65 +++++++++++--------
 .../modeling_transfo_xl.py                    |  7 ++
 tests/modeling_openai_test.py                 | 15 ++---
 7 files changed, 66 insertions(+), 44 deletions(-)

diff --git a/pytorch_pretrained_bert/__main__.py b/pytorch_pretrained_bert/__main__.py
index 1869ff2ee2..3adb4ac36a 100644
--- a/pytorch_pretrained_bert/__main__.py
+++ b/pytorch_pretrained_bert/__main__.py
@@ -14,7 +14,7 @@ def main():
     else:
         if sys.argv[1] == "convert_tf_checkpoint_to_pytorch":
             try:
-                from .convert_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch
+                import tensorflow as tf
             except ModuleNotFoundError:
                 print("pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, "
                     "In that case, it requires TensorFlow to be installed. Please see "
@@ -42,7 +42,7 @@ def main():
                                                  PYTORCH_DUMP_OUTPUT)
         else:
             try:
-                from .convert_transfo_xl_checkpoint_to_pytorch import convert_transfo_xl_checkpoint_to_pytorch
+                import tensorflow as tf
             except ModuleNotFoundError:
                 print("pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, "
                     "In that case, it requires TensorFlow to be installed. Please see "
diff --git a/pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py b/pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py
index 83eed843bf..3f3e48172b 100755
--- a/pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py
+++ b/pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py
@@ -18,13 +18,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import re
-import json
 import argparse
 import torch
-import numpy as np
 
-from .modeling_openai import load_tf_weights_in_openai_gpt, OpenAIGPTConfig, OpenAIGPTModel, CONFIG_NAME, WEIGHTS_NAME
+from pytorch_pretrained_bert.modeling_openai import load_tf_weights_in_openai_gpt, OpenAIGPTConfig, OpenAIGPTModel, CONFIG_NAME, WEIGHTS_NAME
 
 def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_config_file, pytorch_dump_folder_path):
     # Construct model
@@ -67,5 +64,5 @@ if __name__ == "__main__":
                             "This specifies the model architecture.")
     args = parser.parse_args()
     convert_openai_checkpoint_to_pytorch(args.openai_checkpoint_folder_path,
-                                         args.pytorch_dump_folder_path,
-                                         args.openai_config_file)
+                                         args.openai_config_file,
+                                         args.pytorch_dump_folder_path)
diff --git a/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py b/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py
index c5e3090c8d..2dbf1f296e 100755
--- a/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py
+++ b/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py
@@ -25,7 +25,7 @@ import tensorflow as tf
 import torch
 import numpy as np
 
-from .modeling import BertConfig, BertForPreTraining, load_tf_weights_in_bert
+from pytorch_pretrained_bert.modeling import BertConfig, BertForPreTraining, load_tf_weights_in_bert
 
 def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):
     # Initialise PyTorch model
diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py
index 8d71b8e955..7f0c2bd47b 100644
--- a/pytorch_pretrained_bert/modeling.py
+++ b/pytorch_pretrained_bert/modeling.py
@@ -52,6 +52,14 @@ TF_WEIGHTS_NAME = 'model.ckpt'
 def load_tf_weights_in_bert(model, tf_checkpoint_path):
     """ Load tf checkpoints in a pytorch model
     """
+    try:
+        import re
+        import numpy as np
+        import tensorflow as tf
+    except ModuleNotFoundError:
+        print("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
+            "https://www.tensorflow.org/install/ for installation instructions.")
+        raise
     tf_path = os.path.abspath(tf_checkpoint_path)
     print("Converting TensorFlow checkpoint from {}".format(tf_path))
     # Load weights from TF model
diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py
index e71a3910f8..2e2dc56984 100644
--- a/pytorch_pretrained_bert/modeling_openai.py
+++ b/pytorch_pretrained_bert/modeling_openai.py
@@ -15,23 +15,23 @@
 # limitations under the License.
 """PyTorch OpenAI GPT model."""
 
-import os
+import collections
 import copy
 import json
-import math
 import logging
+import math
+import os
+import shutil
 import tarfile
 import tempfile
-import shutil
-import collections
 
 import torch
 import torch.nn as nn
 from torch.nn import CrossEntropyLoss
 from torch.nn.parameter import Parameter
 
-from .modeling import BertLayerNorm as LayerNorm
 from .file_utils import cached_path
+from .modeling import BertLayerNorm as LayerNorm
 
 logger = logging.getLogger(__name__)
 
@@ -42,6 +42,8 @@ WEIGHTS_NAME = "pytorch_model.bin"
 def load_tf_weights_in_openai_gpt(model, openai_checkpoint_folder_path):
     """ Load tf pre-trained weights in a pytorch model (from NumPy arrays here)
     """
+    import re
+    import numpy as np
     print("Loading weights...")
     names = json.load(open(openai_checkpoint_folder_path + '/parameters_names.json', "r", encoding='utf-8'))
     shapes = json.load(open(openai_checkpoint_folder_path + '/params_shapes.json', "r", encoding='utf-8'))
@@ -50,18 +52,24 @@ def load_tf_weights_in_openai_gpt(model, openai_checkpoint_folder_path):
     init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1]
     init_params = [param.reshape(shape) for param, shape in zip(init_params, shapes)]
 
-    init_params[0] = np.concatenate([init_params[1], init_params[0]], 0)
-    del init_params[1]
+    # This was used when we had a single embedding matrix for positions and tokens
+    # init_params[0] = np.concatenate([init_params[1], init_params[0]], 0)
+    # del init_params[1]
     init_params = [arr.squeeze() for arr in init_params]
 
     try:
-        assert model.embed.weight.shape == init_params[0].shape
+        assert model.tokens_embed.weight.shape == init_params[1].shape
+        assert model.positions_embed.weight.shape == init_params[0].shape
     except AssertionError as e:
-        e.args += (model.embed.weight.shape, init_params[0].shape)
+        e.args += (model.tokens_embed.weight.shape, init_params[1].shape)
+        e.args += (model.positions_embed.weight.shape, init_params[0].shape)
         raise
 
-    model.embed.weight.data = torch.from_numpy(init_params[0])
+    model.tokens_embed.weight.data = torch.from_numpy(init_params[1])
+    model.positions_embed.weight.data = torch.from_numpy(init_params[0])
     names.pop(0)
+    # Pop position and token embedding arrays
+    init_params.pop(0)
     init_params.pop(0)
 
     for name, array in zip(names, init_params): # names[1:n_transfer], init_params[1:n_transfer]):
@@ -584,8 +592,9 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
 
     def __init__(self, config):
         super(OpenAIGPTModel, self).__init__(config)
-        total_embeddings_size = config.vocab_size + config.n_special + config.n_positions
-        self.embed = nn.Embedding(total_embeddings_size, config.n_embd)
+        num_tokens = config.vocab_size + config.n_special
+        self.tokens_embed = nn.Embedding(num_tokens, config.n_embd)
+        self.positions_embed = nn.Embedding(config.n_positions, config.n_embd)
         self.drop = nn.Dropout(config.embd_pdrop)
         block = Block(config.n_ctx, config, scale=True)
         self.h = nn.ModuleList([copy.deepcopy(block) for _ in range(config.n_layer)])
@@ -598,30 +607,32 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
         # Update config
         self.config.n_special = num_special_tokens
         # # Build new embeddings and initialize
-        old_embed = self.embed
-        self.embed = nn.Embedding(self.config.total_num_embeddings, self.config.n_embd)
+        old_embed = self.tokens_embed
+        self.tokens_embed = nn.Embedding(self.config.total_num_embeddings, self.config.n_embd)
         # Initialize all new embeddings (in particular the special tokens)
-        self.init_weights(self.embed)
+        self.init_weights(self.tokens_embed)
         # Copy word and positional embeddings from the previous weights
-        self.embed.weight.data[: self.config.vocab_size, :] = old_embed.weight.data[: self.config.vocab_size, :]
-        self.embed.weight.data[-self.config.n_positions :, :] = old_embed.weight.data[-self.config.n_positions :, :]
+        self.tokens_embed.weight.data[: self.config.vocab_size, :] = old_embed.weight.data[: self.config.vocab_size, :]
+        self.tokens_embed.weight.data[-self.config.n_positions :, :] = old_embed.weight.data[-self.config.n_positions :, :]
 
     def forward(self, input_ids, position_ids=None, token_type_ids=None):
         if position_ids is None:
-            start = self.config.vocab_size + self.config.n_special
-            end = start + input_ids.size(-1)
-            position_ids = torch.arange(start, end, dtype=torch.long, device=input_ids.device)
+            # This was used when we had a single embedding matrix for both position and token embeddings
+            # start = self.config.vocab_size + self.config.n_special
+            # end = start + input_ids.size(-1)
+            # position_ids = torch.arange(start, end, dtype=torch.long, device=input_ids.device)
+            position_ids = torch.arange(input_ids.size(-1), dtype=torch.long, device=input_ids.device)
             position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
 
         input_shape = input_ids.size()
         input_ids = input_ids.view(-1, input_ids.size(-1))
         position_ids = position_ids.view(-1, position_ids.size(-1))
 
-        inputs_embeds = self.embed(input_ids)
-        position_embeds = self.embed(position_ids)
+        inputs_embeds = self.tokens_embed(input_ids)
+        position_embeds = self.positions_embed(position_ids)
         if token_type_ids is not None:
             token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1))
-            token_type_embeds = self.embed(token_type_ids)
+            token_type_embeds = self.tokens_embed(token_type_ids)
         else:
             token_type_embeds = 0
         # Add the position information to the input embeddings
@@ -694,13 +705,13 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
     def __init__(self, config):
         super(OpenAIGPTLMHeadModel, self).__init__(config)
         self.transformer = OpenAIGPTModel(config)
-        self.lm_head = OpenAIGPTLMHead(self.transformer.embed.weight, config)
+        self.lm_head = OpenAIGPTLMHead(self.transformer.tokens_embed.weight, config)
         self.apply(self.init_weights)
 
     def set_num_special_tokens(self, num_special_tokens):
         " Update input and output embeddings with new embedding matrice "
         self.transformer.set_num_special_tokens(num_special_tokens)
-        self.lm_head.set_embeddings_weights(self.transformer.embed.weight)
+        self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight)
 
     def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None):
         hidden_states = self.transformer(input_ids, position_ids, token_type_ids)
@@ -780,14 +791,14 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
     def __init__(self, config):
         super(OpenAIGPTDoubleHeadsModel, self).__init__(config)
         self.transformer = OpenAIGPTModel(config)
-        self.lm_head = OpenAIGPTLMHead(self.transformer.embed.weight, config)
+        self.lm_head = OpenAIGPTLMHead(self.transformer.tokens_embed.weight, config)
         self.multiple_choice_head = OpenAIGPTMultipleChoiceHead(config)
         self.apply(self.init_weights)
 
     def set_num_special_tokens(self, num_special_tokens):
         " Update input and output embeddings with new embedding matrice "
         self.transformer.set_num_special_tokens(num_special_tokens)
-        self.lm_head.set_embeddings_weights(self.transformer.embed.weight)
+        self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight)
 
     def forward(self, input_ids, mc_token_mask, lm_labels=None, mc_labels=None, token_type_ids=None, position_ids=None):
         hidden_states = self.transformer(input_ids, position_ids, token_type_ids)
diff --git a/pytorch_pretrained_bert/modeling_transfo_xl.py b/pytorch_pretrained_bert/modeling_transfo_xl.py
index 6abc68abc2..204eef738c 100644
--- a/pytorch_pretrained_bert/modeling_transfo_xl.py
+++ b/pytorch_pretrained_bert/modeling_transfo_xl.py
@@ -121,6 +121,13 @@ def build_tf_to_pytorch_map(model, config):
 def load_tf_weights_in_transfo_xl(model, config, tf_path):
     """ Load tf checkpoints in a pytorch model
     """
+    try:
+        import numpy as np
+        import tensorflow as tf
+    except ModuleNotFoundError:
+        print("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
+            "https://www.tensorflow.org/install/ for installation instructions.")
+        raise
     # Build TF to PyTorch weights loading map
     tf_to_pt_map = build_tf_to_pytorch_map(model, config)
 
diff --git a/tests/modeling_openai_test.py b/tests/modeling_openai_test.py
index 014554934c..71a1dfd3c6 100644
--- a/tests/modeling_openai_test.py
+++ b/tests/modeling_openai_test.py
@@ -39,7 +39,7 @@ class OpenAIGPTModelTest(unittest.TestCase):
                      use_labels=True,
                      vocab_size=99,
                      n_special=1,
-                     n_ctx=33,
+                     n_positions=33,
                      n_embd=32,
                      n_layer=5,
                      n_head=4,
@@ -61,7 +61,7 @@ class OpenAIGPTModelTest(unittest.TestCase):
             self.use_labels = use_labels
             self.vocab_size = vocab_size
             self.n_special = n_special
-            self.n_ctx = n_ctx
+            self.n_positions = n_positions
             self.n_embd = n_embd
             self.n_layer = n_layer
             self.n_head = n_head
@@ -80,12 +80,11 @@ class OpenAIGPTModelTest(unittest.TestCase):
 
             position_ids = None
             if self.use_position_ids:
-                position_ids = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.n_ctx)
-                position_ids = position_ids + self.n_special + self.vocab_size
+                position_ids = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.n_positions)
 
             token_type_ids = None
             if self.use_token_type_ids:
-                total_voc = self.n_ctx + self.n_special + self.vocab_size
+                total_voc = self.vocab_size + self.n_special
                 token_type_ids = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_voc)
 
             mc_labels = None
@@ -98,7 +97,7 @@ class OpenAIGPTModelTest(unittest.TestCase):
 
             config = OpenAIGPTConfig(
                 vocab_size_or_config_json_file=self.vocab_size,
-                n_ctx=self.n_ctx,
+                n_positions=self.n_positions,
                 n_special=self.n_special,
                 n_embd=self.n_embd,
                 n_layer=self.n_layer,
@@ -139,7 +138,7 @@ class OpenAIGPTModelTest(unittest.TestCase):
             return outputs
 
         def check_openai_lm_head_output(self, result):
-            total_voc = self.n_ctx + self.n_special + self.vocab_size
+            total_voc = self.n_special + self.vocab_size
             self.parent.assertListEqual(
                 list(result["lm_logits"].size()),
                 [self.batch_size, self.n_choices, self.seq_length, total_voc])
@@ -164,7 +163,7 @@ class OpenAIGPTModelTest(unittest.TestCase):
             return outputs
 
         def check_openai_double_heads_output(self, result):
-            total_voc = self.n_ctx + self.n_special + self.vocab_size
+            total_voc = self.n_special + self.vocab_size
             self.parent.assertListEqual(
                 list(result["lm_logits"].size()),
                 [self.batch_size, self.n_choices, self.seq_length, total_voc])