From 96c2b77f0f48cce12364d353c6bfd15a7ce002b4 Mon Sep 17 00:00:00 2001
From: Chris <chris@Chriss-MacBook-Air.local>
Date: Thu, 2 May 2019 13:14:25 -0400
Subject: [PATCH 01/13] added file to convert pytorch->tf

---
 .../convert_hf_checkpoint_to_tf.py            | 153 ++++++++++++++++++
 1 file changed, 153 insertions(+)
 create mode 100644 pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py

diff --git a/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py b/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py
new file mode 100644
index 0000000000..98e497f6f5
--- /dev/null
+++ b/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py
@@ -0,0 +1,153 @@
+# coding=utf-8
+# Copyright 2018 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Convert Huggingface Pytorch checkpoint to Tensorflow checkpoint."""
+
+import os
+import argparse
+import numpy as np
+from pytorch_pretrained_bert.modeling import BertConfig, BertModel
+
+
+# def __get_var_names(config):
+#
+#     models = {
+#         'BertModel': BertModel(config),
+#         'BertForMaskedLM': BertForMaskedLM(config),
+#         'BertForPreTraining': BertForPreTraining(config),
+#         'BertForMultipleChoice': BertForMultipleChoice(config, num_choices=100),
+#         'BertForNextSentencePrediction': BertForNextSentencePrediction(config),
+#         'BertForSequenceClassification': BertForSequenceClassification(config, num_labels=100),
+#         'BertForQuestionAnswering': BertForQuestionAnswering(config)
+#     }
+#
+#     for name, model in models.items():
+#         state_dict = model.state_dict()
+#         torch_vars = []
+#         for var_ in state_dict:
+#             torch_vars.append(var_ + ', ' + str(tuple(state_dict[var_].shape)))
+#         json.dump(torch_vars, fp=open('torch_var_names_{}.json'.format(name), 'w'), indent=3)
+
+
+
+def convert_hf_checkpoint_to_tf(model:BertModel, ckpt_dir:str):
+
+    """
+    :param model:BertModel Pytorch model instance to be converted
+    :param ckpt_dir: directory to save Tensorflow model
+
+    Supported HF models:
+        Y BertModel
+        N BertForMaskedLM
+        N BertForPreTraining
+        N BertForMultipleChoice
+        N BertForNextSentencePrediction
+        N BertForSequenceClassification
+        N BertForQuestionAnswering
+
+    Note:
+        TF isn't & shouldn't be a package-level requirement; this
+        feature is requested enough  to warrant a local import.
+    """
+
+    import tensorflow as tf
+
+    if not os.path.isdir(ckpt_dir):
+        os.makedirs(ckpt_dir)
+
+    session = tf.Session()
+
+    state_dict = model.state_dict()
+
+    tf_vars = []
+
+    def to_tf_var_name(name:str):
+
+        """todo: compile as regex"""
+
+        name = name.replace('layer.',                       'layer_')
+        name = name.replace('word_embeddings.weight',       'word_embeddings')
+        name = name.replace('position_embeddings.weight',   'position_embeddings')
+        name = name.replace('token_type_embeddings.weight', 'token_type_embeddings')
+        name = name.replace('.',                            '/')
+        name = name.replace('LayerNorm/weight',             'LayerNorm/gamma')
+        name = name.replace('LayerNorm/bias',               'LayerNorm/beta')
+        name = name.replace('weight',                       'kernel')
+        return 'bert/{}'.format(name)
+
+    def assign_tf_var(tensor:np.ndarray, name:str):
+        tmp_var = tf.Variable(initial_value=tensor)
+        tf_var = tf.get_variable(dtype=tmp_var.dtype, shape=tmp_var.shape, name=name)
+        op = tf.assign(ref=tf_var, value=tmp_var)
+        session.run(tf.variables_initializer([tmp_var, tf_var]))
+        session.run(fetches=[op, tf_var])
+        return tf_var
+
+    for var_name in state_dict:
+
+        tf_name = to_tf_var_name(var_name)
+        torch_tensor = state_dict[var_name].numpy()
+
+        if var_name.endswith('dense.weight'):
+            torch_tensor = torch_tensor.T
+
+        tf_tensor = assign_tf_var(tensor=torch_tensor, name=tf_name)
+
+        tf_vars.append(tf_tensor)
+
+        print("{0}{1}initialized".format(tf_name, " " * (60 - len(tf_name))))
+
+    saver = tf.train.Saver(tf_vars)
+    saver.save(session, os.path.join(ckpt_dir, 'model'))
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--pretrained_model_name_or_path",
+                        default=None,
+                        type=str,
+                        required=True,
+                        help="pretrained_model_name_or_path: either: \
+                                - a str with the name of a pre-trained model to load selected in the list of: \
+                                    . `bert-base-uncased` \
+                                    . `bert-large-uncased` \
+                                    . `bert-base-cased` \
+                                    . `bert-large-cased` \
+                                    . `bert-base-multilingual-uncased` \
+                                    . `bert-base-multilingual-cased` \
+                                    . `bert-base-chinese` \
+                                - a path or url to a pretrained model archive containing: \
+                                    . `bert_config.json` a configuration file for the model \
+                                    . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance \
+                                - a path or url to a pretrained model archive containing: \
+                                    . `bert_config.json` a configuration file for the model \
+                                    . `model.ckpt` a TensorFlow checkpoint")
+    parser.add_argument("--config_file_path",
+                        default=None,
+                        type=str,
+                        required=True,
+                        help="Path to bert config file.")
+    parser.add_argument("--cache_dir",
+                        default=None,
+                        type=str,
+                        required=True,
+                        help="path to a folder in which the TF model will be cached.")
+    args = parser.parse_args()
+
+    model = BertModel(
+        config=BertConfig(args.config_file_path)
+    ).from_pretrained(args.pretrained_model_name_or_path)
+
+    convert_hf_checkpoint_to_tf(model=model, ckpt_dir=args.cache_)

From 968c1b44cbaa36c17f6a1d453c10f125ffce64eb Mon Sep 17 00:00:00 2001
From: Chris <chris@Chriss-MacBook-Air.local>
Date: Thu, 2 May 2019 13:19:56 -0400
Subject: [PATCH 02/13] removed commented-out __get_var_names helper; shortened tf import note

---
 .../convert_hf_checkpoint_to_tf.py            | 23 +------------------
 1 file changed, 1 insertion(+), 22 deletions(-)

diff --git a/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py b/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py
index 98e497f6f5..73c1f6587c 100644
--- a/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py
+++ b/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py
@@ -21,26 +21,6 @@ import numpy as np
 from pytorch_pretrained_bert.modeling import BertConfig, BertModel
 
 
-# def __get_var_names(config):
-#
-#     models = {
-#         'BertModel': BertModel(config),
-#         'BertForMaskedLM': BertForMaskedLM(config),
-#         'BertForPreTraining': BertForPreTraining(config),
-#         'BertForMultipleChoice': BertForMultipleChoice(config, num_choices=100),
-#         'BertForNextSentencePrediction': BertForNextSentencePrediction(config),
-#         'BertForSequenceClassification': BertForSequenceClassification(config, num_labels=100),
-#         'BertForQuestionAnswering': BertForQuestionAnswering(config)
-#     }
-#
-#     for name, model in models.items():
-#         state_dict = model.state_dict()
-#         torch_vars = []
-#         for var_ in state_dict:
-#             torch_vars.append(var_ + ', ' + str(tuple(state_dict[var_].shape)))
-#         json.dump(torch_vars, fp=open('torch_var_names_{}.json'.format(name), 'w'), indent=3)
-
-
 
 def convert_hf_checkpoint_to_tf(model:BertModel, ckpt_dir:str):
 
@@ -58,8 +38,7 @@ def convert_hf_checkpoint_to_tf(model:BertModel, ckpt_dir:str):
         N BertForQuestionAnswering
 
     Note:
-        TF isn't & shouldn't be a package-level requirement; this
-        feature is requested enough  to warrant a local import.
+        To keep TF out of package-level requirements, tf is imported locally.
     """
 
     import tensorflow as tf

From 0a8b4d65beed45d167735a3ecf8ee5d4a5d1b2a3 Mon Sep 17 00:00:00 2001
From: Chris <chris@Chriss-MacBook-Air.local>
Date: Thu, 2 May 2019 13:20:59 -0400
Subject: [PATCH 03/13] whitespace cleanup in convert_hf_checkpoint_to_tf.py

---
 pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py b/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py
index 73c1f6587c..a8f2e3f8d0 100644
--- a/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py
+++ b/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py
@@ -21,7 +21,6 @@ import numpy as np
 from pytorch_pretrained_bert.modeling import BertConfig, BertModel
 
 
-
 def convert_hf_checkpoint_to_tf(model:BertModel, ckpt_dir:str):
 
     """
@@ -129,4 +128,4 @@ if __name__ == "__main__":
         config=BertConfig(args.config_file_path)
     ).from_pretrained(args.pretrained_model_name_or_path)
 
-    convert_hf_checkpoint_to_tf(model=model, ckpt_dir=args.cache_)
+    convert_hf_checkpoint_to_tf(model=model, ckpt_dir=args.cache_)
\ No newline at end of file

From 41089bc7d339b30ca0542b3ed4096d37b7a6eec6 Mon Sep 17 00:00:00 2001
From: Chris <chris@Chriss-MacBook-Air.local>
Date: Thu, 2 May 2019 13:26:22 -0400
Subject: [PATCH 04/13] fixed args.cache_ -> args.cache_dir; capitalized --cache_dir help text

---
 pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py b/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py
index a8f2e3f8d0..44c860da15 100644
--- a/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py
+++ b/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py
@@ -121,11 +121,11 @@ if __name__ == "__main__":
                         default=None,
                         type=str,
                         required=True,
-                        help="path to a folder in which the TF model will be cached.")
+                        help="Path to a folder in which the TF model will be cached.")
     args = parser.parse_args()
 
     model = BertModel(
         config=BertConfig(args.config_file_path)
     ).from_pretrained(args.pretrained_model_name_or_path)
 
-    convert_hf_checkpoint_to_tf(model=model, ckpt_dir=args.cache_)
\ No newline at end of file
+    convert_hf_checkpoint_to_tf(model=model, ckpt_dir=args.cache_dir)
\ No newline at end of file

From 2bcda8d00c672ba402d8bc8a2b1a7e9079fac0e3 Mon Sep 17 00:00:00 2001
From: Chris <chris@Chriss-MacBook-Air.local>
Date: Sat, 18 May 2019 15:55:11 -0400
Subject: [PATCH 05/13] removed extra blank lines; reworded tf import note

---
 pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py b/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py
index 44c860da15..8673c94196 100644
--- a/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py
+++ b/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py
@@ -37,7 +37,7 @@ def convert_hf_checkpoint_to_tf(model:BertModel, ckpt_dir:str):
         N BertForQuestionAnswering
 
     Note:
-        To keep TF out of package-level requirements, tf is imported locally.
+        To keep tf out of package-level requirements, it's imported locally.
     """
 
     import tensorflow as tf
@@ -52,9 +52,7 @@ def convert_hf_checkpoint_to_tf(model:BertModel, ckpt_dir:str):
     tf_vars = []
 
     def to_tf_var_name(name:str):
-
         """todo: compile as regex"""
-
         name = name.replace('layer.',                       'layer_')
         name = name.replace('word_embeddings.weight',       'word_embeddings')
         name = name.replace('position_embeddings.weight',   'position_embeddings')
@@ -74,17 +72,12 @@ def convert_hf_checkpoint_to_tf(model:BertModel, ckpt_dir:str):
         return tf_var
 
     for var_name in state_dict:
-
         tf_name = to_tf_var_name(var_name)
         torch_tensor = state_dict[var_name].numpy()
-
         if var_name.endswith('dense.weight'):
             torch_tensor = torch_tensor.T
-
         tf_tensor = assign_tf_var(tensor=torch_tensor, name=tf_name)
-
         tf_vars.append(tf_tensor)
-
         print("{0}{1}initialized".format(tf_name, " " * (60 - len(tf_name))))
 
     saver = tf.train.Saver(tf_vars)

From f1433db4f16f8f485bd1352d581872d2fc4a0cc0 Mon Sep 17 00:00:00 2001
From: Chris <chris@Chriss-MacBook-Air.local>
Date: Sat, 18 May 2019 17:09:08 -0400
Subject: [PATCH 06/13] update to hf->tf args

---
 .../convert_hf_checkpoint_to_tf.py            | 53 +++++++------------
 1 file changed, 20 insertions(+), 33 deletions(-)

diff --git a/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py b/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py
index 8673c94196..16b95f1454 100644
--- a/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py
+++ b/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py
@@ -18,16 +18,18 @@
 import os
 import argparse
 import numpy as np
+import tensorflow as tf
 from pytorch_pretrained_bert.modeling import BertConfig, BertModel
 
 
-def convert_hf_checkpoint_to_tf(model:BertModel, ckpt_dir:str):
+def convert_hf_checkpoint_to_tf(model:type(BertModel), ckpt_dir:str):
 
     """
     :param model:BertModel Pytorch model instance to be converted
     :param ckpt_dir: directory to save Tensorflow model
+    :return:
 
-    Supported HF models:
+    Currently supported HF models:
         Y BertModel
         N BertForMaskedLM
         N BertForPreTraining
@@ -35,20 +37,13 @@ def convert_hf_checkpoint_to_tf(model:BertModel, ckpt_dir:str):
         N BertForNextSentencePrediction
         N BertForSequenceClassification
         N BertForQuestionAnswering
-
-    Note:
-        To keep tf out of package-level requirements, it's imported locally.
     """
 
-    import tensorflow as tf
-
     if not os.path.isdir(ckpt_dir):
         os.makedirs(ckpt_dir)
 
     session = tf.Session()
-
     state_dict = model.state_dict()
-
     tf_vars = []
 
     def to_tf_var_name(name:str):
@@ -61,6 +56,7 @@ def convert_hf_checkpoint_to_tf(model:BertModel, ckpt_dir:str):
         name = name.replace('LayerNorm/weight',             'LayerNorm/gamma')
         name = name.replace('LayerNorm/bias',               'LayerNorm/beta')
         name = name.replace('weight',                       'kernel')
+        # name += ':0'
         return 'bert/{}'.format(name)
 
     def assign_tf_var(tensor:np.ndarray, name:str):
@@ -81,44 +77,35 @@ def convert_hf_checkpoint_to_tf(model:BertModel, ckpt_dir:str):
         print("{0}{1}initialized".format(tf_name, " " * (60 - len(tf_name))))
 
     saver = tf.train.Saver(tf_vars)
-    saver.save(session, os.path.join(ckpt_dir, 'model'))
+    saver.save(session, os.path.join(ckpt_dir, args.pytorch_model_name))
 
 
 if __name__ == "__main__":
+
     parser = argparse.ArgumentParser()
-    parser.add_argument("--pretrained_model_name_or_path",
+    parser.add_argument("--pytorch_model_dir",
                         default=None,
                         type=str,
                         required=True,
-                        help="pretrained_model_name_or_path: either: \
-                                - a str with the name of a pre-trained model to load selected in the list of: \
-                                    . `bert-base-uncased` \
-                                    . `bert-large-uncased` \
-                                    . `bert-base-cased` \
-                                    . `bert-large-cased` \
-                                    . `bert-base-multilingual-uncased` \
-                                    . `bert-base-multilingual-cased` \
-                                    . `bert-base-chinese` \
-                                - a path or url to a pretrained model archive containing: \
-                                    . `bert_config.json` a configuration file for the model \
-                                    . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance \
-                                - a path or url to a pretrained model archive containing: \
-                                    . `bert_config.json` a configuration file for the model \
-                                    . `model.ckpt` a TensorFlow checkpoint")
+                        help="Directory containing pytorch model")
+    parser.add_argument("--pytorch_model_name",
+                        default=None,
+                        type=str,
+                        required=True,
+                        help="model name (e.g. bert-base-uncased)")
     parser.add_argument("--config_file_path",
                         default=None,
                         type=str,
                         required=True,
-                        help="Path to bert config file.")
-    parser.add_argument("--cache_dir",
-                        default=None,
+                        help="Path to bert config file")
+    parser.add_argument("--tf_checkpoint_dir",
+                        default="",
                         type=str,
                         required=True,
-                        help="Path to a folder in which the TF model will be cached.")
+                        help="Directory in which to save tensorflow model")
     args = parser.parse_args()
 
     model = BertModel(
         config=BertConfig(args.config_file_path)
-    ).from_pretrained(args.pretrained_model_name_or_path)
-
-    convert_hf_checkpoint_to_tf(model=model, ckpt_dir=args.cache_dir)
\ No newline at end of file
+    ).from_pretrained(args.pytorch_model_name, cache_dir=args.pytorch_model_dir)
+    convert_hf_checkpoint_to_tf(model=model, ckpt_dir=args.tf_checkpoint_dir)
\ No newline at end of file

From 69749f3fc330f954b31a47f51a177c80064aaa01 Mon Sep 17 00:00:00 2001
From: Chris <chris@Chriss-MacBook-Air.local>
Date: Sat, 18 May 2019 17:16:01 -0400
Subject: [PATCH 07/13] added missing newline at end of file

---
 pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py b/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py
index 16b95f1454..41327de891 100644
--- a/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py
+++ b/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py
@@ -108,4 +108,4 @@ if __name__ == "__main__":
     model = BertModel(
         config=BertConfig(args.config_file_path)
     ).from_pretrained(args.pytorch_model_name, cache_dir=args.pytorch_model_dir)
-    convert_hf_checkpoint_to_tf(model=model, ckpt_dir=args.tf_checkpoint_dir)
\ No newline at end of file
+    convert_hf_checkpoint_to_tf(model=model, ckpt_dir=args.tf_checkpoint_dir)

From a309459b92348f2a61458a464cf5eec3dd0994bc Mon Sep 17 00:00:00 2001
From: Chris <chris@Chriss-MacBook-Air.local>
Date: Wed, 22 May 2019 20:17:27 -0400
Subject: [PATCH 08/13] fn change; pytorch_model_dir required=False

---
 ...checkpoint_to_tf.py => convert_pytorch_checkpoint_to_tf.py} | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)
 rename pytorch_pretrained_bert/{convert_hf_checkpoint_to_tf.py => convert_pytorch_checkpoint_to_tf.py} (98%)

diff --git a/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py b/pytorch_pretrained_bert/convert_pytorch_checkpoint_to_tf.py
similarity index 98%
rename from pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py
rename to pytorch_pretrained_bert/convert_pytorch_checkpoint_to_tf.py
index 41327de891..870b5ee5db 100644
--- a/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py
+++ b/pytorch_pretrained_bert/convert_pytorch_checkpoint_to_tf.py
@@ -56,7 +56,6 @@ def convert_hf_checkpoint_to_tf(model:type(BertModel), ckpt_dir:str):
         name = name.replace('LayerNorm/weight',             'LayerNorm/gamma')
         name = name.replace('LayerNorm/bias',               'LayerNorm/beta')
         name = name.replace('weight',                       'kernel')
-        # name += ':0'
         return 'bert/{}'.format(name)
 
     def assign_tf_var(tensor:np.ndarray, name:str):
@@ -86,7 +85,7 @@ if __name__ == "__main__":
     parser.add_argument("--pytorch_model_dir",
                         default=None,
                         type=str,
-                        required=True,
+                        required=False,
                         help="Directory containing pytorch model")
     parser.add_argument("--pytorch_model_name",
                         default=None,

From d0adab2c39dc486c548c0b61ad8471e27e60bd36 Mon Sep 17 00:00:00 2001
From: Chris <chris@Chriss-MacBook-Air.local>
Date: Wed, 22 May 2019 20:24:04 -0400
Subject: [PATCH 09/13] renamed convert_hf_checkpoint_to_tf -> convert_pytorch_checkpoint_to_tf

---
 pytorch_pretrained_bert/convert_pytorch_checkpoint_to_tf.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pytorch_pretrained_bert/convert_pytorch_checkpoint_to_tf.py b/pytorch_pretrained_bert/convert_pytorch_checkpoint_to_tf.py
index 870b5ee5db..a17d058664 100644
--- a/pytorch_pretrained_bert/convert_pytorch_checkpoint_to_tf.py
+++ b/pytorch_pretrained_bert/convert_pytorch_checkpoint_to_tf.py
@@ -22,7 +22,7 @@ import tensorflow as tf
 from pytorch_pretrained_bert.modeling import BertConfig, BertModel
 
 
-def convert_hf_checkpoint_to_tf(model:type(BertModel), ckpt_dir:str):
+def convert_pytorch_checkpoint_to_tf(model:type(BertModel), ckpt_dir:str):
 
     """
     :param model:BertModel Pytorch model instance to be converted
@@ -107,4 +107,4 @@ if __name__ == "__main__":
     model = BertModel(
         config=BertConfig(args.config_file_path)
     ).from_pretrained(args.pytorch_model_name, cache_dir=args.pytorch_model_dir)
-    convert_hf_checkpoint_to_tf(model=model, ckpt_dir=args.tf_checkpoint_dir)
+    convert_pytorch_checkpoint_to_tf(model=model, ckpt_dir=args.tf_checkpoint_dir)

From 8de1faea6fe5df0477afedc2112ae19d3c6dc4ee Mon Sep 17 00:00:00 2001
From: Chris <chris@Chriss-MacBook-Air.local>
Date: Wed, 22 May 2019 20:38:16 -0400
Subject: [PATCH 10/13] fixed model type annotation (type(BertModel) -> BertModel)

---
 pytorch_pretrained_bert/convert_pytorch_checkpoint_to_tf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_pretrained_bert/convert_pytorch_checkpoint_to_tf.py b/pytorch_pretrained_bert/convert_pytorch_checkpoint_to_tf.py
index a17d058664..b845fa8530 100644
--- a/pytorch_pretrained_bert/convert_pytorch_checkpoint_to_tf.py
+++ b/pytorch_pretrained_bert/convert_pytorch_checkpoint_to_tf.py
@@ -22,7 +22,7 @@ import tensorflow as tf
 from pytorch_pretrained_bert.modeling import BertConfig, BertModel
 
 
-def convert_pytorch_checkpoint_to_tf(model:type(BertModel), ckpt_dir:str):
+def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str):
 
     """
     :param model:BertModel Pytorch model instance to be converted

From 314bc6bb4e4bd2ede16cd7c04b3b2a419611d190 Mon Sep 17 00:00:00 2001
From: Chris <chris@Chriss-MacBook-Air.local>
Date: Mon, 27 May 2019 09:47:59 -0400
Subject: [PATCH 11/13] added transposes to attention.self.[query,key,value]

---
 .../convert_pytorch_checkpoint_to_tf.py       | 31 +++++++++++++------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/pytorch_pretrained_bert/convert_pytorch_checkpoint_to_tf.py b/pytorch_pretrained_bert/convert_pytorch_checkpoint_to_tf.py
index b845fa8530..a9bfdaa45c 100644
--- a/pytorch_pretrained_bert/convert_pytorch_checkpoint_to_tf.py
+++ b/pytorch_pretrained_bert/convert_pytorch_checkpoint_to_tf.py
@@ -39,6 +39,24 @@ def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str):
         N BertForQuestionAnswering
     """
 
+    tensors_to_transopse = (
+        "dense.weight",
+        "attention.self.query",
+        "attention.self.key",
+        "attention.self.value"
+    )
+
+    var_map = (
+        ('layer.', 'layer_'),
+        ('word_embeddings.weight', 'word_embeddings'),
+        ('position_embeddings.weight', 'position_embeddings'),
+        ('token_type_embeddings.weight', 'token_type_embeddings'),
+        ('.', '/'),
+        ('LayerNorm/weight', 'LayerNorm/gamma'),
+        ('LayerNorm/bias', 'LayerNorm/beta'),
+        ('weight', 'kernel')
+    )
+
     if not os.path.isdir(ckpt_dir):
         os.makedirs(ckpt_dir)
 
@@ -47,15 +65,8 @@ def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str):
     tf_vars = []
 
     def to_tf_var_name(name:str):
-        """todo: compile as regex"""
-        name = name.replace('layer.',                       'layer_')
-        name = name.replace('word_embeddings.weight',       'word_embeddings')
-        name = name.replace('position_embeddings.weight',   'position_embeddings')
-        name = name.replace('token_type_embeddings.weight', 'token_type_embeddings')
-        name = name.replace('.',                            '/')
-        name = name.replace('LayerNorm/weight',             'LayerNorm/gamma')
-        name = name.replace('LayerNorm/bias',               'LayerNorm/beta')
-        name = name.replace('weight',                       'kernel')
+        for patt, repl in iter(var_map):
+            name = name.replace(patt, repl)
         return 'bert/{}'.format(name)
 
     def assign_tf_var(tensor:np.ndarray, name:str):
@@ -69,7 +80,7 @@ def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str):
     for var_name in state_dict:
         tf_name = to_tf_var_name(var_name)
         torch_tensor = state_dict[var_name].numpy()
-        if var_name.endswith('dense.weight'):
+        if any([x in var_name for x in tensors_to_transopse]):
             torch_tensor = torch_tensor.T
         tf_tensor = assign_tf_var(tensor=torch_tensor, name=tf_name)
         tf_vars.append(tf_tensor)

From a8e071c6900b78ff14fbe14df75fd79ab86338fa Mon Sep 17 00:00:00 2001
From: chrislarson1 <cl966@cornell.edu>
Date: Wed, 19 Jun 2019 23:08:08 -0400
Subject: [PATCH 12/13] added notebook to check correctness of the
 pytorch->tensorflow conversion

---
 notebooks/Comparing-PT-and-TF-models.ipynb | 1630 ++++++++++++++++++++
 1 file changed, 1630 insertions(+)
 create mode 100644 notebooks/Comparing-PT-and-TF-models.ipynb

diff --git a/notebooks/Comparing-PT-and-TF-models.ipynb b/notebooks/Comparing-PT-and-TF-models.ipynb
new file mode 100644
index 0000000000..321c2ebe30
--- /dev/null
+++ b/notebooks/Comparing-PT-and-TF-models.ipynb
@@ -0,0 +1,1630 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Pytorch to Tensorflow Conversion Test Notebook\n",
+    "\n",
+    "To run this notebook follow these steps, modifying the **Config** section as necessary:\n",
+    "\n",
+    "1. Point `pt_model_dir` to your local directory containing the pytorch Bert model to be converted.\n",
+    "2. Point `tf_bert_dir` to your clone of Google's Bert implementation which can be found here: https://github.com/google-research/bert.\n",
+    "\n",
+    "Note: \n",
+    "1. This feature currently only supports the base BERT models (uncased/cased).\n",
+    "2. Tensorflow model will be dumped in `tf_model_dir`."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Config"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import sys\n",
+    "\n",
+    "model_cls  = 'BertModel'\n",
+    "model_typ  = 'bert-base-uncased'\n",
+    "token_cls  = 'BertTokenizer'\n",
+    "max_seq    = 12\n",
+    "CLS        = \"[CLS]\"\n",
+    "SEP        = \"[SEP]\"\n",
+    "MASK       = \"[MASK]\"\n",
+    "CLS_IDX    = 0\n",
+    "layer_idxs = tuple(range(12))\n",
+    "input_text = \"jim henson was a puppeteer\"\n",
+    "\n",
+    "pt_model_dir = \"/home/ubuntu/.pytorch-pretrained-BERT-cache/{}\".format(model_typ)\n",
+    "tf_bert_dir  = \"/home/ubuntu/bert\"\n",
+    "\n",
+    "pt_vocab_file  = os.path.join(pt_model_dir, \"vocab.txt\")\n",
+    "pt_init_ckpt   = os.path.join(pt_model_dir, model_typ.replace(\"-\", \"_\") + \".bin\")\n",
+    "tf_model_dir   = os.path.join(pt_model_dir, 'tf')\n",
+    "tf_vocab_file  = os.path.join(tf_model_dir, \"vocab.txt\")\n",
+    "tf_init_ckpt   = os.path.join(tf_model_dir, model_typ.replace(\"-\", \"_\") + \".ckpt\")\n",
+    "tf_config_file = os.path.join(tf_model_dir, \"bert_config.json\")\n",
+    "\n",
+    "if not os.path.isdir(tf_model_dir): \n",
+    "    os.makedirs(tf_model_dir, exist_ok=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Tokenization"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def tokenize(text, tokenizer):\n",
+    "    text = text.strip().lower()\n",
+    "    tok_ids = tokenizer.tokenize(text)\n",
+    "    if len(tok_ids) > max_seq - 2:\n",
+    "        tok_ids = tok_ids[:max_seq - 2]\n",
+    "    tok_ids.insert(CLS_IDX, CLS)\n",
+    "    tok_ids.append(SEP)\n",
+    "    input_ids = tokenizer.convert_tokens_to_ids(tok_ids)\n",
+    "    mask_ids = [1] * len(input_ids)\n",
+    "    seg_ids = [0] * len(input_ids)\n",
+    "    padding = [0] * (max_seq - len(input_ids))\n",
+    "    input_ids += padding\n",
+    "    mask_ids += padding\n",
+    "    seg_ids += padding\n",
+    "    return input_ids, mask_ids, seg_ids"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Pytorch execution"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|██████████| 231508/231508 [00:00<00:00, 41092464.26B/s]\n",
+      "100%|██████████| 407873900/407873900 [00:07<00:00, 58092479.52B/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Pytorch embedding shape: (1, 768)\n"
+     ]
+    }
+   ],
+   "source": [
+    "import numpy as np\n",
+    "import torch\n",
+    "from pytorch_pretrained_bert import (BertConfig,\n",
+    "                                     BertModel, \n",
+    "                                     BertTokenizer, \n",
+    "                                     BertForSequenceClassification)\n",
+    "\n",
+    "# Save Vocab\n",
+    "pt_tokenizer = BertTokenizer.from_pretrained(\n",
+    "    pretrained_model_name_or_path=model_typ, \n",
+    "    cache_dir=pt_model_dir)\n",
+    "pt_tokenizer.save_vocabulary(pt_model_dir)\n",
+    "pt_tokenizer.save_vocabulary(tf_model_dir)\n",
+    "\n",
+    "# Save Model\n",
+    "pt_model = BertModel.from_pretrained(\n",
+    "    pretrained_model_name_or_path=model_typ, \n",
+    "    cache_dir=pt_model_dir).to('cpu')\n",
+    "pt_model.eval()\n",
+    "pt_model.config.hidden_dropout_prob = 0.0\n",
+    "pt_model.config.attention_probs_dropout_prob = 0.0\n",
+    "pt_model.config.to_json_file(tf_config_file)\n",
+    "torch.save(pt_model.state_dict(), pt_init_ckpt)\n",
+    "\n",
+    "# Inputs\n",
+    "input_ids_pt, mask_ids_pt, seg_ids_pt = tokenize(input_text, pt_tokenizer)\n",
+    "\n",
+    "# PT Embedding\n",
+    "tok_tensor = torch.tensor(input_ids_pt).to('cpu').unsqueeze(0)\n",
+    "seg_tensor = torch.tensor(seg_ids_pt).to('cpu').unsqueeze(0)\n",
+    "msk_tensor = torch.tensor(mask_ids_pt).to('cpu').unsqueeze(0)\n",
+    "attn_blks, nsp_logits = pt_model(tok_tensor, seg_tensor, msk_tensor)\n",
+    "pt_embedding = nsp_logits.detach().numpy() \n",
+    "print(\"Pytorch embedding shape: {}\".format(pt_embedding.shape))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## PyTorch &rarr; TensorFlow conversion"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "WARNING:tensorflow:From /home/ubuntu/anaconda3/envs/nlp/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py:263: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.\n",
+      "Instructions for updating:\n",
+      "Colocations handled automatically by placer.\n",
+      "bert/embeddings/word_embeddings                             initialized\n",
+      "bert/embeddings/position_embeddings                         initialized\n",
+      "bert/embeddings/token_type_embeddings                       initialized\n",
+      "bert/embeddings/LayerNorm/gamma                             initialized\n",
+      "bert/embeddings/LayerNorm/beta                              initialized\n",
+      "bert/encoder/layer_0/attention/self/query/kernel            initialized\n",
+      "bert/encoder/layer_0/attention/self/query/bias              initialized\n",
+      "bert/encoder/layer_0/attention/self/key/kernel              initialized\n",
+      "bert/encoder/layer_0/attention/self/key/bias                initialized\n",
+      "bert/encoder/layer_0/attention/self/value/kernel            initialized\n",
+      "bert/encoder/layer_0/attention/self/value/bias              initialized\n",
+      "bert/encoder/layer_0/attention/output/dense/kernel          initialized\n",
+      "bert/encoder/layer_0/attention/output/dense/bias            initialized\n",
+      "bert/encoder/layer_0/attention/output/LayerNorm/gamma       initialized\n",
+      "bert/encoder/layer_0/attention/output/LayerNorm/beta        initialized\n",
+      "bert/encoder/layer_0/intermediate/dense/kernel              initialized\n",
+      "bert/encoder/layer_0/intermediate/dense/bias                initialized\n",
+      "bert/encoder/layer_0/output/dense/kernel                    initialized\n",
+      "bert/encoder/layer_0/output/dense/bias                      initialized\n",
+      "bert/encoder/layer_0/output/LayerNorm/gamma                 initialized\n",
+      "bert/encoder/layer_0/output/LayerNorm/beta                  initialized\n",
+      "bert/encoder/layer_1/attention/self/query/kernel            initialized\n",
+      "bert/encoder/layer_1/attention/self/query/bias              initialized\n",
+      "bert/encoder/layer_1/attention/self/key/kernel              initialized\n",
+      "bert/encoder/layer_1/attention/self/key/bias                initialized\n",
+      "bert/encoder/layer_1/attention/self/value/kernel            initialized\n",
+      "bert/encoder/layer_1/attention/self/value/bias              initialized\n",
+      "bert/encoder/layer_1/attention/output/dense/kernel          initialized\n",
+      "bert/encoder/layer_1/attention/output/dense/bias            initialized\n",
+      "bert/encoder/layer_1/attention/output/LayerNorm/gamma       initialized\n",
+      "bert/encoder/layer_1/attention/output/LayerNorm/beta        initialized\n",
+      "bert/encoder/layer_1/intermediate/dense/kernel              initialized\n",
+      "bert/encoder/layer_1/intermediate/dense/bias                initialized\n",
+      "bert/encoder/layer_1/output/dense/kernel                    initialized\n",
+      "bert/encoder/layer_1/output/dense/bias                      initialized\n",
+      "bert/encoder/layer_1/output/LayerNorm/gamma                 initialized\n",
+      "bert/encoder/layer_1/output/LayerNorm/beta                  initialized\n",
+      "bert/encoder/layer_2/attention/self/query/kernel            initialized\n",
+      "bert/encoder/layer_2/attention/self/query/bias              initialized\n",
+      "bert/encoder/layer_2/attention/self/key/kernel              initialized\n",
+      "bert/encoder/layer_2/attention/self/key/bias                initialized\n",
+      "bert/encoder/layer_2/attention/self/value/kernel            initialized\n",
+      "bert/encoder/layer_2/attention/self/value/bias              initialized\n",
+      "bert/encoder/layer_2/attention/output/dense/kernel          initialized\n",
+      "bert/encoder/layer_2/attention/output/dense/bias            initialized\n",
+      "bert/encoder/layer_2/attention/output/LayerNorm/gamma       initialized\n",
+      "bert/encoder/layer_2/attention/output/LayerNorm/beta        initialized\n",
+      "bert/encoder/layer_2/intermediate/dense/kernel              initialized\n",
+      "bert/encoder/layer_2/intermediate/dense/bias                initialized\n",
+      "bert/encoder/layer_2/output/dense/kernel                    initialized\n",
+      "bert/encoder/layer_2/output/dense/bias                      initialized\n",
+      "bert/encoder/layer_2/output/LayerNorm/gamma                 initialized\n",
+      "bert/encoder/layer_2/output/LayerNorm/beta                  initialized\n",
+      "bert/encoder/layer_3/attention/self/query/kernel            initialized\n",
+      "bert/encoder/layer_3/attention/self/query/bias              initialized\n",
+      "bert/encoder/layer_3/attention/self/key/kernel              initialized\n",
+      "bert/encoder/layer_3/attention/self/key/bias                initialized\n",
+      "bert/encoder/layer_3/attention/self/value/kernel            initialized\n",
+      "bert/encoder/layer_3/attention/self/value/bias              initialized\n",
+      "bert/encoder/layer_3/attention/output/dense/kernel          initialized\n",
+      "bert/encoder/layer_3/attention/output/dense/bias            initialized\n",
+      "bert/encoder/layer_3/attention/output/LayerNorm/gamma       initialized\n",
+      "bert/encoder/layer_3/attention/output/LayerNorm/beta        initialized\n",
+      "bert/encoder/layer_3/intermediate/dense/kernel              initialized\n",
+      "bert/encoder/layer_3/intermediate/dense/bias                initialized\n",
+      "bert/encoder/layer_3/output/dense/kernel                    initialized\n",
+      "bert/encoder/layer_3/output/dense/bias                      initialized\n",
+      "bert/encoder/layer_3/output/LayerNorm/gamma                 initialized\n",
+      "bert/encoder/layer_3/output/LayerNorm/beta                  initialized\n",
+      "bert/encoder/layer_4/attention/self/query/kernel            initialized\n",
+      "bert/encoder/layer_4/attention/self/query/bias              initialized\n",
+      "bert/encoder/layer_4/attention/self/key/kernel              initialized\n",
+      "bert/encoder/layer_4/attention/self/key/bias                initialized\n",
+      "bert/encoder/layer_4/attention/self/value/kernel            initialized\n",
+      "bert/encoder/layer_4/attention/self/value/bias              initialized\n",
+      "bert/encoder/layer_4/attention/output/dense/kernel          initialized\n",
+      "bert/encoder/layer_4/attention/output/dense/bias            initialized\n",
+      "bert/encoder/layer_4/attention/output/LayerNorm/gamma       initialized\n",
+      "bert/encoder/layer_4/attention/output/LayerNorm/beta        initialized\n",
+      "bert/encoder/layer_4/intermediate/dense/kernel              initialized\n",
+      "bert/encoder/layer_4/intermediate/dense/bias                initialized\n",
+      "bert/encoder/layer_4/output/dense/kernel                    initialized\n",
+      "bert/encoder/layer_4/output/dense/bias                      initialized\n",
+      "bert/encoder/layer_4/output/LayerNorm/gamma                 initialized\n",
+      "bert/encoder/layer_4/output/LayerNorm/beta                  initialized\n",
+      "bert/encoder/layer_5/attention/self/query/kernel            initialized\n",
+      "bert/encoder/layer_5/attention/self/query/bias              initialized\n",
+      "bert/encoder/layer_5/attention/self/key/kernel              initialized\n",
+      "bert/encoder/layer_5/attention/self/key/bias                initialized\n",
+      "bert/encoder/layer_5/attention/self/value/kernel            initialized\n",
+      "bert/encoder/layer_5/attention/self/value/bias              initialized\n",
+      "bert/encoder/layer_5/attention/output/dense/kernel          initialized\n",
+      "bert/encoder/layer_5/attention/output/dense/bias            initialized\n",
+      "bert/encoder/layer_5/attention/output/LayerNorm/gamma       initialized\n",
+      "bert/encoder/layer_5/attention/output/LayerNorm/beta        initialized\n",
+      "bert/encoder/layer_5/intermediate/dense/kernel              initialized\n",
+      "bert/encoder/layer_5/intermediate/dense/bias                initialized\n",
+      "bert/encoder/layer_5/output/dense/kernel                    initialized\n",
+      "bert/encoder/layer_5/output/dense/bias                      initialized\n",
+      "bert/encoder/layer_5/output/LayerNorm/gamma                 initialized\n",
+      "bert/encoder/layer_5/output/LayerNorm/beta                  initialized\n",
+      "bert/encoder/layer_6/attention/self/query/kernel            initialized\n",
+      "bert/encoder/layer_6/attention/self/query/bias              initialized\n",
+      "bert/encoder/layer_6/attention/self/key/kernel              initialized\n",
+      "bert/encoder/layer_6/attention/self/key/bias                initialized\n",
+      "bert/encoder/layer_6/attention/self/value/kernel            initialized\n",
+      "bert/encoder/layer_6/attention/self/value/bias              initialized\n",
+      "bert/encoder/layer_6/attention/output/dense/kernel          initialized\n",
+      "bert/encoder/layer_6/attention/output/dense/bias            initialized\n",
+      "bert/encoder/layer_6/attention/output/LayerNorm/gamma       initialized\n",
+      "bert/encoder/layer_6/attention/output/LayerNorm/beta        initialized\n",
+      "bert/encoder/layer_6/intermediate/dense/kernel              initialized\n",
+      "bert/encoder/layer_6/intermediate/dense/bias                initialized\n",
+      "bert/encoder/layer_6/output/dense/kernel                    initialized\n",
+      "bert/encoder/layer_6/output/dense/bias                      initialized\n",
+      "bert/encoder/layer_6/output/LayerNorm/gamma                 initialized\n",
+      "bert/encoder/layer_6/output/LayerNorm/beta                  initialized\n",
+      "bert/encoder/layer_7/attention/self/query/kernel            initialized\n",
+      "bert/encoder/layer_7/attention/self/query/bias              initialized\n",
+      "bert/encoder/layer_7/attention/self/key/kernel              initialized\n",
+      "bert/encoder/layer_7/attention/self/key/bias                initialized\n",
+      "bert/encoder/layer_7/attention/self/value/kernel            initialized\n",
+      "bert/encoder/layer_7/attention/self/value/bias              initialized\n",
+      "bert/encoder/layer_7/attention/output/dense/kernel          initialized\n",
+      "bert/encoder/layer_7/attention/output/dense/bias            initialized\n",
+      "bert/encoder/layer_7/attention/output/LayerNorm/gamma       initialized\n",
+      "bert/encoder/layer_7/attention/output/LayerNorm/beta        initialized\n",
+      "bert/encoder/layer_7/intermediate/dense/kernel              initialized\n",
+      "bert/encoder/layer_7/intermediate/dense/bias                initialized\n",
+      "bert/encoder/layer_7/output/dense/kernel                    initialized\n",
+      "bert/encoder/layer_7/output/dense/bias                      initialized\n",
+      "bert/encoder/layer_7/output/LayerNorm/gamma                 initialized\n",
+      "bert/encoder/layer_7/output/LayerNorm/beta                  initialized\n",
+      "bert/encoder/layer_8/attention/self/query/kernel            initialized\n",
+      "bert/encoder/layer_8/attention/self/query/bias              initialized\n",
+      "bert/encoder/layer_8/attention/self/key/kernel              initialized\n",
+      "bert/encoder/layer_8/attention/self/key/bias                initialized\n",
+      "bert/encoder/layer_8/attention/self/value/kernel            initialized\n",
+      "bert/encoder/layer_8/attention/self/value/bias              initialized\n",
+      "bert/encoder/layer_8/attention/output/dense/kernel          initialized\n",
+      "bert/encoder/layer_8/attention/output/dense/bias            initialized\n",
+      "bert/encoder/layer_8/attention/output/LayerNorm/gamma       initialized\n",
+      "bert/encoder/layer_8/attention/output/LayerNorm/beta        initialized\n",
+      "bert/encoder/layer_8/intermediate/dense/kernel              initialized\n",
+      "bert/encoder/layer_8/intermediate/dense/bias                initialized\n",
+      "bert/encoder/layer_8/output/dense/kernel                    initialized\n",
+      "bert/encoder/layer_8/output/dense/bias                      initialized\n",
+      "bert/encoder/layer_8/output/LayerNorm/gamma                 initialized\n",
+      "bert/encoder/layer_8/output/LayerNorm/beta                  initialized\n",
+      "bert/encoder/layer_9/attention/self/query/kernel            initialized\n",
+      "bert/encoder/layer_9/attention/self/query/bias              initialized\n",
+      "bert/encoder/layer_9/attention/self/key/kernel              initialized\n",
+      "bert/encoder/layer_9/attention/self/key/bias                initialized\n",
+      "bert/encoder/layer_9/attention/self/value/kernel            initialized\n",
+      "bert/encoder/layer_9/attention/self/value/bias              initialized\n",
+      "bert/encoder/layer_9/attention/output/dense/kernel          initialized\n",
+      "bert/encoder/layer_9/attention/output/dense/bias            initialized\n",
+      "bert/encoder/layer_9/attention/output/LayerNorm/gamma       initialized\n",
+      "bert/encoder/layer_9/attention/output/LayerNorm/beta        initialized\n",
+      "bert/encoder/layer_9/intermediate/dense/kernel              initialized\n",
+      "bert/encoder/layer_9/intermediate/dense/bias                initialized\n",
+      "bert/encoder/layer_9/output/dense/kernel                    initialized\n",
+      "bert/encoder/layer_9/output/dense/bias                      initialized\n",
+      "bert/encoder/layer_9/output/LayerNorm/gamma                 initialized\n",
+      "bert/encoder/layer_9/output/LayerNorm/beta                  initialized\n",
+      "bert/encoder/layer_10/attention/self/query/kernel           initialized\n",
+      "bert/encoder/layer_10/attention/self/query/bias             initialized\n",
+      "bert/encoder/layer_10/attention/self/key/kernel             initialized\n",
+      "bert/encoder/layer_10/attention/self/key/bias               initialized\n",
+      "bert/encoder/layer_10/attention/self/value/kernel           initialized\n",
+      "bert/encoder/layer_10/attention/self/value/bias             initialized\n",
+      "bert/encoder/layer_10/attention/output/dense/kernel         initialized\n",
+      "bert/encoder/layer_10/attention/output/dense/bias           initialized\n",
+      "bert/encoder/layer_10/attention/output/LayerNorm/gamma      initialized\n",
+      "bert/encoder/layer_10/attention/output/LayerNorm/beta       initialized\n",
+      "bert/encoder/layer_10/intermediate/dense/kernel             initialized\n",
+      "bert/encoder/layer_10/intermediate/dense/bias               initialized\n",
+      "bert/encoder/layer_10/output/dense/kernel                   initialized\n",
+      "bert/encoder/layer_10/output/dense/bias                     initialized\n",
+      "bert/encoder/layer_10/output/LayerNorm/gamma                initialized\n",
+      "bert/encoder/layer_10/output/LayerNorm/beta                 initialized\n",
+      "bert/encoder/layer_11/attention/self/query/kernel           initialized\n",
+      "bert/encoder/layer_11/attention/self/query/bias             initialized\n",
+      "bert/encoder/layer_11/attention/self/key/kernel             initialized\n",
+      "bert/encoder/layer_11/attention/self/key/bias               initialized\n",
+      "bert/encoder/layer_11/attention/self/value/kernel           initialized\n",
+      "bert/encoder/layer_11/attention/self/value/bias             initialized\n",
+      "bert/encoder/layer_11/attention/output/dense/kernel         initialized\n",
+      "bert/encoder/layer_11/attention/output/dense/bias           initialized\n",
+      "bert/encoder/layer_11/attention/output/LayerNorm/gamma      initialized\n",
+      "bert/encoder/layer_11/attention/output/LayerNorm/beta       initialized\n",
+      "bert/encoder/layer_11/intermediate/dense/kernel             initialized\n",
+      "bert/encoder/layer_11/intermediate/dense/bias               initialized\n",
+      "bert/encoder/layer_11/output/dense/kernel                   initialized\n",
+      "bert/encoder/layer_11/output/dense/bias                     initialized\n",
+      "bert/encoder/layer_11/output/LayerNorm/gamma                initialized\n",
+      "bert/encoder/layer_11/output/LayerNorm/beta                 initialized\n",
+      "bert/pooler/dense/kernel                                    initialized\n",
+      "bert/pooler/dense/bias                                      initialized\n"
+     ]
+    }
+   ],
+   "source": [
+    "from pytorch_pretrained_bert.convert_pytorch_checkpoint_to_tf import main\n",
+    "\n",
+    "main([\n",
+    "    '--model_name', model_typ, \n",
+    "    '--pytorch_model_path', pt_init_ckpt,\n",
+    "    '--tf_cache_dir', tf_model_dir,\n",
+    "    '--cache_dir', pt_model_dir\n",
+    "])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## TensorFlow execution"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "WARNING: The TensorFlow contrib module will not be included in TensorFlow 2.0.\n",
+      "For more information, please see:\n",
+      "  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md\n",
+      "  * https://github.com/tensorflow/addons\n",
+      "If you depend on functionality not listed there, please file an issue.\n",
+      "\n",
+      "WARNING:tensorflow:From /home/ubuntu/bert/modeling.py:671: dense (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.\n",
+      "Instructions for updating:\n",
+      "Use keras.layers.dense instead.\n",
+      "WARNING:tensorflow:From /home/ubuntu/anaconda3/envs/nlp/lib/python3.6/site-packages/tensorflow/python/training/saver.py:1266: checkpoint_exists (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version.\n",
+      "Instructions for updating:\n",
+      "Use standard file APIs to check for files with this prefix.\n",
+      "INFO:tensorflow:Restoring parameters from /home/ubuntu/.pytorch-pretrained-BERT-cache/bert-base-uncased/tf/bert_base_uncased.ckpt\n",
+      "Tensorflow embedding shape: (1, 768)\n"
+     ]
+    }
+   ],
+   "source": [
+    "import tensorflow as tf\n",
+    "sys.path.insert(0, tf_bert_dir)\n",
+    "import modeling\n",
+    "import tokenization\n",
+    "\n",
+    "tf.reset_default_graph()\n",
+    "\n",
+    "# Process text\n",
+    "tf_tokenizer = tokenization.FullTokenizer(vocab_file=tf_vocab_file)\n",
+    "\n",
+    "# Graph inputs\n",
+    "input_ids_tf, mask_ids_tf, seg_ids_tf = tokenize(input_text, tf_tokenizer)\n",
+    "config = modeling.BertConfig.from_json_file(\n",
+    "    os.path.join(tf_model_dir, 'bert_config.json'))\n",
+    "input_tensor = tf.placeholder(\n",
+    "    dtype=tf.int32,\n",
+    "    shape=[1, None],\n",
+    "    name='input_ids')\n",
+    "mask_tensor = tf.placeholder(\n",
+    "    dtype=tf.int32,\n",
+    "    shape=[1, None],\n",
+    "    name='mask_ids')\n",
+    "seg_tensor = tf.placeholder(\n",
+    "    dtype=tf.int32,\n",
+    "    shape=[1, None],\n",
+    "    name='seg_ids')\n",
+    "tf_model = modeling.BertModel(\n",
+    "    config=config,\n",
+    "    is_training=False,\n",
+    "    input_ids=input_tensor,\n",
+    "    input_mask=mask_tensor,\n",
+    "    token_type_ids=seg_tensor,\n",
+    "    use_one_hot_embeddings=False)\n",
+    "output_layer = tf_model.get_pooled_output()\n",
+    "\n",
+    "# Load tf model\n",
+    "session = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))\n",
+    "vars_to_load = [v for v in tf.global_variables()]\n",
+    "session.run(tf.variables_initializer(var_list=vars_to_load))\n",
+    "saver = tf.train.Saver(vars_to_load)\n",
+    "saver.restore(session, save_path=tf_init_ckpt)\n",
+    "\n",
+    "# TF Embedding\n",
+    "fetches = output_layer\n",
+    "feed_dict  = {\n",
+    "    input_tensor: [input_ids_tf],\n",
+    "    mask_tensor: [mask_ids_tf],\n",
+    "    seg_tensor: [seg_ids_tf]\n",
+    "}\n",
+    "tf_embedding = session.run(fetches=fetches, feed_dict=feed_dict)\n",
+    "print(\"Tensorflow embedding shape: {}\".format(tf_embedding.shape))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Compare Tokenization"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "TOKEN_IDS_PT: [101, 3958, 27227, 2001, 1037, 13997, 11510, 102, 0, 0, 0, 0]\n",
+      "TOKEN_IDS_TF: [101, 3958, 27227, 2001, 1037, 13997, 11510, 102, 0, 0, 0, 0]\n",
+      "SEG_IDS_PT:   [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
+      "SEG_IDS_TF:   [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
+      "MASK_IDS_PT:  [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]\n",
+      "MASK_IDS_TF:  [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"TOKEN_IDS_PT: {}\".format(input_ids_pt))\n",
+    "print(\"TOKEN_IDS_TF: {}\".format(input_ids_tf))\n",
+    "print(\"SEG_IDS_PT:   {}\".format(seg_ids_pt))\n",
+    "print(\"SEG_IDS_TF:   {}\".format(seg_ids_tf))\n",
+    "print(\"MASK_IDS_PT:  {}\".format(mask_ids_pt))\n",
+    "print(\"MASK_IDS_TF:  {}\".format(mask_ids_tf))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Compare Model Weights"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "bert/embeddings/word_embeddings\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (30522, 768) values: [-0.01018257 -0.06154883 -0.02649689 -0.0420608   0.00116716]\n",
+      "TF: shape: (30522, 768) values: [-0.01018257 -0.06154883 -0.02649689 -0.0420608   0.00116716]\n",
+      "\n",
+      "bert/embeddings/token_type_embeddings\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (2, 768) values: [0.00043164 0.01098826 0.00370439 0.00150542 0.00057812]\n",
+      "TF: shape: (2, 768) values: [0.00043164 0.01098826 0.00370439 0.00150542 0.00057812]\n",
+      "\n",
+      "bert/embeddings/position_embeddings\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (512, 768) values: [ 0.01750538 -0.02563101 -0.03664156 -0.02528613  0.00797095]\n",
+      "TF: shape: (512, 768) values: [ 0.01750538 -0.02563101 -0.03664156 -0.02528613  0.00797095]\n",
+      "\n",
+      "bert/embeddings/LayerNorm/beta\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [-0.02591471 -0.0195513   0.02423946  0.08904593 -0.06281059]\n",
+      "TF: shape: (768,) values: [-0.02591471 -0.0195513   0.02423946  0.08904593 -0.06281059]\n",
+      "\n",
+      "bert/embeddings/LayerNorm/gamma\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [0.9260566  0.8851115  0.85807985 0.8616906  0.8937205 ]\n",
+      "TF: shape: (768,) values: [0.9260566  0.8851115  0.85807985 0.8616906  0.8937205 ]\n",
+      "\n",
+      "bert/encoder/layer_0/attention/self/query/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 768) values: [-0.01640572 -0.03257025  0.01046295 -0.04442816 -0.02256124]\n",
+      "TF: shape: (768, 768) values: [-0.01640572 -0.03257025  0.01046295 -0.04442816 -0.02256124]\n",
+      "\n",
+      "bert/encoder/layer_0/attention/self/query/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [ 0.58488506 -0.3312432  -0.43010172  0.37446147 -0.29811692]\n",
+      "TF: shape: (768,) values: [ 0.58488506 -0.3312432  -0.43010172  0.37446147 -0.29811692]\n",
+      "\n",
+      "bert/encoder/layer_0/attention/self/key/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 768) values: [ 0.00807745  0.02652155 -0.01866494  0.01797846  0.00450485]\n",
+      "TF: shape: (768, 768) values: [ 0.00807745  0.02652155 -0.01866494  0.01797846  0.00450485]\n",
+      "\n",
+      "bert/encoder/layer_0/attention/self/key/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [ 0.00104306  0.00035106 -0.0024626  -0.00010567 -0.00119283]\n",
+      "TF: shape: (768,) values: [ 0.00104306  0.00035106 -0.0024626  -0.00010567 -0.00119283]\n",
+      "\n",
+      "bert/encoder/layer_0/attention/self/value/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 768) values: [ 0.01144261 -0.02663044  0.01911472 -0.02206182 -0.00287949]\n",
+      "TF: shape: (768, 768) values: [ 0.01144261 -0.02663044  0.01911472 -0.02206182 -0.00287949]\n",
+      "\n",
+      "bert/encoder/layer_0/attention/self/value/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [-0.01184616 -0.01596605 -0.00251847  0.01736802  0.00449983]\n",
+      "TF: shape: (768,) values: [-0.01184616 -0.01596605 -0.00251847  0.01736802  0.00449983]\n",
+      "\n",
+      "bert/encoder/layer_0/attention/output/dense/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 768) values: [ 0.00581949  0.03170148 -0.06135742 -0.01706108 -0.00759045]\n",
+      "TF: shape: (768, 768) values: [ 0.00581949  0.03170148 -0.06135742 -0.01706108 -0.00759045]\n",
+      "\n",
+      "bert/encoder/layer_0/attention/output/dense/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [ 0.00511063 -0.0166625   0.02812938 -0.01166061  0.01942627]\n",
+      "TF: shape: (768,) values: [ 0.00511063 -0.0166625   0.02812938 -0.01166061  0.01942627]\n",
+      "\n",
+      "bert/encoder/layer_0/attention/output/LayerNorm/beta\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [ 0.25779155 -0.03077853 -0.2772697  -0.38847703  0.36841765]\n",
+      "TF: shape: (768,) values: [ 0.25779155 -0.03077853 -0.2772697  -0.38847703  0.36841765]\n",
+      "\n",
+      "bert/encoder/layer_0/attention/output/LayerNorm/gamma\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [0.9803408  0.959969   0.96368986 0.9603653  0.9801324 ]\n",
+      "TF: shape: (768,) values: [0.9803408  0.959969   0.96368986 0.9603653  0.9801324 ]\n",
+      "\n",
+      "bert/encoder/layer_0/intermediate/dense/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 3072) values: [-0.01010427 -0.060398   -0.01468864  0.00311493  0.02862451]\n",
+      "TF: shape: (768, 3072) values: [-0.01010427 -0.060398   -0.01468864  0.00311493  0.02862451]\n",
+      "\n",
+      "bert/encoder/layer_0/intermediate/dense/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (3072,) values: [-0.11498757 -0.09629171 -0.12399033 -0.129036   -0.06369043]\n",
+      "TF: shape: (3072,) values: [-0.11498757 -0.09629171 -0.12399033 -0.129036   -0.06369043]\n",
+      "\n",
+      "bert/encoder/layer_0/output/dense/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (3072, 768) values: [-0.03710171  0.0648794   0.00758566 -0.05224452 -0.04348791]\n",
+      "TF: shape: (3072, 768) values: [-0.03710171  0.0648794   0.00758566 -0.05224452 -0.04348791]\n",
+      "\n",
+      "bert/encoder/layer_0/output/dense/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [-0.04801027  0.19766568  0.02154854  0.02880666  0.0444298 ]\n",
+      "TF: shape: (768,) values: [-0.04801027  0.19766568  0.02154854  0.02880666  0.0444298 ]\n",
+      "\n",
+      "bert/encoder/layer_0/output/LayerNorm/beta\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [-0.10142924 -0.00499344  0.04274083  0.09324206 -0.10700516]\n",
+      "TF: shape: (768,) values: [-0.10142924 -0.00499344  0.04274083  0.09324206 -0.10700516]\n",
+      "\n",
+      "bert/encoder/layer_0/output/LayerNorm/gamma\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [0.7835125  0.8072406  0.7670588  0.73706394 0.76303864]\n",
+      "TF: shape: (768,) values: [0.7835125  0.8072406  0.7670588  0.73706394 0.76303864]\n",
+      "\n",
+      "bert/encoder/layer_1/attention/self/query/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 768) values: [ 0.03132744 -0.01340016 -0.07761582  0.0655639  -0.00337808]\n",
+      "TF: shape: (768, 768) values: [ 0.03132744 -0.01340016 -0.07761582  0.0655639  -0.00337808]\n",
+      "\n",
+      "bert/encoder/layer_1/attention/self/query/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [-0.27827993  0.17387655 -0.2497937  -0.8809636   0.41262135]\n",
+      "TF: shape: (768,) values: [-0.27827993  0.17387655 -0.2497937  -0.8809636   0.41262135]\n",
+      "\n",
+      "bert/encoder/layer_1/attention/self/key/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 768) values: [-0.03353037  0.04007257  0.05320328 -0.02166729 -0.03581231]\n",
+      "TF: shape: (768, 768) values: [-0.03353037  0.04007257  0.05320328 -0.02166729 -0.03581231]\n",
+      "\n",
+      "bert/encoder/layer_1/attention/self/key/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [-0.00504407  0.00136887 -0.00394336  0.00646125 -0.00148919]\n",
+      "TF: shape: (768,) values: [-0.00504407  0.00136887 -0.00394336  0.00646125 -0.00148919]\n",
+      "\n",
+      "bert/encoder/layer_1/attention/self/value/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 768) values: [-0.00464159  0.06674305 -0.00970626 -0.0276653  -0.01597566]\n",
+      "TF: shape: (768, 768) values: [-0.00464159  0.06674305 -0.00970626 -0.0276653  -0.01597566]\n",
+      "\n",
+      "bert/encoder/layer_1/attention/self/value/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [ 0.00381288  0.02650839 -0.0059689  -0.00508269 -0.01293722]\n",
+      "TF: shape: (768,) values: [ 0.00381288  0.02650839 -0.0059689  -0.00508269 -0.01293722]\n",
+      "\n",
+      "bert/encoder/layer_1/attention/output/dense/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 768) values: [-0.01390745 -0.01100563  0.01303005 -0.01969771  0.0125082 ]\n",
+      "TF: shape: (768, 768) values: [-0.01390745 -0.01100563  0.01303005 -0.01969771  0.0125082 ]\n",
+      "\n",
+      "bert/encoder/layer_1/attention/output/dense/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [0.02946591 0.05715097 0.01293636 0.01920356 0.00805334]\n",
+      "TF: shape: (768,) values: [0.02946591 0.05715097 0.01293636 0.01920356 0.00805334]\n",
+      "\n",
+      "bert/encoder/layer_1/attention/output/LayerNorm/beta\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [ 0.08583715  0.14199966 -0.0856637  -0.18797271  0.21056814]\n",
+      "TF: shape: (768,) values: [ 0.08583715  0.14199966 -0.0856637  -0.18797271  0.21056814]\n",
+      "\n",
+      "bert/encoder/layer_1/attention/output/LayerNorm/gamma\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [0.896962   0.87148863 0.8531161  0.8690647  0.9488987 ]\n",
+      "TF: shape: (768,) values: [0.896962   0.87148863 0.8531161  0.8690647  0.9488987 ]\n",
+      "\n",
+      "bert/encoder/layer_1/intermediate/dense/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 3072) values: [ 0.01841293 -0.02650284 -0.09708428 -0.01734244 -0.05529237]\n",
+      "TF: shape: (768, 3072) values: [ 0.01841293 -0.02650284 -0.09708428 -0.01734244 -0.05529237]\n",
+      "\n",
+      "bert/encoder/layer_1/intermediate/dense/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (3072,) values: [-0.15203774 -0.10449131 -0.08440229 -0.09323178 -0.08511415]\n",
+      "TF: shape: (3072,) values: [-0.15203774 -0.10449131 -0.08440229 -0.09323178 -0.08511415]\n",
+      "\n",
+      "bert/encoder/layer_1/output/dense/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (3072, 768) values: [-0.02372648  0.03326349  0.08291997 -0.01519038  0.01868557]\n",
+      "TF: shape: (3072, 768) values: [-0.02372648  0.03326349  0.08291997 -0.01519038  0.01868557]\n",
+      "\n",
+      "bert/encoder/layer_1/output/dense/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [-0.02514724  0.09868994 -0.027811    0.03749462  0.01086514]\n",
+      "TF: shape: (768,) values: [-0.02514724  0.09868994 -0.027811    0.03749462  0.01086514]\n",
+      "\n",
+      "bert/encoder/layer_1/output/LayerNorm/beta\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [-0.07662535 -0.10506564  0.03191236  0.07633785 -0.11187791]\n",
+      "TF: shape: (768,) values: [-0.07662535 -0.10506564  0.03191236  0.07633785 -0.11187791]\n",
+      "\n",
+      "bert/encoder/layer_1/output/LayerNorm/gamma\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [0.9017883  0.8868776  0.8862677  0.85865664 0.87496454]\n",
+      "TF: shape: (768,) values: [0.9017883  0.8868776  0.8862677  0.85865664 0.87496454]\n",
+      "\n",
+      "bert/encoder/layer_2/attention/self/query/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 768) values: [ 0.08433672  0.09580533  0.07543895 -0.01126779 -0.01354045]\n",
+      "TF: shape: (768, 768) values: [ 0.08433672  0.09580533  0.07543895 -0.01126779 -0.01354045]\n",
+      "\n",
+      "bert/encoder/layer_2/attention/self/query/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [ 0.0371241   0.03406003  0.27713948 -0.21613775 -0.05275448]\n",
+      "TF: shape: (768,) values: [ 0.0371241   0.03406003  0.27713948 -0.21613775 -0.05275448]\n",
+      "\n",
+      "bert/encoder/layer_2/attention/self/key/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 768) values: [ 0.04794507  0.02517631 -0.01319554 -0.02094732  0.09073472]\n",
+      "TF: shape: (768, 768) values: [ 0.04794507  0.02517631 -0.01319554 -0.02094732  0.09073472]\n",
+      "\n",
+      "bert/encoder/layer_2/attention/self/key/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [-0.00037404 -0.00125881 -0.00114734 -0.00157741  0.00037122]\n",
+      "TF: shape: (768,) values: [-0.00037404 -0.00125881 -0.00114734 -0.00157741  0.00037122]\n",
+      "\n",
+      "bert/encoder/layer_2/attention/self/value/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 768) values: [-0.01119406 -0.01488636 -0.02960914  0.04746444  0.00428481]\n",
+      "TF: shape: (768, 768) values: [-0.01119406 -0.01488636 -0.02960914  0.04746444  0.00428481]\n",
+      "\n",
+      "bert/encoder/layer_2/attention/self/value/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [-0.02728729  0.04979054  0.08326469  0.04150949  0.600959  ]\n",
+      "TF: shape: (768,) values: [-0.02728729  0.04979054  0.08326469  0.04150949  0.600959  ]\n",
+      "\n",
+      "bert/encoder/layer_2/attention/output/dense/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 768) values: [ 0.00517425  0.01197957  0.0393172  -0.0063884  -0.02673388]\n",
+      "TF: shape: (768, 768) values: [ 0.00517425  0.01197957  0.0393172  -0.0063884  -0.02673388]\n",
+      "\n",
+      "bert/encoder/layer_2/attention/output/dense/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [ 0.01754025  0.1226335  -0.05733554  0.06844623  0.00879776]\n",
+      "TF: shape: (768,) values: [ 0.01754025  0.1226335  -0.05733554  0.06844623  0.00879776]\n",
+      "\n",
+      "bert/encoder/layer_2/attention/output/LayerNorm/beta\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [ 0.1490809   0.12386955 -0.19382021 -0.26515856  0.32723007]\n",
+      "TF: shape: (768,) values: [ 0.1490809   0.12386955 -0.19382021 -0.26515856  0.32723007]\n",
+      "\n",
+      "bert/encoder/layer_2/attention/output/LayerNorm/gamma\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [0.8983343  0.88877076 0.86283594 0.8584952  0.9587886 ]\n",
+      "TF: shape: (768,) values: [0.8983343  0.88877076 0.86283594 0.8584952  0.9587886 ]\n",
+      "\n",
+      "bert/encoder/layer_2/intermediate/dense/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 3072) values: [-0.01619919  0.00662888  0.01492284 -0.01280748  0.01318596]\n",
+      "TF: shape: (768, 3072) values: [-0.01619919  0.00662888  0.01492284 -0.01280748  0.01318596]\n",
+      "\n",
+      "bert/encoder/layer_2/intermediate/dense/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (3072,) values: [-0.08474881 -0.12850781 -0.11550345 -0.09513011 -0.02519853]\n",
+      "TF: shape: (3072,) values: [-0.08474881 -0.12850781 -0.11550345 -0.09513011 -0.02519853]\n",
+      "\n",
+      "bert/encoder/layer_2/output/dense/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (3072, 768) values: [-0.07225161 -0.0129784   0.00618811 -0.01593373 -0.02160194]\n",
+      "TF: shape: (3072, 768) values: [-0.07225161 -0.0129784   0.00618811 -0.01593373 -0.02160194]\n",
+      "\n",
+      "bert/encoder/layer_2/output/dense/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [-0.06319264  0.06169628 -0.03041368  0.00924282  0.06277442]\n",
+      "TF: shape: (768,) values: [-0.06319264  0.06169628 -0.03041368  0.00924282  0.06277442]\n",
+      "\n",
+      "bert/encoder/layer_2/output/LayerNorm/beta\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [-0.1139038  -0.11665309  0.07883061  0.07796711 -0.14219187]\n",
+      "TF: shape: (768,) values: [-0.1139038  -0.11665309  0.07883061  0.07796711 -0.14219187]\n",
+      "\n",
+      "bert/encoder/layer_2/output/LayerNorm/gamma\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [0.8813261  0.85744697 0.8511922  0.85261875 0.8329574 ]\n",
+      "TF: shape: (768,) values: [0.8813261  0.85744697 0.8511922  0.85261875 0.8329574 ]\n",
+      "\n",
+      "bert/encoder/layer_3/attention/self/query/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 768) values: [ 0.05855456 -0.00111438 -0.00828963  0.04117409 -0.07591715]\n",
+      "TF: shape: (768, 768) values: [ 0.05855456 -0.00111438 -0.00828963  0.04117409 -0.07591715]\n",
+      "\n",
+      "bert/encoder/layer_3/attention/self/query/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [ 0.09740101 -0.19290674  0.04332267  0.17937997 -0.08023558]\n",
+      "TF: shape: (768,) values: [ 0.09740101 -0.19290674  0.04332267  0.17937997 -0.08023558]\n",
+      "\n",
+      "bert/encoder/layer_3/attention/self/key/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 768) values: [ 0.02562077  0.02507281 -0.03361562  0.05613289 -0.05435724]\n",
+      "TF: shape: (768, 768) values: [ 0.02562077  0.02507281 -0.03361562  0.05613289 -0.05435724]\n",
+      "\n",
+      "bert/encoder/layer_3/attention/self/key/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [ 0.00188639 -0.00379197 -0.01020415  0.00969649 -0.00094182]\n",
+      "TF: shape: (768,) values: [ 0.00188639 -0.00379197 -0.01020415  0.00969649 -0.00094182]\n",
+      "\n",
+      "bert/encoder/layer_3/attention/self/value/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 768) values: [-0.00539032  0.00959642  0.01325458  0.00490616  0.0129908 ]\n",
+      "TF: shape: (768, 768) values: [-0.00539032  0.00959642  0.01325458  0.00490616  0.0129908 ]\n",
+      "\n",
+      "bert/encoder/layer_3/attention/self/value/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [0.04573824 0.05405985 0.00681163 0.00655945 0.01141771]\n",
+      "TF: shape: (768,) values: [0.04573824 0.05405985 0.00681163 0.00655945 0.01141771]\n",
+      "\n",
+      "bert/encoder/layer_3/attention/output/dense/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 768) values: [ 0.01850341  0.03148198  0.02705758 -0.0004669   0.01367511]\n",
+      "TF: shape: (768, 768) values: [ 0.01850341  0.03148198  0.02705758 -0.0004669   0.01367511]\n",
+      "\n",
+      "bert/encoder/layer_3/attention/output/dense/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [ 0.01981483  0.03566506 -0.05016088  0.02958186  0.04989756]\n",
+      "TF: shape: (768,) values: [ 0.01981483  0.03566506 -0.05016088  0.02958186  0.04989756]\n",
+      "\n",
+      "bert/encoder/layer_3/attention/output/LayerNorm/beta\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [ 0.09815404  0.00063774 -0.01257733 -0.26485074  0.22568701]\n",
+      "TF: shape: (768,) values: [ 0.09815404  0.00063774 -0.01257733 -0.26485074  0.22568701]\n",
+      "\n",
+      "bert/encoder/layer_3/attention/output/LayerNorm/gamma\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [0.91457725 0.88453823 0.8340887  0.84203583 0.95247847]\n",
+      "TF: shape: (768,) values: [0.91457725 0.88453823 0.8340887  0.84203583 0.95247847]\n",
+      "\n",
+      "bert/encoder/layer_3/intermediate/dense/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 3072) values: [-0.02733567  0.03307878 -0.01331292 -0.00032527  0.03252084]\n",
+      "TF: shape: (768, 3072) values: [-0.02733567  0.03307878 -0.01331292 -0.00032527  0.03252084]\n",
+      "\n",
+      "bert/encoder/layer_3/intermediate/dense/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (3072,) values: [-0.11436842 -0.15038085 -0.07842971  0.01335877 -0.09492484]\n",
+      "TF: shape: (3072,) values: [-0.11436842 -0.15038085 -0.07842971  0.01335877 -0.09492484]\n",
+      "\n",
+      "bert/encoder/layer_3/output/dense/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (3072, 768) values: [-0.01751153  0.01631314 -0.02660011  0.03569947 -0.01394763]\n",
+      "TF: shape: (3072, 768) values: [-0.01751153  0.01631314 -0.02660011  0.03569947 -0.01394763]\n",
+      "\n",
+      "bert/encoder/layer_3/output/dense/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [-0.03873252  0.08414765 -0.0399323   0.01997361  0.12924597]\n",
+      "TF: shape: (768,) values: [-0.03873252  0.08414765 -0.0399323   0.01997361  0.12924597]\n",
+      "\n",
+      "bert/encoder/layer_3/output/LayerNorm/beta\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [-0.08049371 -0.06923949 -0.03357155  0.05231095 -0.09717073]\n",
+      "TF: shape: (768,) values: [-0.08049371 -0.06923949 -0.03357155  0.05231095 -0.09717073]\n",
+      "\n",
+      "bert/encoder/layer_3/output/LayerNorm/gamma\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [0.827748   0.83012533 0.82399255 0.81772    0.80794513]\n",
+      "TF: shape: (768,) values: [0.827748   0.83012533 0.82399255 0.81772    0.80794513]\n",
+      "\n",
+      "bert/encoder/layer_4/attention/self/query/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 768) values: [ 0.08296382  0.02076941  0.06525186 -0.02659729  0.03491377]\n",
+      "TF: shape: (768, 768) values: [ 0.08296382  0.02076941  0.06525186 -0.02659729  0.03491377]\n",
+      "\n",
+      "bert/encoder/layer_4/attention/self/query/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [ 0.07045844 -0.13412629 -0.0514146   0.00061329  0.1248519 ]\n",
+      "TF: shape: (768,) values: [ 0.07045844 -0.13412629 -0.0514146   0.00061329  0.1248519 ]\n",
+      "\n",
+      "bert/encoder/layer_4/attention/self/key/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 768) values: [ 0.06941643  0.08133814 -0.0453992   0.0668715  -0.06014847]\n",
+      "TF: shape: (768, 768) values: [ 0.06941643  0.08133814 -0.0453992   0.0668715  -0.06014847]\n",
+      "\n",
+      "bert/encoder/layer_4/attention/self/key/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [-0.00588725 -0.00235185  0.00281131  0.00173088 -0.00546653]\n",
+      "TF: shape: (768,) values: [-0.00588725 -0.00235185  0.00281131  0.00173088 -0.00546653]\n",
+      "\n",
+      "bert/encoder/layer_4/attention/self/value/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 768) values: [ 0.06889665  0.06645385  0.01232084  0.0132611  -0.01595679]\n",
+      "TF: shape: (768, 768) values: [ 0.06889665  0.06645385  0.01232084  0.0132611  -0.01595679]\n",
+      "\n",
+      "bert/encoder/layer_4/attention/self/value/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [-0.01126871 -0.02704018  0.0301532   0.02332082 -0.04233487]\n",
+      "TF: shape: (768,) values: [-0.01126871 -0.02704018  0.0301532   0.02332082 -0.04233487]\n",
+      "\n",
+      "bert/encoder/layer_4/attention/output/dense/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 768) values: [ 0.02285513 -0.04172142 -0.0146292   0.04862929 -0.0442014 ]\n",
+      "TF: shape: (768, 768) values: [ 0.02285513 -0.04172142 -0.0146292   0.04862929 -0.0442014 ]\n",
+      "\n",
+      "bert/encoder/layer_4/attention/output/dense/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [ 0.03054528  0.00479777 -0.02729505 -0.0325212  -0.00525727]\n",
+      "TF: shape: (768,) values: [ 0.03054528  0.00479777 -0.02729505 -0.0325212  -0.00525727]\n",
+      "\n",
+      "bert/encoder/layer_4/attention/output/LayerNorm/beta\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [ 0.00903359  0.0052285  -0.02841488 -0.22355485  0.28281343]\n",
+      "TF: shape: (768,) values: [ 0.00903359  0.0052285  -0.02841488 -0.22355485  0.28281343]\n",
+      "\n",
+      "bert/encoder/layer_4/attention/output/LayerNorm/gamma\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [0.8849676  0.86927813 0.8114595  0.80269504 0.94864094]\n",
+      "TF: shape: (768,) values: [0.8849676  0.86927813 0.8114595  0.80269504 0.94864094]\n",
+      "\n",
+      "bert/encoder/layer_4/intermediate/dense/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 3072) values: [-0.00639783  0.06198016 -0.03184223  0.00485356 -0.02453273]\n",
+      "TF: shape: (768, 3072) values: [-0.00639783  0.06198016 -0.03184223  0.00485356 -0.02453273]\n",
+      "\n",
+      "bert/encoder/layer_4/intermediate/dense/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (3072,) values: [-0.08770327 -0.11779705 -0.11764182 -0.00192611 -0.1335473 ]\n",
+      "TF: shape: (3072,) values: [-0.08770327 -0.11779705 -0.11764182 -0.00192611 -0.1335473 ]\n",
+      "\n",
+      "bert/encoder/layer_4/output/dense/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (3072, 768) values: [-0.05421264  0.0221118  -0.02674172  0.03672203 -0.02399626]\n",
+      "TF: shape: (3072, 768) values: [-0.05421264  0.0221118  -0.02674172  0.03672203 -0.02399626]\n",
+      "\n",
+      "bert/encoder/layer_4/output/dense/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [-0.05068972  0.04838871  0.01156022  0.05381602  0.08857913]\n",
+      "TF: shape: (768,) values: [-0.05068972  0.04838871  0.01156022  0.05381602  0.08857913]\n",
+      "\n",
+      "bert/encoder/layer_4/output/LayerNorm/beta\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [-0.04338909 -0.0781464  -0.01518662  0.04936362 -0.12378412]\n",
+      "TF: shape: (768,) values: [-0.04338909 -0.0781464  -0.01518662  0.04936362 -0.12378412]\n",
+      "\n",
+      "bert/encoder/layer_4/output/LayerNorm/gamma\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [0.8734387 0.8576282 0.8339444 0.8450325 0.8105372]\n",
+      "TF: shape: (768,) values: [0.8734387 0.8576282 0.8339444 0.8450325 0.8105372]\n",
+      "\n",
+      "bert/encoder/layer_5/attention/self/query/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 768) values: [-0.00858843 -0.03920127  0.02552994 -0.02786552  0.02436485]\n",
+      "TF: shape: (768, 768) values: [-0.00858843 -0.03920127  0.02552994 -0.02786552  0.02436485]\n",
+      "\n",
+      "bert/encoder/layer_5/attention/self/query/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [-0.00859117 -0.01642405 -0.04391079  0.01085692  0.02925887]\n",
+      "TF: shape: (768,) values: [-0.00859117 -0.01642405 -0.04391079  0.01085692  0.02925887]\n",
+      "\n",
+      "bert/encoder/layer_5/attention/self/key/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 768) values: [ 0.00352847  0.02330176 -0.00369894 -0.03904612  0.00294574]\n",
+      "TF: shape: (768, 768) values: [ 0.00352847  0.02330176 -0.00369894 -0.03904612  0.00294574]\n",
+      "\n",
+      "bert/encoder/layer_5/attention/self/key/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [-0.01087186 -0.01176561  0.00016575 -0.01163023  0.00946616]\n",
+      "TF: shape: (768,) values: [-0.01087186 -0.01176561  0.00016575 -0.01163023  0.00946616]\n",
+      "\n",
+      "bert/encoder/layer_5/attention/self/value/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 768) values: [ 0.06134222  0.04238288  0.02796064 -0.01284983  0.03683741]\n",
+      "TF: shape: (768, 768) values: [ 0.06134222  0.04238288  0.02796064 -0.01284983  0.03683741]\n",
+      "\n",
+      "bert/encoder/layer_5/attention/self/value/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [ 0.05061118 -0.02954445 -0.0034053  -0.00025261  0.0437019 ]\n",
+      "TF: shape: (768,) values: [ 0.05061118 -0.02954445 -0.0034053  -0.00025261  0.0437019 ]\n",
+      "\n",
+      "bert/encoder/layer_5/attention/output/dense/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 768) values: [-0.00739815  0.0533964  -0.03736389 -0.04999201  0.01693069]\n",
+      "TF: shape: (768, 768) values: [-0.00739815  0.0533964  -0.03736389 -0.04999201  0.01693069]\n",
+      "\n",
+      "bert/encoder/layer_5/attention/output/dense/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [-0.0021682   0.01711399 -0.04201518  0.01605333  0.00552063]\n",
+      "TF: shape: (768,) values: [-0.0021682   0.01711399 -0.04201518  0.01605333  0.00552063]\n",
+      "\n",
+      "bert/encoder/layer_5/attention/output/LayerNorm/beta\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [-0.06841327 -0.0146848   0.09792476 -0.23284538  0.2785602 ]\n",
+      "TF: shape: (768,) values: [-0.06841327 -0.0146848   0.09792476 -0.23284538  0.2785602 ]\n",
+      "\n",
+      "bert/encoder/layer_5/attention/output/LayerNorm/gamma\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [0.8908311  0.87884724 0.81637293 0.8047641  0.96539867]\n",
+      "TF: shape: (768,) values: [0.8908311  0.87884724 0.81637293 0.8047641  0.96539867]\n",
+      "\n",
+      "bert/encoder/layer_5/intermediate/dense/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 3072) values: [-0.03246041  0.07251058 -0.08201726  0.00772481  0.02532209]\n",
+      "TF: shape: (768, 3072) values: [-0.03246041  0.07251058 -0.08201726  0.00772481  0.02532209]\n",
+      "\n",
+      "bert/encoder/layer_5/intermediate/dense/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (3072,) values: [-0.09689714 -0.27696273 -0.13047501 -0.10892326 -0.1057625 ]\n",
+      "TF: shape: (3072,) values: [-0.09689714 -0.27696273 -0.13047501 -0.10892326 -0.1057625 ]\n",
+      "\n",
+      "bert/encoder/layer_5/output/dense/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (3072, 768) values: [ 0.0642072  -0.01738782 -0.05095377  0.00523853  0.04425264]\n",
+      "TF: shape: (3072, 768) values: [ 0.0642072  -0.01738782 -0.05095377  0.00523853  0.04425264]\n",
+      "\n",
+      "bert/encoder/layer_5/output/dense/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [-0.0007217   0.06006297  0.0016595   0.03848181  0.06703516]\n",
+      "TF: shape: (768,) values: [-0.0007217   0.06006297  0.0016595   0.03848181  0.06703516]\n",
+      "\n",
+      "bert/encoder/layer_5/output/LayerNorm/beta\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [-0.00278729 -0.05594506 -0.0631047   0.06023621 -0.18672828]\n",
+      "TF: shape: (768,) values: [-0.00278729 -0.05594506 -0.0631047   0.06023621 -0.18672828]\n",
+      "\n",
+      "bert/encoder/layer_5/output/LayerNorm/gamma\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [0.8621183  0.8515807  0.82654256 0.81729776 0.7985204 ]\n",
+      "TF: shape: (768,) values: [0.8621183  0.8515807  0.82654256 0.81729776 0.7985204 ]\n",
+      "\n",
+      "bert/encoder/layer_6/attention/self/query/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 768) values: [-0.02527807 -0.01429243  0.01467054  0.08624706 -0.00188593]\n",
+      "TF: shape: (768, 768) values: [-0.02527807 -0.01429243  0.01467054  0.08624706 -0.00188593]\n",
+      "\n",
+      "bert/encoder/layer_6/attention/self/query/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [-0.17319514  0.27564248  0.16801168 -0.10946485  0.1643271 ]\n",
+      "TF: shape: (768,) values: [-0.17319514  0.27564248  0.16801168 -0.10946485  0.1643271 ]\n",
+      "\n",
+      "bert/encoder/layer_6/attention/self/key/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 768) values: [ 0.05886372  0.00706217  0.0398422   0.00882155 -0.04571463]\n",
+      "TF: shape: (768, 768) values: [ 0.05886372  0.00706217  0.0398422   0.00882155 -0.04571463]\n",
+      "\n",
+      "bert/encoder/layer_6/attention/self/key/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [-0.00424696 -0.0001192   0.0046079  -0.00315606  0.00434314]\n",
+      "TF: shape: (768,) values: [-0.00424696 -0.0001192   0.0046079  -0.00315606  0.00434314]\n",
+      "\n",
+      "bert/encoder/layer_6/attention/self/value/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 768) values: [-0.01720381  0.01170722  0.02346902 -0.02284313 -0.03173028]\n",
+      "TF: shape: (768, 768) values: [-0.01720381  0.01170722  0.02346902 -0.02284313 -0.03173028]\n",
+      "\n",
+      "bert/encoder/layer_6/attention/self/value/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [-0.03492057  0.01813157 -0.00182878 -0.01420629 -0.00508944]\n",
+      "TF: shape: (768,) values: [-0.03492057  0.01813157 -0.00182878 -0.01420629 -0.00508944]\n",
+      "\n",
+      "bert/encoder/layer_6/attention/output/dense/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 768) values: [ 0.0323688  -0.00689882  0.07379091  0.01121114 -0.02059202]\n",
+      "TF: shape: (768, 768) values: [ 0.0323688  -0.00689882  0.07379091  0.01121114 -0.02059202]\n",
+      "\n",
+      "bert/encoder/layer_6/attention/output/dense/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [-0.00648672 -0.05935453 -0.05673229 -0.01152384 -0.02766573]\n",
+      "TF: shape: (768,) values: [-0.00648672 -0.05935453 -0.05673229 -0.01152384 -0.02766573]\n",
+      "\n",
+      "bert/encoder/layer_6/attention/output/LayerNorm/beta\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [-0.06793639  0.03157783  0.15647687 -0.15025291  0.14727171]\n",
+      "TF: shape: (768,) values: [-0.06793639  0.03157783  0.15647687 -0.15025291  0.14727171]\n",
+      "\n",
+      "bert/encoder/layer_6/attention/output/LayerNorm/gamma\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [0.8882361  0.8704905  0.80289173 0.77365315 0.92333615]\n",
+      "TF: shape: (768,) values: [0.8882361  0.8704905  0.80289173 0.77365315 0.92333615]\n",
+      "\n",
+      "bert/encoder/layer_6/intermediate/dense/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 3072) values: [ 0.04492201  0.05160861  0.09041415 -0.00742628  0.048133  ]\n",
+      "TF: shape: (768, 3072) values: [ 0.04492201  0.05160861  0.09041415 -0.00742628  0.048133  ]\n",
+      "\n",
+      "bert/encoder/layer_6/intermediate/dense/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (3072,) values: [-0.09301704 -0.158612   -0.10633879 -0.09706812 -0.17319229]\n",
+      "TF: shape: (3072,) values: [-0.09301704 -0.158612   -0.10633879 -0.09706812 -0.17319229]\n",
+      "\n",
+      "bert/encoder/layer_6/output/dense/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (3072, 768) values: [-0.00085372 -0.00974195  0.00684915  0.00038686  0.06610142]\n",
+      "TF: shape: (3072, 768) values: [-0.00085372 -0.00974195  0.00684915  0.00038686  0.06610142]\n",
+      "\n",
+      "bert/encoder/layer_6/output/dense/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [-0.03254414  0.05681704  0.03720434  0.01936359  0.09134153]\n",
+      "TF: shape: (768,) values: [-0.03254414  0.05681704  0.03720434  0.01936359  0.09134153]\n",
+      "\n",
+      "bert/encoder/layer_6/output/LayerNorm/beta\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [-0.0117129  -0.03209404 -0.08646043  0.03760341 -0.13841423]\n",
+      "TF: shape: (768,) values: [-0.0117129  -0.03209404 -0.08646043  0.03760341 -0.13841423]\n",
+      "\n",
+      "bert/encoder/layer_6/output/LayerNorm/gamma\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [0.8674175  0.8657014  0.8151861  0.82301307 0.8305737 ]\n",
+      "TF: shape: (768,) values: [0.8674175  0.8657014  0.8151861  0.82301307 0.8305737 ]\n",
+      "\n",
+      "bert/encoder/layer_7/attention/self/query/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 768) values: [-0.00075523 -0.01501983  0.04090893  0.01884826  0.04670674]\n",
+      "TF: shape: (768, 768) values: [-0.00075523 -0.01501983  0.04090893  0.01884826  0.04670674]\n",
+      "\n",
+      "bert/encoder/layer_7/attention/self/query/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [ 0.0010344  -0.00423982  0.3117479   0.04494623 -0.01260845]\n",
+      "TF: shape: (768,) values: [ 0.0010344  -0.00423982  0.3117479   0.04494623 -0.01260845]\n",
+      "\n",
+      "bert/encoder/layer_7/attention/self/key/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 768) values: [ 0.02781927 -0.00906972  0.02121989  0.0298591   0.05854786]\n",
+      "TF: shape: (768, 768) values: [ 0.02781927 -0.00906972  0.02121989  0.0298591   0.05854786]\n",
+      "\n",
+      "bert/encoder/layer_7/attention/self/key/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [-0.00074918  0.00731079  0.00089338  0.00345652  0.00043817]\n",
+      "TF: shape: (768,) values: [-0.00074918  0.00731079  0.00089338  0.00345652  0.00043817]\n",
+      "\n",
+      "bert/encoder/layer_7/attention/self/value/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 768) values: [-0.01080035 -0.03468366  0.03167168  0.01583073  0.0327719 ]\n",
+      "TF: shape: (768, 768) values: [-0.01080035 -0.03468366  0.03167168  0.01583073  0.0327719 ]\n",
+      "\n",
+      "bert/encoder/layer_7/attention/self/value/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [-0.02824226  0.01605172  0.00067929 -0.04553111  0.0076044 ]\n",
+      "TF: shape: (768,) values: [-0.02824226  0.01605172  0.00067929 -0.04553111  0.0076044 ]\n",
+      "\n",
+      "bert/encoder/layer_7/attention/output/dense/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 768) values: [-0.05496112  0.01006968  0.02206531 -0.01873116  0.02149118]\n",
+      "TF: shape: (768, 768) values: [-0.05496112  0.01006968  0.02206531 -0.01873116  0.02149118]\n",
+      "\n",
+      "bert/encoder/layer_7/attention/output/dense/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [ 0.00349772 -0.05831751 -0.0594084  -0.0342187   0.02965918]\n",
+      "TF: shape: (768,) values: [ 0.00349772 -0.05831751 -0.0594084  -0.0342187   0.02965918]\n",
+      "\n",
+      "bert/encoder/layer_7/attention/output/LayerNorm/beta\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [-0.02826844  0.04427591  0.05678326 -0.0475907   0.16136196]\n",
+      "TF: shape: (768,) values: [-0.02826844  0.04427591  0.05678326 -0.0475907   0.16136196]\n",
+      "\n",
+      "bert/encoder/layer_7/attention/output/LayerNorm/gamma\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [0.8742141  0.870608   0.79147685 0.7595279  0.9223656 ]\n",
+      "TF: shape: (768,) values: [0.8742141  0.870608   0.79147685 0.7595279  0.9223656 ]\n",
+      "\n",
+      "bert/encoder/layer_7/intermediate/dense/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 3072) values: [ 0.03598932 -0.12225644  0.03019998  0.05691092  0.03717208]\n",
+      "TF: shape: (768, 3072) values: [ 0.03598932 -0.12225644  0.03019998  0.05691092  0.03717208]\n",
+      "\n",
+      "bert/encoder/layer_7/intermediate/dense/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (3072,) values: [-0.12465011 -0.08639494 -0.06206005 -0.08012587 -0.08773767]\n",
+      "TF: shape: (3072,) values: [-0.12465011 -0.08639494 -0.06206005 -0.08012587 -0.08773767]\n",
+      "\n",
+      "bert/encoder/layer_7/output/dense/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (3072, 768) values: [-0.02190432 -0.02279165  0.03279508  0.01011065 -0.07793335]\n",
+      "TF: shape: (3072, 768) values: [-0.02190432 -0.02279165  0.03279508  0.01011065 -0.07793335]\n",
+      "\n",
+      "bert/encoder/layer_7/output/dense/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [-0.04282642  0.03700675  0.06142357 -0.04787201  0.02958163]\n",
+      "TF: shape: (768,) values: [-0.04282642  0.03700675  0.06142357 -0.04787201  0.02958163]\n",
+      "\n",
+      "bert/encoder/layer_7/output/LayerNorm/beta\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [-0.03142036 -0.04358427 -0.05132087 -0.01788123 -0.16399944]\n",
+      "TF: shape: (768,) values: [-0.03142036 -0.04358427 -0.05132087 -0.01788123 -0.16399944]\n",
+      "\n",
+      "bert/encoder/layer_7/output/LayerNorm/gamma\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [0.83858097 0.8179645  0.80693793 0.81225365 0.7844832 ]\n",
+      "TF: shape: (768,) values: [0.83858097 0.8179645  0.80693793 0.81225365 0.7844832 ]\n",
+      "\n",
+      "bert/encoder/layer_8/attention/self/query/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 768) values: [0.0448719  0.02289526 0.03083764 0.03048073 0.02436891]\n",
+      "TF: shape: (768, 768) values: [0.0448719  0.02289526 0.03083764 0.03048073 0.02436891]\n",
+      "\n",
+      "bert/encoder/layer_8/attention/self/query/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [-0.25132924 -0.23753347  0.02581017  0.00901509  0.18424493]\n",
+      "TF: shape: (768,) values: [-0.25132924 -0.23753347  0.02581017  0.00901509  0.18424493]\n",
+      "\n",
+      "bert/encoder/layer_8/attention/self/key/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 768) values: [-0.01999719  0.00711403  0.03949134 -0.0102224   0.03152475]\n",
+      "TF: shape: (768, 768) values: [-0.01999719  0.00711403  0.03949134 -0.0102224   0.03152475]\n",
+      "\n",
+      "bert/encoder/layer_8/attention/self/key/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [ 5.5668897e-05  3.4638541e-03 -1.7605867e-03 -6.1321147e-03\n",
+      " -4.4074579e-04]\n",
+      "TF: shape: (768,) values: [ 5.5668897e-05  3.4638541e-03 -1.7605867e-03 -6.1321147e-03\n",
+      " -4.4074579e-04]\n",
+      "\n",
+      "bert/encoder/layer_8/attention/self/value/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 768) values: [-0.00736056 -0.01795213  0.00104576 -0.00034653  0.03190543]\n",
+      "TF: shape: (768, 768) values: [-0.00736056 -0.01795213  0.00104576 -0.00034653  0.03190543]\n",
+      "\n",
+      "bert/encoder/layer_8/attention/self/value/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [ 0.02892835  0.00642501 -0.03608712  0.00264269 -0.0245198 ]\n",
+      "TF: shape: (768,) values: [ 0.02892835  0.00642501 -0.03608712  0.00264269 -0.0245198 ]\n",
+      "\n",
+      "bert/encoder/layer_8/attention/output/dense/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 768) values: [ 0.03971623  0.05307067 -0.01298818  0.00946693 -0.00121235]\n",
+      "TF: shape: (768, 768) values: [ 0.03971623  0.05307067 -0.01298818  0.00946693 -0.00121235]\n",
+      "\n",
+      "bert/encoder/layer_8/attention/output/dense/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [ 0.01468131 -0.05406622 -0.06289103  0.004484    0.0240819 ]\n",
+      "TF: shape: (768,) values: [ 0.01468131 -0.05406622 -0.06289103  0.004484    0.0240819 ]\n",
+      "\n",
+      "bert/encoder/layer_8/attention/output/LayerNorm/beta\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [-0.06004262  0.0457275   0.08688109 -0.14416659 -0.05500487]\n",
+      "TF: shape: (768,) values: [-0.06004262  0.0457275   0.08688109 -0.14416659 -0.05500487]\n",
+      "\n",
+      "bert/encoder/layer_8/attention/output/LayerNorm/gamma\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [0.8907534  0.89116573 0.811639   0.7810443  0.9045574 ]\n",
+      "TF: shape: (768,) values: [0.8907534  0.89116573 0.811639   0.7810443  0.9045574 ]\n",
+      "\n",
+      "bert/encoder/layer_8/intermediate/dense/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 3072) values: [-0.01962814 -0.01482586 -0.02292624  0.03397145  0.02457482]\n",
+      "TF: shape: (768, 3072) values: [-0.01962814 -0.01482586 -0.02292624  0.03397145  0.02457482]\n",
+      "\n",
+      "bert/encoder/layer_8/intermediate/dense/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (3072,) values: [-0.08129632 -0.1691108  -0.10681771 -0.10392351 -0.13120006]\n",
+      "TF: shape: (3072,) values: [-0.08129632 -0.1691108  -0.10681771 -0.10392351 -0.13120006]\n",
+      "\n",
+      "bert/encoder/layer_8/output/dense/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (3072, 768) values: [-0.04683433 -0.02690669  0.02979059  0.02223369 -0.00130287]\n",
+      "TF: shape: (3072, 768) values: [-0.04683433 -0.02690669  0.02979059  0.02223369 -0.00130287]\n",
+      "\n",
+      "bert/encoder/layer_8/output/dense/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [-0.09155537 -0.04465394  0.05649116 -0.09628641  0.11875238]\n",
+      "TF: shape: (768,) values: [-0.09155537 -0.04465394  0.05649116 -0.09628641  0.11875238]\n",
+      "\n",
+      "bert/encoder/layer_8/output/LayerNorm/beta\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [-0.06043394 -0.06657387 -0.05341128 -0.00374733 -0.10855272]\n",
+      "TF: shape: (768,) values: [-0.06043394 -0.06657387 -0.05341128 -0.00374733 -0.10855272]\n",
+      "\n",
+      "bert/encoder/layer_8/output/LayerNorm/gamma\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [0.84467345 0.84421015 0.82582206 0.84553087 0.8207573 ]\n",
+      "TF: shape: (768,) values: [0.84467345 0.84421015 0.82582206 0.84553087 0.8207573 ]\n",
+      "\n",
+      "bert/encoder/layer_9/attention/self/query/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 768) values: [ 0.08004542 -0.0143706  -0.04219061 -0.05175152 -0.01147588]\n",
+      "TF: shape: (768, 768) values: [ 0.08004542 -0.0143706  -0.04219061 -0.05175152 -0.01147588]\n",
+      "\n",
+      "bert/encoder/layer_9/attention/self/query/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [-0.14508031  0.40926442 -0.3281781  -0.02869792 -0.26104516]\n",
+      "TF: shape: (768,) values: [-0.14508031  0.40926442 -0.3281781  -0.02869792 -0.26104516]\n",
+      "\n",
+      "bert/encoder/layer_9/attention/self/key/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 768) values: [-0.01337681  0.00615428 -0.0455939   0.03379053 -0.01992556]\n",
+      "TF: shape: (768, 768) values: [-0.01337681  0.00615428 -0.0455939   0.03379053 -0.01992556]\n",
+      "\n",
+      "bert/encoder/layer_9/attention/self/key/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [-0.0051302   0.0083288   0.00377641  0.00928865 -0.00418182]\n",
+      "TF: shape: (768,) values: [-0.0051302   0.0083288   0.00377641  0.00928865 -0.00418182]\n",
+      "\n",
+      "bert/encoder/layer_9/attention/self/value/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 768) values: [-0.02485976 -0.0301923   0.00984638 -0.02495162  0.01074037]\n",
+      "TF: shape: (768, 768) values: [-0.02485976 -0.0301923   0.00984638 -0.02495162  0.01074037]\n",
+      "\n",
+      "bert/encoder/layer_9/attention/self/value/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [-0.04229928 -0.02636711  0.0060447   0.00222829  0.04979481]\n",
+      "TF: shape: (768,) values: [-0.04229928 -0.02636711  0.0060447   0.00222829  0.04979481]\n",
+      "\n",
+      "bert/encoder/layer_9/attention/output/dense/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 768) values: [-0.01258144  0.00871274  0.00482882 -0.00675888 -0.04390825]\n",
+      "TF: shape: (768, 768) values: [-0.01258144  0.00871274  0.00482882 -0.00675888 -0.04390825]\n",
+      "\n",
+      "bert/encoder/layer_9/attention/output/dense/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [ 0.02457753  0.05051134 -0.06890804 -0.00962795  0.00864793]\n",
+      "TF: shape: (768,) values: [ 0.02457753  0.05051134 -0.06890804 -0.00962795  0.00864793]\n",
+      "\n",
+      "bert/encoder/layer_9/attention/output/LayerNorm/beta\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [-0.08963391 -0.06362236  0.0676669  -0.09895685  0.08318913]\n",
+      "TF: shape: (768,) values: [-0.08963391 -0.06362236  0.0676669  -0.09895685  0.08318913]\n",
+      "\n",
+      "bert/encoder/layer_9/attention/output/LayerNorm/gamma\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [0.85100883 0.82569736 0.7927931  0.7660444  0.8912934 ]\n",
+      "TF: shape: (768,) values: [0.85100883 0.82569736 0.7927931  0.7660444  0.8912934 ]\n",
+      "\n",
+      "bert/encoder/layer_9/intermediate/dense/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 3072) values: [ 0.06290598  0.0203122  -0.05384256  0.05442941  0.00484769]\n",
+      "TF: shape: (768, 3072) values: [ 0.06290598  0.0203122  -0.05384256  0.05442941  0.00484769]\n",
+      "\n",
+      "bert/encoder/layer_9/intermediate/dense/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (3072,) values: [-0.10818483 -0.00169527 -0.08962701 -0.10280421 -0.14310956]\n",
+      "TF: shape: (3072,) values: [-0.10818483 -0.00169527 -0.08962701 -0.10280421 -0.14310956]\n",
+      "\n",
+      "bert/encoder/layer_9/output/dense/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (3072, 768) values: [ 0.05487705  0.01644666  0.00436198 -0.00490768 -0.03238423]\n",
+      "TF: shape: (3072, 768) values: [ 0.05487705  0.01644666  0.00436198 -0.00490768 -0.03238423]\n",
+      "\n",
+      "bert/encoder/layer_9/output/dense/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [-0.08755219 -0.01910074 -0.02988298 -0.08150438  0.09897955]\n",
+      "TF: shape: (768,) values: [-0.08755219 -0.01910074 -0.02988298 -0.08150438  0.09897955]\n",
+      "\n",
+      "bert/encoder/layer_9/output/LayerNorm/beta\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [-0.04136161 -0.02113917 -0.07581077 -0.00809791 -0.09790538]\n",
+      "TF: shape: (768,) values: [-0.04136161 -0.02113917 -0.07581077 -0.00809791 -0.09790538]\n",
+      "\n",
+      "bert/encoder/layer_9/output/LayerNorm/gamma\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [0.8250572  0.83477134 0.7794141  0.81264955 0.7827918 ]\n",
+      "TF: shape: (768,) values: [0.8250572  0.83477134 0.7794141  0.81264955 0.7827918 ]\n",
+      "\n",
+      "bert/encoder/layer_10/attention/self/query/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 768) values: [ 0.00071212 -0.00853064  0.01776993  0.03189976  0.02183623]\n",
+      "TF: shape: (768, 768) values: [ 0.00071212 -0.00853064  0.01776993  0.03189976  0.02183623]\n",
+      "\n",
+      "bert/encoder/layer_10/attention/self/query/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [-0.03667567 -0.01449654 -0.03822913  0.00118343 -0.05489838]\n",
+      "TF: shape: (768,) values: [-0.03667567 -0.01449654 -0.03822913  0.00118343 -0.05489838]\n",
+      "\n",
+      "bert/encoder/layer_10/attention/self/key/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 768) values: [-0.0494106   0.05531096 -0.02459413 -0.06019118 -0.02829785]\n",
+      "TF: shape: (768, 768) values: [-0.0494106   0.05531096 -0.02459413 -0.06019118 -0.02829785]\n",
+      "\n",
+      "bert/encoder/layer_10/attention/self/key/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [-0.00692997  0.00855893  0.00670777 -0.0052475  -0.00017074]\n",
+      "TF: shape: (768,) values: [-0.00692997  0.00855893  0.00670777 -0.0052475  -0.00017074]\n",
+      "\n",
+      "bert/encoder/layer_10/attention/self/value/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 768) values: [ 0.01911842  0.04858809 -0.02608485  0.00794924 -0.02246636]\n",
+      "TF: shape: (768, 768) values: [ 0.01911842  0.04858809 -0.02608485  0.00794924 -0.02246636]\n",
+      "\n",
+      "bert/encoder/layer_10/attention/self/value/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [-0.0133503  -0.01224133 -0.0051834  -0.00232528  0.00148614]\n",
+      "TF: shape: (768,) values: [-0.0133503  -0.01224133 -0.0051834  -0.00232528  0.00148614]\n",
+      "\n",
+      "bert/encoder/layer_10/attention/output/dense/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 768) values: [-0.05904732  0.02616     0.00794104 -0.02889086 -0.03692576]\n",
+      "TF: shape: (768, 768) values: [-0.05904732  0.02616     0.00794104 -0.02889086 -0.03692576]\n",
+      "\n",
+      "bert/encoder/layer_10/attention/output/dense/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [0.02089205 0.01458059 0.05217785 0.0324267  0.00907548]\n",
+      "TF: shape: (768,) values: [0.02089205 0.01458059 0.05217785 0.0324267  0.00907548]\n",
+      "\n",
+      "bert/encoder/layer_10/attention/output/LayerNorm/beta\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [-0.10986238 -0.04332284  0.02603893 -0.06236923  0.14469369]\n",
+      "TF: shape: (768,) values: [-0.10986238 -0.04332284  0.02603893 -0.06236923  0.14469369]\n",
+      "\n",
+      "bert/encoder/layer_10/attention/output/LayerNorm/gamma\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [0.8515822  0.81392974 0.836747   0.78040504 0.88091415]\n",
+      "TF: shape: (768,) values: [0.8515822  0.81392974 0.836747   0.78040504 0.88091415]\n",
+      "\n",
+      "bert/encoder/layer_10/intermediate/dense/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 3072) values: [-0.07061081  0.06997397  0.01433633  0.04150929  0.02865192]\n",
+      "TF: shape: (768, 3072) values: [-0.07061081  0.06997397  0.01433633  0.04150929  0.02865192]\n",
+      "\n",
+      "bert/encoder/layer_10/intermediate/dense/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (3072,) values: [-0.13879126 -0.06401426 -0.1408043  -0.15043251 -0.10193057]\n",
+      "TF: shape: (3072,) values: [-0.13879126 -0.06401426 -0.1408043  -0.15043251 -0.10193057]\n",
+      "\n",
+      "bert/encoder/layer_10/output/dense/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (3072, 768) values: [ 0.02918765  0.02609882 -0.02259856  0.01636725 -0.00038442]\n",
+      "TF: shape: (3072, 768) values: [ 0.02918765  0.02609882 -0.02259856  0.01636725 -0.00038442]\n",
+      "\n",
+      "bert/encoder/layer_10/output/dense/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [-0.01799502  0.10970547 -0.02384165 -0.03350981  0.10491351]\n",
+      "TF: shape: (768,) values: [-0.01799502  0.10970547 -0.02384165 -0.03350981  0.10491351]\n",
+      "\n",
+      "bert/encoder/layer_10/output/LayerNorm/beta\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [ 0.00999107 -0.0217309  -0.0854177  -0.01109101 -0.07902174]\n",
+      "TF: shape: (768,) values: [ 0.00999107 -0.0217309  -0.0854177  -0.01109101 -0.07902174]\n",
+      "\n",
+      "bert/encoder/layer_10/output/LayerNorm/gamma\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [0.8272796  0.8597452  0.79116803 0.81267637 0.8273501 ]\n",
+      "TF: shape: (768,) values: [0.8272796  0.8597452  0.79116803 0.81267637 0.8273501 ]\n",
+      "\n",
+      "bert/encoder/layer_11/attention/self/query/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 768) values: [-0.04141425 -0.06491017 -0.03202523  0.06226195  0.02193764]\n",
+      "TF: shape: (768, 768) values: [-0.04141425 -0.06491017 -0.03202523  0.06226195  0.02193764]\n",
+      "\n",
+      "bert/encoder/layer_11/attention/self/query/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [ 0.0501296   0.11886728  0.2186807   0.08720991 -0.20476632]\n",
+      "TF: shape: (768,) values: [ 0.0501296   0.11886728  0.2186807   0.08720991 -0.20476632]\n",
+      "\n",
+      "bert/encoder/layer_11/attention/self/key/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 768) values: [ 0.02634268 -0.01357682 -0.06076496  0.04210597  0.01783857]\n",
+      "TF: shape: (768, 768) values: [ 0.02634268 -0.01357682 -0.06076496  0.04210597  0.01783857]\n",
+      "\n",
+      "bert/encoder/layer_11/attention/self/key/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [-0.0007798  -0.00065806 -0.00010521  0.00119144 -0.00180091]\n",
+      "TF: shape: (768,) values: [-0.0007798  -0.00065806 -0.00010521  0.00119144 -0.00180091]\n",
+      "\n",
+      "bert/encoder/layer_11/attention/self/value/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 768) values: [ 0.03520973 -0.00678078 -0.02883583 -0.01011515  0.04519828]\n",
+      "TF: shape: (768, 768) values: [ 0.03520973 -0.00678078 -0.02883583 -0.01011515  0.04519828]\n",
+      "\n",
+      "bert/encoder/layer_11/attention/self/value/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [ 0.01502306 -0.00530942  0.00023572  0.00205218 -0.00578036]\n",
+      "TF: shape: (768,) values: [ 0.01502306 -0.00530942  0.00023572  0.00205218 -0.00578036]\n",
+      "\n",
+      "bert/encoder/layer_11/attention/output/dense/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 768) values: [ 0.02361419  0.03112707 -0.00063031  0.04209773 -0.02434015]\n",
+      "TF: shape: (768, 768) values: [ 0.02361419  0.03112707 -0.00063031  0.04209773 -0.02434015]\n",
+      "\n",
+      "bert/encoder/layer_11/attention/output/dense/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [ 0.02566087  0.0028438  -0.00475678  0.02149458 -0.01755187]\n",
+      "TF: shape: (768,) values: [ 0.02566087  0.0028438  -0.00475678  0.02149458 -0.01755187]\n",
+      "\n",
+      "bert/encoder/layer_11/attention/output/LayerNorm/beta\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [-0.03134411  0.01207957 -0.04636396 -0.03013046  0.07944281]\n",
+      "TF: shape: (768,) values: [-0.03134411  0.01207957 -0.04636396 -0.03013046  0.07944281]\n",
+      "\n",
+      "bert/encoder/layer_11/attention/output/LayerNorm/gamma\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [0.85203767 0.8020145  0.8554237  0.8150477  0.8441815 ]\n",
+      "TF: shape: (768,) values: [0.85203767 0.8020145  0.8554237  0.8150477  0.8441815 ]\n",
+      "\n",
+      "bert/encoder/layer_11/intermediate/dense/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 3072) values: [ 0.05871898 -0.01124212  0.00206979 -0.04366514 -0.00716808]\n",
+      "TF: shape: (768, 3072) values: [ 0.05871898 -0.01124212  0.00206979 -0.04366514 -0.00716808]\n",
+      "\n",
+      "bert/encoder/layer_11/intermediate/dense/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (3072,) values: [-0.09762521 -0.06175711 -0.05153917 -0.08580919 -0.09734315]\n",
+      "TF: shape: (3072,) values: [-0.09762521 -0.06175711 -0.05153917 -0.08580919 -0.09734315]\n",
+      "\n",
+      "bert/encoder/layer_11/output/dense/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (3072, 768) values: [-0.022382    0.01073206 -0.01357213  0.02484621  0.01403091]\n",
+      "TF: shape: (3072, 768) values: [-0.022382    0.01073206 -0.01357213  0.02484621  0.01403091]\n",
+      "\n",
+      "bert/encoder/layer_11/output/dense/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [-0.06574099  0.04207807  0.01201084  0.00229322  0.05551811]\n",
+      "TF: shape: (768,) values: [-0.06574099  0.04207807  0.01201084  0.00229322  0.05551811]\n",
+      "\n",
+      "bert/encoder/layer_11/output/LayerNorm/beta\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [-0.00634605 -0.01989403  0.04628465  0.01585056 -0.04256899]\n",
+      "TF: shape: (768,) values: [-0.00634605 -0.01989403  0.04628465  0.01585056 -0.04256899]\n",
+      "\n",
+      "bert/encoder/layer_11/output/LayerNorm/gamma\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [0.6384234  0.6300364  0.66570055 0.6126921  0.63756436]\n",
+      "TF: shape: (768,) values: [0.6384234  0.6300364  0.66570055 0.6126921  0.63756436]\n",
+      "\n",
+      "bert/pooler/dense/kernel\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768, 768) values: [-0.00127425  0.00199868 -0.03863145 -0.00139355  0.00691627]\n",
+      "TF: shape: (768, 768) values: [-0.00127425  0.00199868 -0.03863145 -0.00139355  0.00691627]\n",
+      "\n",
+      "bert/pooler/dense/bias\n",
+      "|sum(pt_wts - tf_wts)| = 0.0\n",
+      "PT: shape: (768,) values: [-0.03597581 -0.00389536  0.05181352  0.02224747 -0.00493723]\n",
+      "TF: shape: (768,) values: [-0.03597581 -0.00389536  0.05181352  0.02224747 -0.00493723]\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "tensors_to_transpose = (\n",
+    "    \"dense.weight\",\n",
+    "    \"attention.self.query\",\n",
+    "    \"attention.self.key\",\n",
+    "    \"attention.self.value\"\n",
+    ")\n",
+    "var_map = (\n",
+    "    ('layer.', 'layer_'),\n",
+    "    ('word_embeddings.weight', 'word_embeddings'),\n",
+    "    ('position_embeddings.weight', 'position_embeddings'),\n",
+    "    ('token_type_embeddings.weight', 'token_type_embeddings'),\n",
+    "    ('.', '/'),\n",
+    "    ('LayerNorm/weight', 'LayerNorm/gamma'),\n",
+    "    ('LayerNorm/bias', 'LayerNorm/beta'),\n",
+    "    ('weight', 'kernel')\n",
+    ")\n",
+    "# Map a PyTorch state_dict key to its TF variable name; var_map is applied in order\n",
+    "def to_tf_var_name(name:str):\n",
+    "    for patt, repl in var_map:\n",
+    "        name = name.replace(patt, repl)\n",
+    "    return 'bert/{}'.format(name)\n",
+    "\n",
+    "tf_vars = {v.name: session.run(fetches=v) for v in tf.global_variables()}\n",
+    "pt_vars = {}\n",
+    "for v, T in pt_model.state_dict().items():\n",
+    "    T = T.detach().numpy()\n",
+    "    if any(x in v for x in tensors_to_transpose):\n",
+    "        T = T.T\n",
+    "    pt_vars[to_tf_var_name(v)] = T\n",
+    "\n",
+    "for var_name in tf_vars:\n",
+    "    # Remove only the trailing \":0\" suffix; str.strip(\":0\") would strip a character set\n",
+    "    name = var_name.rsplit(\":\", 1)[0]\n",
+    "    pt_wts = pt_vars[name]\n",
+    "    tf_wts = tf_vars[var_name]  # not named `tf`: that would shadow the tensorflow module\n",
+    "    print(name)\n",
+    "    \n",
+    "    # Report the summed difference, then assert exact equality (a signed sum can cancel)\n",
+    "    print(\"|sum(pt_wts - tf_wts)| = {}\".format(\n",
+    "        np.abs(np.sum(pt_wts - tf_wts, keepdims=False))\n",
+    "    ))\n",
+    "    assert np.array_equal(pt_wts, tf_wts)\n",
+    "    \n",
+    "    if len(pt_wts.shape) == 2:\n",
+    "        print(\"PT: shape: {0} values: {1}\".format(pt_wts.shape, pt_wts[0, :5]))\n",
+    "        print(\"TF: shape: {0} values: {1}\".format(tf_wts.shape, tf_wts[0, :5]))\n",
+    "    else:\n",
+    "        print(\"PT: shape: {0} values: {1}\".format(pt_wts.shape, pt_wts[:5]))\n",
+    "        print(\"TF: shape: {0} values: {1}\".format(tf_wts.shape, tf_wts[:5]))\n",
+    "    print()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Compare Layer-12 Projections"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "MSE: 2.7155439966009e-05\n",
+      "PT-values: [-0.876663   -0.41088238 -0.12200808  0.44941     0.19445966]\n",
+      "TF-values: [-0.8742865  -0.40621698 -0.10585472  0.444904    0.1825743 ]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Mean squared error (MSE) between the last-layer (layer-12) projections of the PyTorch and TF models\n",
+    "MSE = np.mean((pt_embedding - tf_embedding) ** 2, keepdims=False)\n",
+    "print(\"MSE: {}\".format(MSE))\n",
+    "print(\"PT-values: {}\".format(pt_embedding[0, :5]))\n",
+    "print(\"TF-values: {}\".format(tf_embedding[0, :5]))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "nlp",
+   "language": "python",
+   "name": "nlp"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

From 716cc1c4d9c59bcce1cb13ba395c7d7bfb0df6a5 Mon Sep 17 00:00:00 2001
From: chrislarson1 <cl966@cornell.edu>
Date: Wed, 19 Jun 2019 23:18:57 -0400
Subject: [PATCH 13/13] added main() for programmatic call to convert
 pytorch->tf

---
 .../convert_pytorch_checkpoint_to_tf.py       | 55 +++++++++++--------
 1 file changed, 32 insertions(+), 23 deletions(-)

diff --git a/pytorch_pretrained_bert/convert_pytorch_checkpoint_to_tf.py b/pytorch_pretrained_bert/convert_pytorch_checkpoint_to_tf.py
index a9bfdaa45c..b8858ee3dc 100644
--- a/pytorch_pretrained_bert/convert_pytorch_checkpoint_to_tf.py
+++ b/pytorch_pretrained_bert/convert_pytorch_checkpoint_to_tf.py
@@ -17,16 +17,18 @@
 
 import os
 import argparse
+import torch
 import numpy as np
 import tensorflow as tf
-from pytorch_pretrained_bert.modeling import BertConfig, BertModel
+from pytorch_pretrained_bert.modeling import BertModel
 
 
-def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str):
+def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str, model_name:str):
 
     """
     :param model:BertModel Pytorch model instance to be converted
-    :param ckpt_dir: directory to save Tensorflow model
+    :param ckpt_dir: Tensorflow model directory
+    :param model_name: model name
     :return:
 
     Currently supported HF models:
@@ -87,35 +89,42 @@ def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str):
         print("{0}{1}initialized".format(tf_name, " " * (60 - len(tf_name))))
 
     saver = tf.train.Saver(tf_vars)
-    saver.save(session, os.path.join(ckpt_dir, args.pytorch_model_name))
+    saver.save(session, os.path.join(ckpt_dir, model_name.replace("-", "_") + ".ckpt"))
 
 
-if __name__ == "__main__":
-
+def main(raw_args=None):
     parser = argparse.ArgumentParser()
-    parser.add_argument("--pytorch_model_dir",
-                        default=None,
+    parser.add_argument("--model_name",
                         type=str,
+                        required=True,
+                        help="model name e.g. bert-base-uncased")
+    parser.add_argument("--cache_dir",
+                        type=str,
+                        default=None,
                         required=False,
                         help="Directory containing pytorch model")
-    parser.add_argument("--pytorch_model_name",
-                        default=None,
+    parser.add_argument("--pytorch_model_path",
                         type=str,
                         required=True,
-                        help="model name (e.g. bert-base-uncased)")
-    parser.add_argument("--config_file_path",
-                        default=None,
-                        type=str,
-                        required=True,
-                        help="Path to bert config file")
-    parser.add_argument("--tf_checkpoint_dir",
-                        default="",
+                        help="/path/to/<pytorch-model-name>.bin")
+    parser.add_argument("--tf_cache_dir",
                         type=str,
                         required=True,
                         help="Directory in which to save tensorflow model")
-    args = parser.parse_args()
+    args = parser.parse_args(raw_args)
+    # Load onto CPU so checkpoints saved from a GPU also load on CPU-only machines
+    model = BertModel.from_pretrained(
+        pretrained_model_name_or_path=args.model_name,
+        state_dict=torch.load(args.pytorch_model_path, map_location="cpu"),
+        cache_dir=args.cache_dir
+    )
+    # Write the TF checkpoint to <tf_cache_dir>/<model_name with "-" replaced by "_">.ckpt
+    convert_pytorch_checkpoint_to_tf(
+        model=model,
+        ckpt_dir=args.tf_cache_dir,
+        model_name=args.model_name
+    )
 
-    model = BertModel(
-        config=BertConfig(args.config_file_path)
-    ).from_pretrained(args.pytorch_model_name, cache_dir=args.pytorch_model_dir)
-    convert_pytorch_checkpoint_to_tf(model=model, ckpt_dir=args.tf_checkpoint_dir)
+
+if __name__ == "__main__":
+    main()