From 96c2b77f0f48cce12364d353c6bfd15a7ce002b4 Mon Sep 17 00:00:00 2001 From: Chris Date: Thu, 2 May 2019 13:14:25 -0400 Subject: [PATCH 01/13] added file to convert pytorch->tf --- .../convert_hf_checkpoint_to_tf.py | 153 ++++++++++++++++++ 1 file changed, 153 insertions(+) create mode 100644 pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py diff --git a/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py b/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py new file mode 100644 index 0000000000..98e497f6f5 --- /dev/null +++ b/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py @@ -0,0 +1,153 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Convert Huggingface Pytorch checkpoint to Tensorflow checkpoint.""" + +import os +import argparse +import numpy as np +from pytorch_pretrained_bert.modeling import BertConfig, BertModel + + +# def __get_var_names(config): +# +# models = { +# 'BertModel': BertModel(config), +# 'BertForMaskedLM': BertForMaskedLM(config), +# 'BertForPreTraining': BertForPreTraining(config), +# 'BertForMultipleChoice': BertForMultipleChoice(config, num_choices=100), +# 'BertForNextSentencePrediction': BertForNextSentencePrediction(config), +# 'BertForSequenceClassification': BertForSequenceClassification(config, num_labels=100), +# 'BertForQuestionAnswering': BertForQuestionAnswering(config) +# } +# +# for name, model in models.items(): +# state_dict = model.state_dict() +# torch_vars = [] +# for var_ in state_dict: +# torch_vars.append(var_ + ', ' + str(tuple(state_dict[var_].shape))) +# json.dump(torch_vars, fp=open('torch_var_names_{}.json'.format(name), 'w'), indent=3) + + + +def convert_hf_checkpoint_to_tf(model:BertModel, ckpt_dir:str): + + """ + :param model:BertModel Pytorch model instance to be converted + :param ckpt_dir: directory to save Tensorflow model + + Supported HF models: + Y BertModel + N BertForMaskedLM + N BertForPreTraining + N BertForMultipleChoice + N BertForNextSentencePrediction + N BertForSequenceClassification + N BertForQuestionAnswering + + Note: + TF isn't & shouldn't be a package-level requirement; this + feature is requested enough to warrant a local import. + """ + + import tensorflow as tf + + if not os.path.isdir(ckpt_dir): + os.makedirs(ckpt_dir) + + session = tf.Session() + + state_dict = model.state_dict() + + tf_vars = [] + + def to_tf_var_name(name:str): + + """todo: compile as regex""" + + name = name.replace('layer.', 'layer_') + name = name.replace('word_embeddings.weight', 'word_embeddings') + name = name.replace('position_embeddings.weight', 'position_embeddings') + name = name.replace('token_type_embeddings.weight', 'token_type_embeddings') + name = name.replace('.', '/') + name = name.replace('LayerNorm/weight', 'LayerNorm/gamma') + name = name.replace('LayerNorm/bias', 'LayerNorm/beta') + name = name.replace('weight', 'kernel') + return 'bert/{}'.format(name) + + def assign_tf_var(tensor:np.ndarray, name:str): + tmp_var = tf.Variable(initial_value=tensor) + tf_var = tf.get_variable(dtype=tmp_var.dtype, shape=tmp_var.shape, name=name) + op = tf.assign(ref=tf_var, value=tmp_var) + session.run(tf.variables_initializer([tmp_var, tf_var])) + session.run(fetches=[op, tf_var]) + return tf_var + + for var_name in state_dict: + + tf_name = to_tf_var_name(var_name) + torch_tensor = state_dict[var_name].numpy() + + if var_name.endswith('dense.weight'): + torch_tensor = torch_tensor.T + + tf_tensor = assign_tf_var(tensor=torch_tensor, name=tf_name) + + tf_vars.append(tf_tensor) + + print("{0}{1}initialized".format(tf_name, " " * (60 - len(tf_name)))) + + saver = tf.train.Saver(tf_vars) + saver.save(session, os.path.join(ckpt_dir, 'model')) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--pretrained_model_name_or_path", + default=None, + type=str, + required=True, + help="pretrained_model_name_or_path: either: \ + - a str with the name of a pre-trained model to load selected in the list of: \ + . `bert-base-uncased` \ + . `bert-large-uncased` \ + . `bert-base-cased` \ + . `bert-large-cased` \ + . `bert-base-multilingual-uncased` \ + . `bert-base-multilingual-cased` \ + . `bert-base-chinese` \ + - a path or url to a pretrained model archive containing: \ + . `bert_config.json` a configuration file for the model \ + . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance \ + - a path or url to a pretrained model archive containing: \ + . `bert_config.json` a configuration file for the model \ + . `model.ckpt` a TensorFlow checkpoint") + parser.add_argument("--config_file_path", + default=None, + type=str, + required=True, + help="Path to bert config file.") + parser.add_argument("--cache_dir", + default=None, + type=str, + required=True, + help="path to a folder in which the TF model will be cached.") + args = parser.parse_args() + + model = BertModel( + config=BertConfig(args.config_file_path) + ).from_pretrained(args.pretrained_model_name_or_path) + + convert_hf_checkpoint_to_tf(model=model, ckpt_dir=args.cache_) From 968c1b44cbaa36c17f6a1d453c10f125ffce64eb Mon Sep 17 00:00:00 2001 From: Chris Date: Thu, 2 May 2019 13:19:56 -0400 Subject: [PATCH 02/13] added file to convert pytorch->tf --- .../convert_hf_checkpoint_to_tf.py | 23 +------------------ 1 file changed, 1 insertion(+), 22 deletions(-) diff --git a/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py b/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py index 98e497f6f5..73c1f6587c 100644 --- a/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py +++ b/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py @@ -21,26 +21,6 @@ import numpy as np from pytorch_pretrained_bert.modeling import BertConfig, BertModel -# def __get_var_names(config): -# -# models = { -# 'BertModel': BertModel(config), -# 'BertForMaskedLM': BertForMaskedLM(config), -# 'BertForPreTraining': BertForPreTraining(config), -# 'BertForMultipleChoice': BertForMultipleChoice(config, num_choices=100), -# 'BertForNextSentencePrediction': BertForNextSentencePrediction(config), -# 'BertForSequenceClassification': BertForSequenceClassification(config, num_labels=100), -# 'BertForQuestionAnswering': BertForQuestionAnswering(config) -# } -# -# for name, model in models.items(): -# state_dict = model.state_dict() -# torch_vars = [] -# for var_ in state_dict: -# torch_vars.append(var_ + ', ' + str(tuple(state_dict[var_].shape))) -# json.dump(torch_vars, fp=open('torch_var_names_{}.json'.format(name), 'w'), indent=3) - - def convert_hf_checkpoint_to_tf(model:BertModel, ckpt_dir:str): @@ -58,8 +38,7 @@ def convert_hf_checkpoint_to_tf(model:BertModel, ckpt_dir:str): N BertForQuestionAnswering Note: - TF isn't & shouldn't be a package-level requirement; this - feature is requested enough to warrant a local import. + To keep TF out of package-level requirements, tf is imported locally. """ import tensorflow as tf From 0a8b4d65beed45d167735a3ecf8ee5d4a5d1b2a3 Mon Sep 17 00:00:00 2001 From: Chris Date: Thu, 2 May 2019 13:20:59 -0400 Subject: [PATCH 03/13] added file to convert pytorch->tf --- pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py b/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py index 73c1f6587c..a8f2e3f8d0 100644 --- a/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py +++ b/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py @@ -21,7 +21,6 @@ import numpy as np from pytorch_pretrained_bert.modeling import BertConfig, BertModel - def convert_hf_checkpoint_to_tf(model:BertModel, ckpt_dir:str): """ @@ -129,4 +128,4 @@ if __name__ == "__main__": config=BertConfig(args.config_file_path) ).from_pretrained(args.pretrained_model_name_or_path) - convert_hf_checkpoint_to_tf(model=model, ckpt_dir=args.cache_) + convert_hf_checkpoint_to_tf(model=model, ckpt_dir=args.cache_) \ No newline at end of file From 41089bc7d339b30ca0542b3ed4096d37b7a6eec6 Mon Sep 17 00:00:00 2001 From: Chris Date: Thu, 2 May 2019 13:26:22 -0400 Subject: [PATCH 04/13] added file to convert pytorch->tf --- pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py b/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py index a8f2e3f8d0..44c860da15 100644 --- a/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py +++ b/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py @@ -121,11 +121,11 @@ if __name__ == "__main__": default=None, type=str, required=True, - help="path to a folder in which the TF model will be cached.") + help="Path to a folder in which the TF model will be cached.") args = parser.parse_args() model = BertModel( config=BertConfig(args.config_file_path) ).from_pretrained(args.pretrained_model_name_or_path) - convert_hf_checkpoint_to_tf(model=model, ckpt_dir=args.cache_) \ No newline at end of file + convert_hf_checkpoint_to_tf(model=model, ckpt_dir=args.cache_dir) \ No newline at end of file From 2bcda8d00c672ba402d8bc8a2b1a7e9079fac0e3 Mon Sep 17 00:00:00 2001 From: Chris Date: Sat, 18 May 2019 15:55:11 -0400 Subject: [PATCH 05/13] update --- pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py b/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py index 44c860da15..8673c94196 100644 --- a/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py +++ b/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py @@ -37,7 +37,7 @@ def convert_hf_checkpoint_to_tf(model:BertModel, ckpt_dir:str): N BertForQuestionAnswering Note: - To keep TF out of package-level requirements, tf is imported locally. + To keep tf out of package-level requirements, it's imported locally. """ import tensorflow as tf @@ -52,9 +52,7 @@ def convert_hf_checkpoint_to_tf(model:BertModel, ckpt_dir:str): tf_vars = [] def to_tf_var_name(name:str): - """todo: compile as regex""" - name = name.replace('layer.', 'layer_') name = name.replace('word_embeddings.weight', 'word_embeddings') name = name.replace('position_embeddings.weight', 'position_embeddings') @@ -74,17 +72,12 @@ def convert_hf_checkpoint_to_tf(model:BertModel, ckpt_dir:str): return tf_var for var_name in state_dict: - tf_name = to_tf_var_name(var_name) torch_tensor = state_dict[var_name].numpy() - if var_name.endswith('dense.weight'): torch_tensor = torch_tensor.T - tf_tensor = assign_tf_var(tensor=torch_tensor, name=tf_name) - tf_vars.append(tf_tensor) - print("{0}{1}initialized".format(tf_name, " " * (60 - len(tf_name)))) saver = tf.train.Saver(tf_vars) From f1433db4f16f8f485bd1352d581872d2fc4a0cc0 Mon Sep 17 00:00:00 2001 From: Chris Date: Sat, 18 May 2019 17:09:08 -0400 Subject: [PATCH 06/13] update to hf->tf args --- .../convert_hf_checkpoint_to_tf.py | 53 +++++++------------ 1 file changed, 20 insertions(+), 33 deletions(-) diff --git a/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py b/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py index 8673c94196..16b95f1454 100644 --- a/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py +++ b/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py @@ -18,16 +18,18 @@ import os import argparse import numpy as np +import tensorflow as tf from pytorch_pretrained_bert.modeling import BertConfig, BertModel -def convert_hf_checkpoint_to_tf(model:BertModel, ckpt_dir:str): +def convert_hf_checkpoint_to_tf(model:type(BertModel), ckpt_dir:str): """ :param model:BertModel Pytorch model instance to be converted :param ckpt_dir: directory to save Tensorflow model + :return: - Supported HF models: + Currently supported HF models: Y BertModel N BertForMaskedLM N BertForPreTraining @@ -35,20 +37,13 @@ def convert_hf_checkpoint_to_tf(model:BertModel, ckpt_dir:str): N BertForNextSentencePrediction N BertForSequenceClassification N BertForQuestionAnswering - - Note: - To keep tf out of package-level requirements, it's imported locally. """ - import tensorflow as tf - if not os.path.isdir(ckpt_dir): os.makedirs(ckpt_dir) session = tf.Session() - state_dict = model.state_dict() - tf_vars = [] def to_tf_var_name(name:str): @@ -61,6 +56,7 @@ def convert_hf_checkpoint_to_tf(model:BertModel, ckpt_dir:str): name = name.replace('LayerNorm/weight', 'LayerNorm/gamma') name = name.replace('LayerNorm/bias', 'LayerNorm/beta') name = name.replace('weight', 'kernel') + # name += ':0' return 'bert/{}'.format(name) def assign_tf_var(tensor:np.ndarray, name:str): @@ -81,44 +77,35 @@ def convert_hf_checkpoint_to_tf(model:BertModel, ckpt_dir:str): print("{0}{1}initialized".format(tf_name, " " * (60 - len(tf_name)))) saver = tf.train.Saver(tf_vars) - saver.save(session, os.path.join(ckpt_dir, 'model')) + saver.save(session, os.path.join(ckpt_dir, args.pytorch_model_name)) if __name__ == "__main__": + parser = argparse.ArgumentParser() - parser.add_argument("--pretrained_model_name_or_path", + parser.add_argument("--pytorch_model_dir", default=None, type=str, required=True, - help="pretrained_model_name_or_path: either: \ - - a str with the name of a pre-trained model to load selected in the list of: \ - . `bert-base-uncased` \ - . `bert-large-uncased` \ - . `bert-base-cased` \ - . `bert-large-cased` \ - . `bert-base-multilingual-uncased` \ - . `bert-base-multilingual-cased` \ - . `bert-base-chinese` \ - - a path or url to a pretrained model archive containing: \ - . `bert_config.json` a configuration file for the model \ - . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance \ - - a path or url to a pretrained model archive containing: \ - . `bert_config.json` a configuration file for the model \ - . `model.ckpt` a TensorFlow checkpoint") + help="Directory containing pytorch model") + parser.add_argument("--pytorch_model_name", + default=None, + type=str, + required=True, + help="model name (e.g. bert-base-uncased)") parser.add_argument("--config_file_path", default=None, type=str, required=True, - help="Path to bert config file.") - parser.add_argument("--cache_dir", - default=None, + help="Path to bert config file") + parser.add_argument("--tf_checkpoint_dir", + default="", type=str, required=True, - help="Path to a folder in which the TF model will be cached.") + help="Directory in which to save tensorflow model") args = parser.parse_args() model = BertModel( config=BertConfig(args.config_file_path) - ).from_pretrained(args.pretrained_model_name_or_path) - - convert_hf_checkpoint_to_tf(model=model, ckpt_dir=args.cache_dir) \ No newline at end of file + ).from_pretrained(args.pytorch_model_name, cache_dir=args.pytorch_model_dir) + convert_hf_checkpoint_to_tf(model=model, ckpt_dir=args.tf_checkpoint_dir) \ No newline at end of file From 69749f3fc330f954b31a47f51a177c80064aaa01 Mon Sep 17 00:00:00 2001 From: Chris Date: Sat, 18 May 2019 17:16:01 -0400 Subject: [PATCH 07/13] update to hf->tf args --- pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py b/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py index 16b95f1454..41327de891 100644 --- a/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py +++ b/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py @@ -108,4 +108,4 @@ if __name__ == "__main__": model = BertModel( config=BertConfig(args.config_file_path) ).from_pretrained(args.pytorch_model_name, cache_dir=args.pytorch_model_dir) - convert_hf_checkpoint_to_tf(model=model, ckpt_dir=args.tf_checkpoint_dir) \ No newline at end of file + convert_hf_checkpoint_to_tf(model=model, ckpt_dir=args.tf_checkpoint_dir) From a309459b92348f2a61458a464cf5eec3dd0994bc Mon Sep 17 00:00:00 2001 From: Chris Date: Wed, 22 May 2019 20:17:27 -0400 Subject: [PATCH 08/13] fn change; pytorch_model_dir required=False --- ...checkpoint_to_tf.py => convert_pytorch_checkpoint_to_tf.py} | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) rename pytorch_pretrained_bert/{convert_hf_checkpoint_to_tf.py => convert_pytorch_checkpoint_to_tf.py} (98%) diff --git a/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py b/pytorch_pretrained_bert/convert_pytorch_checkpoint_to_tf.py similarity index 98% rename from pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py rename to pytorch_pretrained_bert/convert_pytorch_checkpoint_to_tf.py index 41327de891..870b5ee5db 100644 --- a/pytorch_pretrained_bert/convert_hf_checkpoint_to_tf.py +++ b/pytorch_pretrained_bert/convert_pytorch_checkpoint_to_tf.py @@ -56,7 +56,6 @@ def convert_hf_checkpoint_to_tf(model:type(BertModel), ckpt_dir:str): name = name.replace('LayerNorm/weight', 'LayerNorm/gamma') name = name.replace('LayerNorm/bias', 'LayerNorm/beta') name = name.replace('weight', 'kernel') - # name += ':0' return 'bert/{}'.format(name) def assign_tf_var(tensor:np.ndarray, name:str): @@ -86,7 +85,7 @@ if __name__ == "__main__": parser.add_argument("--pytorch_model_dir", default=None, type=str, - required=True, + required=False, help="Directory containing pytorch model") parser.add_argument("--pytorch_model_name", default=None, From d0adab2c39dc486c548c0b61ad8471e27e60bd36 Mon Sep 17 00:00:00 2001 From: Chris Date: Wed, 22 May 2019 20:24:04 -0400 Subject: [PATCH 09/13] fn change; pytorch_model_dir required=False --- pytorch_pretrained_bert/convert_pytorch_checkpoint_to_tf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_pretrained_bert/convert_pytorch_checkpoint_to_tf.py b/pytorch_pretrained_bert/convert_pytorch_checkpoint_to_tf.py index 870b5ee5db..a17d058664 100644 --- a/pytorch_pretrained_bert/convert_pytorch_checkpoint_to_tf.py +++ b/pytorch_pretrained_bert/convert_pytorch_checkpoint_to_tf.py @@ -22,7 +22,7 @@ import tensorflow as tf from pytorch_pretrained_bert.modeling import BertConfig, BertModel -def convert_hf_checkpoint_to_tf(model:type(BertModel), ckpt_dir:str): +def convert_pytorch_checkpoint_to_tf(model:type(BertModel), ckpt_dir:str): """ :param model:BertModel Pytorch model instance to be converted @@ -107,4 +107,4 @@ if __name__ == "__main__": model = BertModel( config=BertConfig(args.config_file_path) ).from_pretrained(args.pytorch_model_name, cache_dir=args.pytorch_model_dir) - convert_hf_checkpoint_to_tf(model=model, ckpt_dir=args.tf_checkpoint_dir) + convert_pytorch_checkpoint_to_tf(model=model, ckpt_dir=args.tf_checkpoint_dir) From 8de1faea6fe5df0477afedc2112ae19d3c6dc4ee Mon Sep 17 00:00:00 2001 From: Chris Date: Wed, 22 May 2019 20:38:16 -0400 Subject: [PATCH 10/13] update to hf->tf args --- pytorch_pretrained_bert/convert_pytorch_checkpoint_to_tf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_pretrained_bert/convert_pytorch_checkpoint_to_tf.py b/pytorch_pretrained_bert/convert_pytorch_checkpoint_to_tf.py index a17d058664..b845fa8530 100644 --- a/pytorch_pretrained_bert/convert_pytorch_checkpoint_to_tf.py +++ b/pytorch_pretrained_bert/convert_pytorch_checkpoint_to_tf.py @@ -22,7 +22,7 @@ import tensorflow as tf from pytorch_pretrained_bert.modeling import BertConfig, BertModel -def convert_pytorch_checkpoint_to_tf(model:type(BertModel), ckpt_dir:str): +def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str): """ :param model:BertModel Pytorch model instance to be converted From 314bc6bb4e4bd2ede16cd7c04b3b2a419611d190 Mon Sep 17 00:00:00 2001 From: Chris Date: Mon, 27 May 2019 09:47:59 -0400 Subject: [PATCH 11/13] added transposes to attention.self.[query,key,value] --- .../convert_pytorch_checkpoint_to_tf.py | 31 +++++++++++++------ 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/pytorch_pretrained_bert/convert_pytorch_checkpoint_to_tf.py b/pytorch_pretrained_bert/convert_pytorch_checkpoint_to_tf.py index b845fa8530..a9bfdaa45c 100644 --- a/pytorch_pretrained_bert/convert_pytorch_checkpoint_to_tf.py +++ b/pytorch_pretrained_bert/convert_pytorch_checkpoint_to_tf.py @@ -39,6 +39,24 @@ def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str): N BertForQuestionAnswering """ + tensors_to_transopse = ( + "dense.weight", + "attention.self.query", + "attention.self.key", + "attention.self.value" + ) + + var_map = ( + ('layer.', 'layer_'), + ('word_embeddings.weight', 'word_embeddings'), + ('position_embeddings.weight', 'position_embeddings'), + ('token_type_embeddings.weight', 'token_type_embeddings'), + ('.', '/'), + ('LayerNorm/weight', 'LayerNorm/gamma'), + ('LayerNorm/bias', 'LayerNorm/beta'), + ('weight', 'kernel') + ) + if not os.path.isdir(ckpt_dir): os.makedirs(ckpt_dir) @@ -47,15 +65,8 @@ def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str): tf_vars = [] def to_tf_var_name(name:str): - """todo: compile as regex""" - name = name.replace('layer.', 'layer_') - name = name.replace('word_embeddings.weight', 'word_embeddings') - name = name.replace('position_embeddings.weight', 'position_embeddings') - name = name.replace('token_type_embeddings.weight', 'token_type_embeddings') - name = name.replace('.', '/') - name = name.replace('LayerNorm/weight', 'LayerNorm/gamma') - name = name.replace('LayerNorm/bias', 'LayerNorm/beta') - name = name.replace('weight', 'kernel') + for patt, repl in iter(var_map): + name = name.replace(patt, repl) return 'bert/{}'.format(name) def assign_tf_var(tensor:np.ndarray, name:str): @@ -69,7 +80,7 @@ def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str): for var_name in state_dict: tf_name = to_tf_var_name(var_name) torch_tensor = state_dict[var_name].numpy() - if var_name.endswith('dense.weight'): + if any([x in var_name for x in tensors_to_transopse]): torch_tensor = torch_tensor.T tf_tensor = assign_tf_var(tensor=torch_tensor, name=tf_name) tf_vars.append(tf_tensor) From a8e071c6900b78ff14fbe14df75fd79ab86338fa Mon Sep 17 00:00:00 2001 From: chrislarson1 Date: Wed, 19 Jun 2019 23:08:08 -0400 Subject: [PATCH 12/13] added notebook to check correctness of the pytorch->tensorflow conversion --- notebooks/Comparing-PT-and-TF-models.ipynb | 1630 ++++++++++++++++++++ 1 file changed, 1630 insertions(+) create mode 100644 notebooks/Comparing-PT-and-TF-models.ipynb diff --git a/notebooks/Comparing-PT-and-TF-models.ipynb b/notebooks/Comparing-PT-and-TF-models.ipynb new file mode 100644 index 0000000000..321c2ebe30 --- /dev/null +++ b/notebooks/Comparing-PT-and-TF-models.ipynb @@ -0,0 +1,1630 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Pytorch to Tensorflow Conversion Test Notebook\n", + "\n", + "To run this notebook follow these steps, modifying the **Config** section as necessary:\n", + "\n", + "1. Point `pt_model_dir` to your local directory containing the pytorch Bert model to be converted.\n", + "2. Point `tf_bert_dir` to your clone of Google's Bert implementation which can be found here: https://github.com/google-research/bert.\n", + "\n", + "Note: \n", + "1. This feature currently only supports the base BERT models (uncased/cased).\n", + "2. Tensorflow model will be dumped in `tf_model_dir`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Config" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "\n", + "model_cls = 'BertModel'\n", + "model_typ = 'bert-base-uncased'\n", + "token_cls = 'BertTokenizer'\n", + "max_seq = 12\n", + "CLS = \"[CLS]\"\n", + "SEP = \"[SEP]\"\n", + "MASK = \"[MASK]\"\n", + "CLS_IDX = 0\n", + "layer_idxs = tuple(range(12))\n", + "input_text = \"jim henson was a puppeteer\"\n", + "\n", + "pt_model_dir = \"/home/ubuntu/.pytorch-pretrained-BERT-cache/{}\".format(model_typ)\n", + "tf_bert_dir = \"/home/ubuntu/bert\"\n", + "\n", + "pt_vocab_file = os.path.join(pt_model_dir, \"vocab.txt\")\n", + "pt_init_ckpt = os.path.join(pt_model_dir, model_typ.replace(\"-\", \"_\") + \".bin\")\n", + "tf_model_dir = os.path.join(pt_model_dir, 'tf')\n", + "tf_vocab_file = os.path.join(tf_model_dir, \"vocab.txt\")\n", + "tf_init_ckpt = os.path.join(tf_model_dir, model_typ.replace(\"-\", \"_\") + \".ckpt\")\n", + "tf_config_file = os.path.join(tf_model_dir, \"bert_config.json\")\n", + "\n", + "if not os.path.isdir(tf_model_dir): \n", + " os.makedirs(tf_model_dir, exist_ok=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Tokenization" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "def tokenize(text, tokenizer):\n", + " text = text.strip().lower()\n", + " tok_ids = tokenizer.tokenize(text)\n", + " if len(tok_ids) > max_seq - 2:\n", + " tok_ids = tok_ids[:max_seq - 2]\n", + " tok_ids.insert(CLS_IDX, CLS)\n", + " tok_ids.append(SEP)\n", + " input_ids = tokenizer.convert_tokens_to_ids(tok_ids)\n", + " mask_ids = [1] * len(input_ids)\n", + " seg_ids = [0] * len(input_ids)\n", + " padding = [0] * (max_seq - len(input_ids))\n", + " input_ids += padding\n", + " mask_ids += padding\n", + " seg_ids += padding\n", + " return input_ids, mask_ids, seg_ids" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Pytorch execution" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 231508/231508 [00:00<00:00, 41092464.26B/s]\n", + "100%|██████████| 407873900/407873900 [00:07<00:00, 58092479.52B/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Pytorch embedding shape: (1, 768)\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "import torch\n", + "from pytorch_pretrained_bert import (BertConfig,\n", + " BertModel, \n", + " BertTokenizer, \n", + " BertForSequenceClassification)\n", + "\n", + "# Save Vocab\n", + "pt_tokenizer = BertTokenizer.from_pretrained(\n", + " pretrained_model_name_or_path=model_typ, \n", + " cache_dir=pt_model_dir)\n", + "pt_tokenizer.save_vocabulary(pt_model_dir)\n", + "pt_tokenizer.save_vocabulary(tf_model_dir)\n", + "\n", + "# Save Model\n", + "pt_model = BertModel.from_pretrained(\n", + " pretrained_model_name_or_path=model_typ, \n", + " cache_dir=pt_model_dir).to('cpu')\n", + "pt_model.eval()\n", + "pt_model.config.hidden_dropout_prob = 0.0\n", + "pt_model.config.attention_probs_dropout_prob = 0.0\n", + "pt_model.config.to_json_file(tf_config_file)\n", + "torch.save(pt_model.state_dict(), pt_init_ckpt)\n", + "\n", + "# Inputs\n", + "input_ids_pt, mask_ids_pt, seg_ids_pt = tokenize(input_text, pt_tokenizer)\n", + "\n", + "# PT Embedding\n", + "tok_tensor = torch.tensor(input_ids_pt).to('cpu').unsqueeze(0)\n", + "seg_tensor = torch.tensor(seg_ids_pt).to('cpu').unsqueeze(0)\n", + "msk_tensor = torch.tensor(mask_ids_pt).to('cpu').unsqueeze(0)\n", + "attn_blks, nsp_logits = pt_model(tok_tensor, seg_tensor, msk_tensor)\n", + "pt_embedding = nsp_logits.detach().numpy() \n", + "print(\"Pytorch embedding shape: {}\".format(pt_embedding.shape))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Pytorch → Tensorflow conversion" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING:tensorflow:From /home/ubuntu/anaconda3/envs/nlp/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py:263: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "Colocations handled automatically by placer.\n", + "bert/embeddings/word_embeddings initialized\n", + "bert/embeddings/position_embeddings initialized\n", + "bert/embeddings/token_type_embeddings initialized\n", + "bert/embeddings/LayerNorm/gamma initialized\n", + "bert/embeddings/LayerNorm/beta initialized\n", + "bert/encoder/layer_0/attention/self/query/kernel initialized\n", + "bert/encoder/layer_0/attention/self/query/bias initialized\n", + "bert/encoder/layer_0/attention/self/key/kernel initialized\n", + "bert/encoder/layer_0/attention/self/key/bias initialized\n", + "bert/encoder/layer_0/attention/self/value/kernel initialized\n", + "bert/encoder/layer_0/attention/self/value/bias initialized\n", + "bert/encoder/layer_0/attention/output/dense/kernel initialized\n", + "bert/encoder/layer_0/attention/output/dense/bias initialized\n", + "bert/encoder/layer_0/attention/output/LayerNorm/gamma initialized\n", + "bert/encoder/layer_0/attention/output/LayerNorm/beta initialized\n", + "bert/encoder/layer_0/intermediate/dense/kernel initialized\n", + "bert/encoder/layer_0/intermediate/dense/bias initialized\n", + "bert/encoder/layer_0/output/dense/kernel initialized\n", + "bert/encoder/layer_0/output/dense/bias initialized\n", + "bert/encoder/layer_0/output/LayerNorm/gamma initialized\n", + "bert/encoder/layer_0/output/LayerNorm/beta initialized\n", + "bert/encoder/layer_1/attention/self/query/kernel initialized\n", + "bert/encoder/layer_1/attention/self/query/bias initialized\n", + "bert/encoder/layer_1/attention/self/key/kernel initialized\n", + "bert/encoder/layer_1/attention/self/key/bias initialized\n", + "bert/encoder/layer_1/attention/self/value/kernel initialized\n", + "bert/encoder/layer_1/attention/self/value/bias initialized\n", + "bert/encoder/layer_1/attention/output/dense/kernel initialized\n", + "bert/encoder/layer_1/attention/output/dense/bias initialized\n", + "bert/encoder/layer_1/attention/output/LayerNorm/gamma initialized\n", + "bert/encoder/layer_1/attention/output/LayerNorm/beta initialized\n", + "bert/encoder/layer_1/intermediate/dense/kernel initialized\n", + "bert/encoder/layer_1/intermediate/dense/bias initialized\n", + "bert/encoder/layer_1/output/dense/kernel initialized\n", + "bert/encoder/layer_1/output/dense/bias initialized\n", + "bert/encoder/layer_1/output/LayerNorm/gamma initialized\n", + "bert/encoder/layer_1/output/LayerNorm/beta initialized\n", + "bert/encoder/layer_2/attention/self/query/kernel initialized\n", + "bert/encoder/layer_2/attention/self/query/bias initialized\n", + "bert/encoder/layer_2/attention/self/key/kernel initialized\n", + "bert/encoder/layer_2/attention/self/key/bias initialized\n", + "bert/encoder/layer_2/attention/self/value/kernel initialized\n", + "bert/encoder/layer_2/attention/self/value/bias initialized\n", + "bert/encoder/layer_2/attention/output/dense/kernel initialized\n", + "bert/encoder/layer_2/attention/output/dense/bias initialized\n", + "bert/encoder/layer_2/attention/output/LayerNorm/gamma initialized\n", + "bert/encoder/layer_2/attention/output/LayerNorm/beta initialized\n", + "bert/encoder/layer_2/intermediate/dense/kernel initialized\n", + "bert/encoder/layer_2/intermediate/dense/bias initialized\n", + "bert/encoder/layer_2/output/dense/kernel initialized\n", + "bert/encoder/layer_2/output/dense/bias initialized\n", + "bert/encoder/layer_2/output/LayerNorm/gamma initialized\n", + "bert/encoder/layer_2/output/LayerNorm/beta initialized\n", + "bert/encoder/layer_3/attention/self/query/kernel initialized\n", + "bert/encoder/layer_3/attention/self/query/bias initialized\n", + "bert/encoder/layer_3/attention/self/key/kernel initialized\n", + "bert/encoder/layer_3/attention/self/key/bias initialized\n", + "bert/encoder/layer_3/attention/self/value/kernel initialized\n", + "bert/encoder/layer_3/attention/self/value/bias initialized\n", + "bert/encoder/layer_3/attention/output/dense/kernel initialized\n", + "bert/encoder/layer_3/attention/output/dense/bias initialized\n", + "bert/encoder/layer_3/attention/output/LayerNorm/gamma initialized\n", + "bert/encoder/layer_3/attention/output/LayerNorm/beta initialized\n", + "bert/encoder/layer_3/intermediate/dense/kernel initialized\n", + "bert/encoder/layer_3/intermediate/dense/bias initialized\n", + "bert/encoder/layer_3/output/dense/kernel initialized\n", + "bert/encoder/layer_3/output/dense/bias initialized\n", + "bert/encoder/layer_3/output/LayerNorm/gamma initialized\n", + "bert/encoder/layer_3/output/LayerNorm/beta initialized\n", + "bert/encoder/layer_4/attention/self/query/kernel initialized\n", + "bert/encoder/layer_4/attention/self/query/bias initialized\n", + "bert/encoder/layer_4/attention/self/key/kernel initialized\n", + "bert/encoder/layer_4/attention/self/key/bias initialized\n", + "bert/encoder/layer_4/attention/self/value/kernel initialized\n", + "bert/encoder/layer_4/attention/self/value/bias initialized\n", + "bert/encoder/layer_4/attention/output/dense/kernel initialized\n", + "bert/encoder/layer_4/attention/output/dense/bias initialized\n", + "bert/encoder/layer_4/attention/output/LayerNorm/gamma initialized\n", + "bert/encoder/layer_4/attention/output/LayerNorm/beta initialized\n", + "bert/encoder/layer_4/intermediate/dense/kernel initialized\n", + "bert/encoder/layer_4/intermediate/dense/bias initialized\n", + "bert/encoder/layer_4/output/dense/kernel initialized\n", + "bert/encoder/layer_4/output/dense/bias initialized\n", + "bert/encoder/layer_4/output/LayerNorm/gamma initialized\n", + "bert/encoder/layer_4/output/LayerNorm/beta initialized\n", + "bert/encoder/layer_5/attention/self/query/kernel initialized\n", + "bert/encoder/layer_5/attention/self/query/bias initialized\n", + "bert/encoder/layer_5/attention/self/key/kernel initialized\n", + "bert/encoder/layer_5/attention/self/key/bias initialized\n", + "bert/encoder/layer_5/attention/self/value/kernel initialized\n", + "bert/encoder/layer_5/attention/self/value/bias initialized\n", + "bert/encoder/layer_5/attention/output/dense/kernel initialized\n", + "bert/encoder/layer_5/attention/output/dense/bias initialized\n", + "bert/encoder/layer_5/attention/output/LayerNorm/gamma initialized\n", + "bert/encoder/layer_5/attention/output/LayerNorm/beta initialized\n", + "bert/encoder/layer_5/intermediate/dense/kernel initialized\n", + "bert/encoder/layer_5/intermediate/dense/bias initialized\n", + "bert/encoder/layer_5/output/dense/kernel initialized\n", + "bert/encoder/layer_5/output/dense/bias initialized\n", + "bert/encoder/layer_5/output/LayerNorm/gamma initialized\n", + "bert/encoder/layer_5/output/LayerNorm/beta initialized\n", + "bert/encoder/layer_6/attention/self/query/kernel initialized\n", + "bert/encoder/layer_6/attention/self/query/bias initialized\n", + "bert/encoder/layer_6/attention/self/key/kernel initialized\n", + "bert/encoder/layer_6/attention/self/key/bias initialized\n", + "bert/encoder/layer_6/attention/self/value/kernel initialized\n", + "bert/encoder/layer_6/attention/self/value/bias initialized\n", + "bert/encoder/layer_6/attention/output/dense/kernel initialized\n", + "bert/encoder/layer_6/attention/output/dense/bias initialized\n", + "bert/encoder/layer_6/attention/output/LayerNorm/gamma initialized\n", + "bert/encoder/layer_6/attention/output/LayerNorm/beta initialized\n", + "bert/encoder/layer_6/intermediate/dense/kernel initialized\n", + "bert/encoder/layer_6/intermediate/dense/bias initialized\n", + "bert/encoder/layer_6/output/dense/kernel initialized\n", + "bert/encoder/layer_6/output/dense/bias initialized\n", + "bert/encoder/layer_6/output/LayerNorm/gamma initialized\n", + "bert/encoder/layer_6/output/LayerNorm/beta initialized\n", + "bert/encoder/layer_7/attention/self/query/kernel initialized\n", + "bert/encoder/layer_7/attention/self/query/bias initialized\n", + "bert/encoder/layer_7/attention/self/key/kernel initialized\n", + "bert/encoder/layer_7/attention/self/key/bias initialized\n", + "bert/encoder/layer_7/attention/self/value/kernel initialized\n", + "bert/encoder/layer_7/attention/self/value/bias initialized\n", + "bert/encoder/layer_7/attention/output/dense/kernel initialized\n", + "bert/encoder/layer_7/attention/output/dense/bias initialized\n", + "bert/encoder/layer_7/attention/output/LayerNorm/gamma initialized\n", + "bert/encoder/layer_7/attention/output/LayerNorm/beta initialized\n", + "bert/encoder/layer_7/intermediate/dense/kernel initialized\n", + "bert/encoder/layer_7/intermediate/dense/bias initialized\n", + "bert/encoder/layer_7/output/dense/kernel initialized\n", + "bert/encoder/layer_7/output/dense/bias initialized\n", + "bert/encoder/layer_7/output/LayerNorm/gamma initialized\n", + "bert/encoder/layer_7/output/LayerNorm/beta initialized\n", + "bert/encoder/layer_8/attention/self/query/kernel initialized\n", + "bert/encoder/layer_8/attention/self/query/bias initialized\n", + "bert/encoder/layer_8/attention/self/key/kernel initialized\n", + "bert/encoder/layer_8/attention/self/key/bias initialized\n", + "bert/encoder/layer_8/attention/self/value/kernel initialized\n", + "bert/encoder/layer_8/attention/self/value/bias initialized\n", + "bert/encoder/layer_8/attention/output/dense/kernel initialized\n", + "bert/encoder/layer_8/attention/output/dense/bias initialized\n", + "bert/encoder/layer_8/attention/output/LayerNorm/gamma initialized\n", + "bert/encoder/layer_8/attention/output/LayerNorm/beta initialized\n", + "bert/encoder/layer_8/intermediate/dense/kernel initialized\n", + "bert/encoder/layer_8/intermediate/dense/bias initialized\n", + "bert/encoder/layer_8/output/dense/kernel initialized\n", + "bert/encoder/layer_8/output/dense/bias initialized\n", + "bert/encoder/layer_8/output/LayerNorm/gamma initialized\n", + "bert/encoder/layer_8/output/LayerNorm/beta initialized\n", + "bert/encoder/layer_9/attention/self/query/kernel initialized\n", + "bert/encoder/layer_9/attention/self/query/bias initialized\n", + "bert/encoder/layer_9/attention/self/key/kernel initialized\n", + "bert/encoder/layer_9/attention/self/key/bias initialized\n", + "bert/encoder/layer_9/attention/self/value/kernel initialized\n", + "bert/encoder/layer_9/attention/self/value/bias initialized\n", + "bert/encoder/layer_9/attention/output/dense/kernel initialized\n", + "bert/encoder/layer_9/attention/output/dense/bias initialized\n", + "bert/encoder/layer_9/attention/output/LayerNorm/gamma initialized\n", + "bert/encoder/layer_9/attention/output/LayerNorm/beta initialized\n", + "bert/encoder/layer_9/intermediate/dense/kernel initialized\n", + "bert/encoder/layer_9/intermediate/dense/bias initialized\n", + "bert/encoder/layer_9/output/dense/kernel initialized\n", + "bert/encoder/layer_9/output/dense/bias initialized\n", + "bert/encoder/layer_9/output/LayerNorm/gamma initialized\n", + "bert/encoder/layer_9/output/LayerNorm/beta initialized\n", + "bert/encoder/layer_10/attention/self/query/kernel initialized\n", + "bert/encoder/layer_10/attention/self/query/bias initialized\n", + "bert/encoder/layer_10/attention/self/key/kernel initialized\n", + "bert/encoder/layer_10/attention/self/key/bias initialized\n", + "bert/encoder/layer_10/attention/self/value/kernel initialized\n", + "bert/encoder/layer_10/attention/self/value/bias initialized\n", + "bert/encoder/layer_10/attention/output/dense/kernel initialized\n", + "bert/encoder/layer_10/attention/output/dense/bias initialized\n", + "bert/encoder/layer_10/attention/output/LayerNorm/gamma initialized\n", + "bert/encoder/layer_10/attention/output/LayerNorm/beta initialized\n", + "bert/encoder/layer_10/intermediate/dense/kernel initialized\n", + "bert/encoder/layer_10/intermediate/dense/bias initialized\n", + "bert/encoder/layer_10/output/dense/kernel initialized\n", + "bert/encoder/layer_10/output/dense/bias initialized\n", + "bert/encoder/layer_10/output/LayerNorm/gamma initialized\n", + "bert/encoder/layer_10/output/LayerNorm/beta initialized\n", + "bert/encoder/layer_11/attention/self/query/kernel initialized\n", + "bert/encoder/layer_11/attention/self/query/bias initialized\n", + "bert/encoder/layer_11/attention/self/key/kernel initialized\n", + "bert/encoder/layer_11/attention/self/key/bias initialized\n", + "bert/encoder/layer_11/attention/self/value/kernel initialized\n", + "bert/encoder/layer_11/attention/self/value/bias initialized\n", + "bert/encoder/layer_11/attention/output/dense/kernel initialized\n", + "bert/encoder/layer_11/attention/output/dense/bias initialized\n", + "bert/encoder/layer_11/attention/output/LayerNorm/gamma initialized\n", + "bert/encoder/layer_11/attention/output/LayerNorm/beta initialized\n", + "bert/encoder/layer_11/intermediate/dense/kernel initialized\n", + "bert/encoder/layer_11/intermediate/dense/bias initialized\n", + "bert/encoder/layer_11/output/dense/kernel initialized\n", + "bert/encoder/layer_11/output/dense/bias initialized\n", + "bert/encoder/layer_11/output/LayerNorm/gamma initialized\n", + "bert/encoder/layer_11/output/LayerNorm/beta initialized\n", + "bert/pooler/dense/kernel initialized\n", + "bert/pooler/dense/bias initialized\n" + ] + } + ], + "source": [ + "from pytorch_pretrained_bert.convert_pytorch_checkpoint_to_tf import main\n", + "\n", + "main([\n", + " '--model_name', model_typ, \n", + " '--pytorch_model_path', pt_init_ckpt,\n", + " '--tf_cache_dir', tf_model_dir,\n", + " '--cache_dir', pt_model_dir\n", + "])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Tensorflow execution" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "WARNING: The TensorFlow contrib module will not be included in TensorFlow 2.0.\n", + "For more information, please see:\n", + " * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md\n", + " * https://github.com/tensorflow/addons\n", + "If you depend on functionality not listed there, please file an issue.\n", + "\n", + "WARNING:tensorflow:From /home/ubuntu/bert/modeling.py:671: dense (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "Use keras.layers.dense instead.\n", + "WARNING:tensorflow:From /home/ubuntu/anaconda3/envs/nlp/lib/python3.6/site-packages/tensorflow/python/training/saver.py:1266: checkpoint_exists (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "Use standard file APIs to check for files with this prefix.\n", + "INFO:tensorflow:Restoring parameters from /home/ubuntu/.pytorch-pretrained-BERT-cache/bert-base-uncased/tf/bert_base_uncased.ckpt\n", + "Tensorflow embedding shape: (1, 768)\n" + ] + } + ], + "source": [ + "import tensorflow as tf\n", + "sys.path.insert(0, tf_bert_dir)\n", + "import modeling\n", + "import tokenization\n", + "\n", + "tf.reset_default_graph()\n", + "\n", + "# Process text\n", + "tf_tokenizer = tokenization.FullTokenizer(vocab_file=tf_vocab_file)\n", + "\n", + "# Graph inputs\n", + "input_ids_tf, mask_ids_tf, seg_ids_tf = tokenize(input_text, tf_tokenizer)\n", + "config = modeling.BertConfig.from_json_file(\n", + " os.path.join(tf_model_dir, 'bert_config.json'))\n", + "input_tensor = tf.placeholder(\n", + " dtype=tf.int32,\n", + " shape=[1, None],\n", + " name='input_ids')\n", + "mask_tensor = tf.placeholder(\n", + " dtype=tf.int32,\n", + " shape=[1, None],\n", + " name='mask_ids')\n", + "seg_tensor = tf.placeholder(\n", + " dtype=tf.int32,\n", + " shape=[1, None],\n", + " name='seg_ids')\n", + "tf_model = modeling.BertModel(\n", + " config=config,\n", + " is_training=False,\n", + " input_ids=input_tensor,\n", + " input_mask=mask_tensor,\n", + " token_type_ids=seg_tensor,\n", + " use_one_hot_embeddings=False)\n", + "output_layer = tf_model.get_pooled_output()\n", + "\n", + "# Load tf model\n", + "session = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))\n", + "vars_to_load = [v for v in tf.global_variables()]\n", + "session.run(tf.variables_initializer(var_list=vars_to_load))\n", + "saver = tf.train.Saver(vars_to_load)\n", + "saver.restore(session, save_path=tf_init_ckpt)\n", + "\n", + "# TF Embedding\n", + "fetches = output_layer\n", + "feed_dict = {\n", + " input_tensor: [input_ids_tf],\n", + " mask_tensor: [mask_ids_tf],\n", + " seg_tensor: [seg_ids_tf]\n", + "}\n", + "tf_embedding = session.run(fetches=fetches, feed_dict=feed_dict)\n", + "print(\"Tensorflow embedding shape: {}\".format(tf_embedding.shape))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Compare Tokenization" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "TOKEN_IDS_PT: [101, 3958, 27227, 2001, 1037, 13997, 11510, 102, 0, 0, 0, 0]\n", + "TOKEN_IDS_TF: [101, 3958, 27227, 2001, 1037, 13997, 11510, 102, 0, 0, 0, 0]\n", + "SEG_IDS_PT: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n", + "SEG_IDS_TF: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n", + "MASK_IDS_PT: [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]\n", + "MASK_IDS_TF: [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]\n" + ] + } + ], + "source": [ + "print(\"TOKEN_IDS_PT: {}\".format(input_ids_pt))\n", + "print(\"TOKEN_IDS_TF: {}\".format(input_ids_tf))\n", + "print(\"SEG_IDS_PT: {}\".format(seg_ids_pt))\n", + "print(\"SEG_IDS_TF: {}\".format(seg_ids_tf))\n", + "print(\"MASK_IDS_PT: {}\".format(mask_ids_pt))\n", + "print(\"MASK_IDS_TF: {}\".format(mask_ids_tf))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Compare Model Weights" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "bert/embeddings/word_embeddings\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (30522, 768) values: [-0.01018257 -0.06154883 -0.02649689 -0.0420608 0.00116716]\n", + "TF: shape: (30522, 768) values: [-0.01018257 -0.06154883 -0.02649689 -0.0420608 0.00116716]\n", + "\n", + "bert/embeddings/token_type_embeddings\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (2, 768) values: [0.00043164 0.01098826 0.00370439 0.00150542 0.00057812]\n", + "TF: shape: (2, 768) values: [0.00043164 0.01098826 0.00370439 0.00150542 0.00057812]\n", + "\n", + "bert/embeddings/position_embeddings\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (512, 768) values: [ 0.01750538 -0.02563101 -0.03664156 -0.02528613 0.00797095]\n", + "TF: shape: (512, 768) values: [ 0.01750538 -0.02563101 -0.03664156 -0.02528613 0.00797095]\n", + "\n", + "bert/embeddings/LayerNorm/beta\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [-0.02591471 -0.0195513 0.02423946 0.08904593 -0.06281059]\n", + "TF: shape: (768,) values: [-0.02591471 -0.0195513 0.02423946 0.08904593 -0.06281059]\n", + "\n", + "bert/embeddings/LayerNorm/gamma\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [0.9260566 0.8851115 0.85807985 0.8616906 0.8937205 ]\n", + "TF: shape: (768,) values: [0.9260566 0.8851115 0.85807985 0.8616906 0.8937205 ]\n", + "\n", + "bert/encoder/layer_0/attention/self/query/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 768) values: [-0.01640572 -0.03257025 0.01046295 -0.04442816 -0.02256124]\n", + "TF: shape: (768, 768) values: [-0.01640572 -0.03257025 0.01046295 -0.04442816 -0.02256124]\n", + "\n", + "bert/encoder/layer_0/attention/self/query/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [ 0.58488506 -0.3312432 -0.43010172 0.37446147 -0.29811692]\n", + "TF: shape: (768,) values: [ 0.58488506 -0.3312432 -0.43010172 0.37446147 -0.29811692]\n", + "\n", + "bert/encoder/layer_0/attention/self/key/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 768) values: [ 0.00807745 0.02652155 -0.01866494 0.01797846 0.00450485]\n", + "TF: shape: (768, 768) values: [ 0.00807745 0.02652155 -0.01866494 0.01797846 0.00450485]\n", + "\n", + "bert/encoder/layer_0/attention/self/key/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [ 0.00104306 0.00035106 -0.0024626 -0.00010567 -0.00119283]\n", + "TF: shape: (768,) values: [ 0.00104306 0.00035106 -0.0024626 -0.00010567 -0.00119283]\n", + "\n", + "bert/encoder/layer_0/attention/self/value/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 768) values: [ 0.01144261 -0.02663044 0.01911472 -0.02206182 -0.00287949]\n", + "TF: shape: (768, 768) values: [ 0.01144261 -0.02663044 0.01911472 -0.02206182 -0.00287949]\n", + "\n", + "bert/encoder/layer_0/attention/self/value/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [-0.01184616 -0.01596605 -0.00251847 0.01736802 0.00449983]\n", + "TF: shape: (768,) values: [-0.01184616 -0.01596605 -0.00251847 0.01736802 0.00449983]\n", + "\n", + "bert/encoder/layer_0/attention/output/dense/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 768) values: [ 0.00581949 0.03170148 -0.06135742 -0.01706108 -0.00759045]\n", + "TF: shape: (768, 768) values: [ 0.00581949 0.03170148 -0.06135742 -0.01706108 -0.00759045]\n", + "\n", + "bert/encoder/layer_0/attention/output/dense/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [ 0.00511063 -0.0166625 0.02812938 -0.01166061 0.01942627]\n", + "TF: shape: (768,) values: [ 0.00511063 -0.0166625 0.02812938 -0.01166061 0.01942627]\n", + "\n", + "bert/encoder/layer_0/attention/output/LayerNorm/beta\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [ 0.25779155 -0.03077853 -0.2772697 -0.38847703 0.36841765]\n", + "TF: shape: (768,) values: [ 0.25779155 -0.03077853 -0.2772697 -0.38847703 0.36841765]\n", + "\n", + "bert/encoder/layer_0/attention/output/LayerNorm/gamma\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [0.9803408 0.959969 0.96368986 0.9603653 0.9801324 ]\n", + "TF: shape: (768,) values: [0.9803408 0.959969 0.96368986 0.9603653 0.9801324 ]\n", + "\n", + "bert/encoder/layer_0/intermediate/dense/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 3072) values: [-0.01010427 -0.060398 -0.01468864 0.00311493 0.02862451]\n", + "TF: shape: (768, 3072) values: [-0.01010427 -0.060398 -0.01468864 0.00311493 0.02862451]\n", + "\n", + "bert/encoder/layer_0/intermediate/dense/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (3072,) values: [-0.11498757 -0.09629171 -0.12399033 -0.129036 -0.06369043]\n", + "TF: shape: (3072,) values: [-0.11498757 -0.09629171 -0.12399033 -0.129036 -0.06369043]\n", + "\n", + "bert/encoder/layer_0/output/dense/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (3072, 768) values: [-0.03710171 0.0648794 0.00758566 -0.05224452 -0.04348791]\n", + "TF: shape: (3072, 768) values: [-0.03710171 0.0648794 0.00758566 -0.05224452 -0.04348791]\n", + "\n", + "bert/encoder/layer_0/output/dense/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [-0.04801027 0.19766568 0.02154854 0.02880666 0.0444298 ]\n", + "TF: shape: (768,) values: [-0.04801027 0.19766568 0.02154854 0.02880666 0.0444298 ]\n", + "\n", + "bert/encoder/layer_0/output/LayerNorm/beta\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [-0.10142924 -0.00499344 0.04274083 0.09324206 -0.10700516]\n", + "TF: shape: (768,) values: [-0.10142924 -0.00499344 0.04274083 0.09324206 -0.10700516]\n", + "\n", + "bert/encoder/layer_0/output/LayerNorm/gamma\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [0.7835125 0.8072406 0.7670588 0.73706394 0.76303864]\n", + "TF: shape: (768,) values: [0.7835125 0.8072406 0.7670588 0.73706394 0.76303864]\n", + "\n", + "bert/encoder/layer_1/attention/self/query/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 768) values: [ 0.03132744 -0.01340016 -0.07761582 0.0655639 -0.00337808]\n", + "TF: shape: (768, 768) values: [ 0.03132744 -0.01340016 -0.07761582 0.0655639 -0.00337808]\n", + "\n", + "bert/encoder/layer_1/attention/self/query/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [-0.27827993 0.17387655 -0.2497937 -0.8809636 0.41262135]\n", + "TF: shape: (768,) values: [-0.27827993 0.17387655 -0.2497937 -0.8809636 0.41262135]\n", + "\n", + "bert/encoder/layer_1/attention/self/key/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 768) values: [-0.03353037 0.04007257 0.05320328 -0.02166729 -0.03581231]\n", + "TF: shape: (768, 768) values: [-0.03353037 0.04007257 0.05320328 -0.02166729 -0.03581231]\n", + "\n", + "bert/encoder/layer_1/attention/self/key/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [-0.00504407 0.00136887 -0.00394336 0.00646125 -0.00148919]\n", + "TF: shape: (768,) values: [-0.00504407 0.00136887 -0.00394336 0.00646125 -0.00148919]\n", + "\n", + "bert/encoder/layer_1/attention/self/value/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 768) values: [-0.00464159 0.06674305 -0.00970626 -0.0276653 -0.01597566]\n", + "TF: shape: (768, 768) values: [-0.00464159 0.06674305 -0.00970626 -0.0276653 -0.01597566]\n", + "\n", + "bert/encoder/layer_1/attention/self/value/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [ 0.00381288 0.02650839 -0.0059689 -0.00508269 -0.01293722]\n", + "TF: shape: (768,) values: [ 0.00381288 0.02650839 -0.0059689 -0.00508269 -0.01293722]\n", + "\n", + "bert/encoder/layer_1/attention/output/dense/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 768) values: [-0.01390745 -0.01100563 0.01303005 -0.01969771 0.0125082 ]\n", + "TF: shape: (768, 768) values: [-0.01390745 -0.01100563 0.01303005 -0.01969771 0.0125082 ]\n", + "\n", + "bert/encoder/layer_1/attention/output/dense/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [0.02946591 0.05715097 0.01293636 0.01920356 0.00805334]\n", + "TF: shape: (768,) values: [0.02946591 0.05715097 0.01293636 0.01920356 0.00805334]\n", + "\n", + "bert/encoder/layer_1/attention/output/LayerNorm/beta\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [ 0.08583715 0.14199966 -0.0856637 -0.18797271 0.21056814]\n", + "TF: shape: (768,) values: [ 0.08583715 0.14199966 -0.0856637 -0.18797271 0.21056814]\n", + "\n", + "bert/encoder/layer_1/attention/output/LayerNorm/gamma\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [0.896962 0.87148863 0.8531161 0.8690647 0.9488987 ]\n", + "TF: shape: (768,) values: [0.896962 0.87148863 0.8531161 0.8690647 0.9488987 ]\n", + "\n", + "bert/encoder/layer_1/intermediate/dense/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 3072) values: [ 0.01841293 -0.02650284 -0.09708428 -0.01734244 -0.05529237]\n", + "TF: shape: (768, 3072) values: [ 0.01841293 -0.02650284 -0.09708428 -0.01734244 -0.05529237]\n", + "\n", + "bert/encoder/layer_1/intermediate/dense/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (3072,) values: [-0.15203774 -0.10449131 -0.08440229 -0.09323178 -0.08511415]\n", + "TF: shape: (3072,) values: [-0.15203774 -0.10449131 -0.08440229 -0.09323178 -0.08511415]\n", + "\n", + "bert/encoder/layer_1/output/dense/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (3072, 768) values: [-0.02372648 0.03326349 0.08291997 -0.01519038 0.01868557]\n", + "TF: shape: (3072, 768) values: [-0.02372648 0.03326349 0.08291997 -0.01519038 0.01868557]\n", + "\n", + "bert/encoder/layer_1/output/dense/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [-0.02514724 0.09868994 -0.027811 0.03749462 0.01086514]\n", + "TF: shape: (768,) values: [-0.02514724 0.09868994 -0.027811 0.03749462 0.01086514]\n", + "\n", + "bert/encoder/layer_1/output/LayerNorm/beta\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [-0.07662535 -0.10506564 0.03191236 0.07633785 -0.11187791]\n", + "TF: shape: (768,) values: [-0.07662535 -0.10506564 0.03191236 0.07633785 -0.11187791]\n", + "\n", + "bert/encoder/layer_1/output/LayerNorm/gamma\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [0.9017883 0.8868776 0.8862677 0.85865664 0.87496454]\n", + "TF: shape: (768,) values: [0.9017883 0.8868776 0.8862677 0.85865664 0.87496454]\n", + "\n", + "bert/encoder/layer_2/attention/self/query/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 768) values: [ 0.08433672 0.09580533 0.07543895 -0.01126779 -0.01354045]\n", + "TF: shape: (768, 768) values: [ 0.08433672 0.09580533 0.07543895 -0.01126779 -0.01354045]\n", + "\n", + "bert/encoder/layer_2/attention/self/query/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [ 0.0371241 0.03406003 0.27713948 -0.21613775 -0.05275448]\n", + "TF: shape: (768,) values: [ 0.0371241 0.03406003 0.27713948 -0.21613775 -0.05275448]\n", + "\n", + "bert/encoder/layer_2/attention/self/key/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 768) values: [ 0.04794507 0.02517631 -0.01319554 -0.02094732 0.09073472]\n", + "TF: shape: (768, 768) values: [ 0.04794507 0.02517631 -0.01319554 -0.02094732 0.09073472]\n", + "\n", + "bert/encoder/layer_2/attention/self/key/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [-0.00037404 -0.00125881 -0.00114734 -0.00157741 0.00037122]\n", + "TF: shape: (768,) values: [-0.00037404 -0.00125881 -0.00114734 -0.00157741 0.00037122]\n", + "\n", + "bert/encoder/layer_2/attention/self/value/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 768) values: [-0.01119406 -0.01488636 -0.02960914 0.04746444 0.00428481]\n", + "TF: shape: (768, 768) values: [-0.01119406 -0.01488636 -0.02960914 0.04746444 0.00428481]\n", + "\n", + "bert/encoder/layer_2/attention/self/value/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [-0.02728729 0.04979054 0.08326469 0.04150949 0.600959 ]\n", + "TF: shape: (768,) values: [-0.02728729 0.04979054 0.08326469 0.04150949 0.600959 ]\n", + "\n", + "bert/encoder/layer_2/attention/output/dense/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 768) values: [ 0.00517425 0.01197957 0.0393172 -0.0063884 -0.02673388]\n", + "TF: shape: (768, 768) values: [ 0.00517425 0.01197957 0.0393172 -0.0063884 -0.02673388]\n", + "\n", + "bert/encoder/layer_2/attention/output/dense/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [ 0.01754025 0.1226335 -0.05733554 0.06844623 0.00879776]\n", + "TF: shape: (768,) values: [ 0.01754025 0.1226335 -0.05733554 0.06844623 0.00879776]\n", + "\n", + "bert/encoder/layer_2/attention/output/LayerNorm/beta\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [ 0.1490809 0.12386955 -0.19382021 -0.26515856 0.32723007]\n", + "TF: shape: (768,) values: [ 0.1490809 0.12386955 -0.19382021 -0.26515856 0.32723007]\n", + "\n", + "bert/encoder/layer_2/attention/output/LayerNorm/gamma\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [0.8983343 0.88877076 0.86283594 0.8584952 0.9587886 ]\n", + "TF: shape: (768,) values: [0.8983343 0.88877076 0.86283594 0.8584952 0.9587886 ]\n", + "\n", + "bert/encoder/layer_2/intermediate/dense/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 3072) values: [-0.01619919 0.00662888 0.01492284 -0.01280748 0.01318596]\n", + "TF: shape: (768, 3072) values: [-0.01619919 0.00662888 0.01492284 -0.01280748 0.01318596]\n", + "\n", + "bert/encoder/layer_2/intermediate/dense/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (3072,) values: [-0.08474881 -0.12850781 -0.11550345 -0.09513011 -0.02519853]\n", + "TF: shape: (3072,) values: [-0.08474881 -0.12850781 -0.11550345 -0.09513011 -0.02519853]\n", + "\n", + "bert/encoder/layer_2/output/dense/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (3072, 768) values: [-0.07225161 -0.0129784 0.00618811 -0.01593373 -0.02160194]\n", + "TF: shape: (3072, 768) values: [-0.07225161 -0.0129784 0.00618811 -0.01593373 -0.02160194]\n", + "\n", + "bert/encoder/layer_2/output/dense/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [-0.06319264 0.06169628 -0.03041368 0.00924282 0.06277442]\n", + "TF: shape: (768,) values: [-0.06319264 0.06169628 -0.03041368 0.00924282 0.06277442]\n", + "\n", + "bert/encoder/layer_2/output/LayerNorm/beta\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [-0.1139038 -0.11665309 0.07883061 0.07796711 -0.14219187]\n", + "TF: shape: (768,) values: [-0.1139038 -0.11665309 0.07883061 0.07796711 -0.14219187]\n", + "\n", + "bert/encoder/layer_2/output/LayerNorm/gamma\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [0.8813261 0.85744697 0.8511922 0.85261875 0.8329574 ]\n", + "TF: shape: (768,) values: [0.8813261 0.85744697 0.8511922 0.85261875 0.8329574 ]\n", + "\n", + "bert/encoder/layer_3/attention/self/query/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 768) values: [ 0.05855456 -0.00111438 -0.00828963 0.04117409 -0.07591715]\n", + "TF: shape: (768, 768) values: [ 0.05855456 -0.00111438 -0.00828963 0.04117409 -0.07591715]\n", + "\n", + "bert/encoder/layer_3/attention/self/query/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [ 0.09740101 -0.19290674 0.04332267 0.17937997 -0.08023558]\n", + "TF: shape: (768,) values: [ 0.09740101 -0.19290674 0.04332267 0.17937997 -0.08023558]\n", + "\n", + "bert/encoder/layer_3/attention/self/key/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 768) values: [ 0.02562077 0.02507281 -0.03361562 0.05613289 -0.05435724]\n", + "TF: shape: (768, 768) values: [ 0.02562077 0.02507281 -0.03361562 0.05613289 -0.05435724]\n", + "\n", + "bert/encoder/layer_3/attention/self/key/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [ 0.00188639 -0.00379197 -0.01020415 0.00969649 -0.00094182]\n", + "TF: shape: (768,) values: [ 0.00188639 -0.00379197 -0.01020415 0.00969649 -0.00094182]\n", + "\n", + "bert/encoder/layer_3/attention/self/value/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 768) values: [-0.00539032 0.00959642 0.01325458 0.00490616 0.0129908 ]\n", + "TF: shape: (768, 768) values: [-0.00539032 0.00959642 0.01325458 0.00490616 0.0129908 ]\n", + "\n", + "bert/encoder/layer_3/attention/self/value/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [0.04573824 0.05405985 0.00681163 0.00655945 0.01141771]\n", + "TF: shape: (768,) values: [0.04573824 0.05405985 0.00681163 0.00655945 0.01141771]\n", + "\n", + "bert/encoder/layer_3/attention/output/dense/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 768) values: [ 0.01850341 0.03148198 0.02705758 -0.0004669 0.01367511]\n", + "TF: shape: (768, 768) values: [ 0.01850341 0.03148198 0.02705758 -0.0004669 0.01367511]\n", + "\n", + "bert/encoder/layer_3/attention/output/dense/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [ 0.01981483 0.03566506 -0.05016088 0.02958186 0.04989756]\n", + "TF: shape: (768,) values: [ 0.01981483 0.03566506 -0.05016088 0.02958186 0.04989756]\n", + "\n", + "bert/encoder/layer_3/attention/output/LayerNorm/beta\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [ 0.09815404 0.00063774 -0.01257733 -0.26485074 0.22568701]\n", + "TF: shape: (768,) values: [ 0.09815404 0.00063774 -0.01257733 -0.26485074 0.22568701]\n", + "\n", + "bert/encoder/layer_3/attention/output/LayerNorm/gamma\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [0.91457725 0.88453823 0.8340887 0.84203583 0.95247847]\n", + "TF: shape: (768,) values: [0.91457725 0.88453823 0.8340887 0.84203583 0.95247847]\n", + "\n", + "bert/encoder/layer_3/intermediate/dense/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 3072) values: [-0.02733567 0.03307878 -0.01331292 -0.00032527 0.03252084]\n", + "TF: shape: (768, 3072) values: [-0.02733567 0.03307878 -0.01331292 -0.00032527 0.03252084]\n", + "\n", + "bert/encoder/layer_3/intermediate/dense/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (3072,) values: [-0.11436842 -0.15038085 -0.07842971 0.01335877 -0.09492484]\n", + "TF: shape: (3072,) values: [-0.11436842 -0.15038085 -0.07842971 0.01335877 -0.09492484]\n", + "\n", + "bert/encoder/layer_3/output/dense/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (3072, 768) values: [-0.01751153 0.01631314 -0.02660011 0.03569947 -0.01394763]\n", + "TF: shape: (3072, 768) values: [-0.01751153 0.01631314 -0.02660011 0.03569947 -0.01394763]\n", + "\n", + "bert/encoder/layer_3/output/dense/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [-0.03873252 0.08414765 -0.0399323 0.01997361 0.12924597]\n", + "TF: shape: (768,) values: [-0.03873252 0.08414765 -0.0399323 0.01997361 0.12924597]\n", + "\n", + "bert/encoder/layer_3/output/LayerNorm/beta\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [-0.08049371 -0.06923949 -0.03357155 0.05231095 -0.09717073]\n", + "TF: shape: (768,) values: [-0.08049371 -0.06923949 -0.03357155 0.05231095 -0.09717073]\n", + "\n", + "bert/encoder/layer_3/output/LayerNorm/gamma\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [0.827748 0.83012533 0.82399255 0.81772 0.80794513]\n", + "TF: shape: (768,) values: [0.827748 0.83012533 0.82399255 0.81772 0.80794513]\n", + "\n", + "bert/encoder/layer_4/attention/self/query/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 768) values: [ 0.08296382 0.02076941 0.06525186 -0.02659729 0.03491377]\n", + "TF: shape: (768, 768) values: [ 0.08296382 0.02076941 0.06525186 -0.02659729 0.03491377]\n", + "\n", + "bert/encoder/layer_4/attention/self/query/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [ 0.07045844 -0.13412629 -0.0514146 0.00061329 0.1248519 ]\n", + "TF: shape: (768,) values: [ 0.07045844 -0.13412629 -0.0514146 0.00061329 0.1248519 ]\n", + "\n", + "bert/encoder/layer_4/attention/self/key/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 768) values: [ 0.06941643 0.08133814 -0.0453992 0.0668715 -0.06014847]\n", + "TF: shape: (768, 768) values: [ 0.06941643 0.08133814 -0.0453992 0.0668715 -0.06014847]\n", + "\n", + "bert/encoder/layer_4/attention/self/key/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [-0.00588725 -0.00235185 0.00281131 0.00173088 -0.00546653]\n", + "TF: shape: (768,) values: [-0.00588725 -0.00235185 0.00281131 0.00173088 -0.00546653]\n", + "\n", + "bert/encoder/layer_4/attention/self/value/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 768) values: [ 0.06889665 0.06645385 0.01232084 0.0132611 -0.01595679]\n", + "TF: shape: (768, 768) values: [ 0.06889665 0.06645385 0.01232084 0.0132611 -0.01595679]\n", + "\n", + "bert/encoder/layer_4/attention/self/value/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [-0.01126871 -0.02704018 0.0301532 0.02332082 -0.04233487]\n", + "TF: shape: (768,) values: [-0.01126871 -0.02704018 0.0301532 0.02332082 -0.04233487]\n", + "\n", + "bert/encoder/layer_4/attention/output/dense/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 768) values: [ 0.02285513 -0.04172142 -0.0146292 0.04862929 -0.0442014 ]\n", + "TF: shape: (768, 768) values: [ 0.02285513 -0.04172142 -0.0146292 0.04862929 -0.0442014 ]\n", + "\n", + "bert/encoder/layer_4/attention/output/dense/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [ 0.03054528 0.00479777 -0.02729505 -0.0325212 -0.00525727]\n", + "TF: shape: (768,) values: [ 0.03054528 0.00479777 -0.02729505 -0.0325212 -0.00525727]\n", + "\n", + "bert/encoder/layer_4/attention/output/LayerNorm/beta\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [ 0.00903359 0.0052285 -0.02841488 -0.22355485 0.28281343]\n", + "TF: shape: (768,) values: [ 0.00903359 0.0052285 -0.02841488 -0.22355485 0.28281343]\n", + "\n", + "bert/encoder/layer_4/attention/output/LayerNorm/gamma\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [0.8849676 0.86927813 0.8114595 0.80269504 0.94864094]\n", + "TF: shape: (768,) values: [0.8849676 0.86927813 0.8114595 0.80269504 0.94864094]\n", + "\n", + "bert/encoder/layer_4/intermediate/dense/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 3072) values: [-0.00639783 0.06198016 -0.03184223 0.00485356 -0.02453273]\n", + "TF: shape: (768, 3072) values: [-0.00639783 0.06198016 -0.03184223 0.00485356 -0.02453273]\n", + "\n", + "bert/encoder/layer_4/intermediate/dense/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (3072,) values: [-0.08770327 -0.11779705 -0.11764182 -0.00192611 -0.1335473 ]\n", + "TF: shape: (3072,) values: [-0.08770327 -0.11779705 -0.11764182 -0.00192611 -0.1335473 ]\n", + "\n", + "bert/encoder/layer_4/output/dense/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (3072, 768) values: [-0.05421264 0.0221118 -0.02674172 0.03672203 -0.02399626]\n", + "TF: shape: (3072, 768) values: [-0.05421264 0.0221118 -0.02674172 0.03672203 -0.02399626]\n", + "\n", + "bert/encoder/layer_4/output/dense/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [-0.05068972 0.04838871 0.01156022 0.05381602 0.08857913]\n", + "TF: shape: (768,) values: [-0.05068972 0.04838871 0.01156022 0.05381602 0.08857913]\n", + "\n", + "bert/encoder/layer_4/output/LayerNorm/beta\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [-0.04338909 -0.0781464 -0.01518662 0.04936362 -0.12378412]\n", + "TF: shape: (768,) values: [-0.04338909 -0.0781464 -0.01518662 0.04936362 -0.12378412]\n", + "\n", + "bert/encoder/layer_4/output/LayerNorm/gamma\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [0.8734387 0.8576282 0.8339444 0.8450325 0.8105372]\n", + "TF: shape: (768,) values: [0.8734387 0.8576282 0.8339444 0.8450325 0.8105372]\n", + "\n", + "bert/encoder/layer_5/attention/self/query/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 768) values: [-0.00858843 -0.03920127 0.02552994 -0.02786552 0.02436485]\n", + "TF: shape: (768, 768) values: [-0.00858843 -0.03920127 0.02552994 -0.02786552 0.02436485]\n", + "\n", + "bert/encoder/layer_5/attention/self/query/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [-0.00859117 -0.01642405 -0.04391079 0.01085692 0.02925887]\n", + "TF: shape: (768,) values: [-0.00859117 -0.01642405 -0.04391079 0.01085692 0.02925887]\n", + "\n", + "bert/encoder/layer_5/attention/self/key/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 768) values: [ 0.00352847 0.02330176 -0.00369894 -0.03904612 0.00294574]\n", + "TF: shape: (768, 768) values: [ 0.00352847 0.02330176 -0.00369894 -0.03904612 0.00294574]\n", + "\n", + "bert/encoder/layer_5/attention/self/key/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [-0.01087186 -0.01176561 0.00016575 -0.01163023 0.00946616]\n", + "TF: shape: (768,) values: [-0.01087186 -0.01176561 0.00016575 -0.01163023 0.00946616]\n", + "\n", + "bert/encoder/layer_5/attention/self/value/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 768) values: [ 0.06134222 0.04238288 0.02796064 -0.01284983 0.03683741]\n", + "TF: shape: (768, 768) values: [ 0.06134222 0.04238288 0.02796064 -0.01284983 0.03683741]\n", + "\n", + "bert/encoder/layer_5/attention/self/value/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [ 0.05061118 -0.02954445 -0.0034053 -0.00025261 0.0437019 ]\n", + "TF: shape: (768,) values: [ 0.05061118 -0.02954445 -0.0034053 -0.00025261 0.0437019 ]\n", + "\n", + "bert/encoder/layer_5/attention/output/dense/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 768) values: [-0.00739815 0.0533964 -0.03736389 -0.04999201 0.01693069]\n", + "TF: shape: (768, 768) values: [-0.00739815 0.0533964 -0.03736389 -0.04999201 0.01693069]\n", + "\n", + "bert/encoder/layer_5/attention/output/dense/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [-0.0021682 0.01711399 -0.04201518 0.01605333 0.00552063]\n", + "TF: shape: (768,) values: [-0.0021682 0.01711399 -0.04201518 0.01605333 0.00552063]\n", + "\n", + "bert/encoder/layer_5/attention/output/LayerNorm/beta\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [-0.06841327 -0.0146848 0.09792476 -0.23284538 0.2785602 ]\n", + "TF: shape: (768,) values: [-0.06841327 -0.0146848 0.09792476 -0.23284538 0.2785602 ]\n", + "\n", + "bert/encoder/layer_5/attention/output/LayerNorm/gamma\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [0.8908311 0.87884724 0.81637293 0.8047641 0.96539867]\n", + "TF: shape: (768,) values: [0.8908311 0.87884724 0.81637293 0.8047641 0.96539867]\n", + "\n", + "bert/encoder/layer_5/intermediate/dense/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 3072) values: [-0.03246041 0.07251058 -0.08201726 0.00772481 0.02532209]\n", + "TF: shape: (768, 3072) values: [-0.03246041 0.07251058 -0.08201726 0.00772481 0.02532209]\n", + "\n", + "bert/encoder/layer_5/intermediate/dense/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (3072,) values: [-0.09689714 -0.27696273 -0.13047501 -0.10892326 -0.1057625 ]\n", + "TF: shape: (3072,) values: [-0.09689714 -0.27696273 -0.13047501 -0.10892326 -0.1057625 ]\n", + "\n", + "bert/encoder/layer_5/output/dense/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (3072, 768) values: [ 0.0642072 -0.01738782 -0.05095377 0.00523853 0.04425264]\n", + "TF: shape: (3072, 768) values: [ 0.0642072 -0.01738782 -0.05095377 0.00523853 0.04425264]\n", + "\n", + "bert/encoder/layer_5/output/dense/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [-0.0007217 0.06006297 0.0016595 0.03848181 0.06703516]\n", + "TF: shape: (768,) values: [-0.0007217 0.06006297 0.0016595 0.03848181 0.06703516]\n", + "\n", + "bert/encoder/layer_5/output/LayerNorm/beta\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [-0.00278729 -0.05594506 -0.0631047 0.06023621 -0.18672828]\n", + "TF: shape: (768,) values: [-0.00278729 -0.05594506 -0.0631047 0.06023621 -0.18672828]\n", + "\n", + "bert/encoder/layer_5/output/LayerNorm/gamma\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [0.8621183 0.8515807 0.82654256 0.81729776 0.7985204 ]\n", + "TF: shape: (768,) values: [0.8621183 0.8515807 0.82654256 0.81729776 0.7985204 ]\n", + "\n", + "bert/encoder/layer_6/attention/self/query/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 768) values: [-0.02527807 -0.01429243 0.01467054 0.08624706 -0.00188593]\n", + "TF: shape: (768, 768) values: [-0.02527807 -0.01429243 0.01467054 0.08624706 -0.00188593]\n", + "\n", + "bert/encoder/layer_6/attention/self/query/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [-0.17319514 0.27564248 0.16801168 -0.10946485 0.1643271 ]\n", + "TF: shape: (768,) values: [-0.17319514 0.27564248 0.16801168 -0.10946485 0.1643271 ]\n", + "\n", + "bert/encoder/layer_6/attention/self/key/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 768) values: [ 0.05886372 0.00706217 0.0398422 0.00882155 -0.04571463]\n", + "TF: shape: (768, 768) values: [ 0.05886372 0.00706217 0.0398422 0.00882155 -0.04571463]\n", + "\n", + "bert/encoder/layer_6/attention/self/key/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [-0.00424696 -0.0001192 0.0046079 -0.00315606 0.00434314]\n", + "TF: shape: (768,) values: [-0.00424696 -0.0001192 0.0046079 -0.00315606 0.00434314]\n", + "\n", + "bert/encoder/layer_6/attention/self/value/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 768) values: [-0.01720381 0.01170722 0.02346902 -0.02284313 -0.03173028]\n", + "TF: shape: (768, 768) values: [-0.01720381 0.01170722 0.02346902 -0.02284313 -0.03173028]\n", + "\n", + "bert/encoder/layer_6/attention/self/value/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [-0.03492057 0.01813157 -0.00182878 -0.01420629 -0.00508944]\n", + "TF: shape: (768,) values: [-0.03492057 0.01813157 -0.00182878 -0.01420629 -0.00508944]\n", + "\n", + "bert/encoder/layer_6/attention/output/dense/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 768) values: [ 0.0323688 -0.00689882 0.07379091 0.01121114 -0.02059202]\n", + "TF: shape: (768, 768) values: [ 0.0323688 -0.00689882 0.07379091 0.01121114 -0.02059202]\n", + "\n", + "bert/encoder/layer_6/attention/output/dense/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [-0.00648672 -0.05935453 -0.05673229 -0.01152384 -0.02766573]\n", + "TF: shape: (768,) values: [-0.00648672 -0.05935453 -0.05673229 -0.01152384 -0.02766573]\n", + "\n", + "bert/encoder/layer_6/attention/output/LayerNorm/beta\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [-0.06793639 0.03157783 0.15647687 -0.15025291 0.14727171]\n", + "TF: shape: (768,) values: [-0.06793639 0.03157783 0.15647687 -0.15025291 0.14727171]\n", + "\n", + "bert/encoder/layer_6/attention/output/LayerNorm/gamma\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [0.8882361 0.8704905 0.80289173 0.77365315 0.92333615]\n", + "TF: shape: (768,) values: [0.8882361 0.8704905 0.80289173 0.77365315 0.92333615]\n", + "\n", + "bert/encoder/layer_6/intermediate/dense/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 3072) values: [ 0.04492201 0.05160861 0.09041415 -0.00742628 0.048133 ]\n", + "TF: shape: (768, 3072) values: [ 0.04492201 0.05160861 0.09041415 -0.00742628 0.048133 ]\n", + "\n", + "bert/encoder/layer_6/intermediate/dense/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (3072,) values: [-0.09301704 -0.158612 -0.10633879 -0.09706812 -0.17319229]\n", + "TF: shape: (3072,) values: [-0.09301704 -0.158612 -0.10633879 -0.09706812 -0.17319229]\n", + "\n", + "bert/encoder/layer_6/output/dense/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (3072, 768) values: [-0.00085372 -0.00974195 0.00684915 0.00038686 0.06610142]\n", + "TF: shape: (3072, 768) values: [-0.00085372 -0.00974195 0.00684915 0.00038686 0.06610142]\n", + "\n", + "bert/encoder/layer_6/output/dense/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [-0.03254414 0.05681704 0.03720434 0.01936359 0.09134153]\n", + "TF: shape: (768,) values: [-0.03254414 0.05681704 0.03720434 0.01936359 0.09134153]\n", + "\n", + "bert/encoder/layer_6/output/LayerNorm/beta\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [-0.0117129 -0.03209404 -0.08646043 0.03760341 -0.13841423]\n", + "TF: shape: (768,) values: [-0.0117129 -0.03209404 -0.08646043 0.03760341 -0.13841423]\n", + "\n", + "bert/encoder/layer_6/output/LayerNorm/gamma\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [0.8674175 0.8657014 0.8151861 0.82301307 0.8305737 ]\n", + "TF: shape: (768,) values: [0.8674175 0.8657014 0.8151861 0.82301307 0.8305737 ]\n", + "\n", + "bert/encoder/layer_7/attention/self/query/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 768) values: [-0.00075523 -0.01501983 0.04090893 0.01884826 0.04670674]\n", + "TF: shape: (768, 768) values: [-0.00075523 -0.01501983 0.04090893 0.01884826 0.04670674]\n", + "\n", + "bert/encoder/layer_7/attention/self/query/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [ 0.0010344 -0.00423982 0.3117479 0.04494623 -0.01260845]\n", + "TF: shape: (768,) values: [ 0.0010344 -0.00423982 0.3117479 0.04494623 -0.01260845]\n", + "\n", + "bert/encoder/layer_7/attention/self/key/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 768) values: [ 0.02781927 -0.00906972 0.02121989 0.0298591 0.05854786]\n", + "TF: shape: (768, 768) values: [ 0.02781927 -0.00906972 0.02121989 0.0298591 0.05854786]\n", + "\n", + "bert/encoder/layer_7/attention/self/key/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [-0.00074918 0.00731079 0.00089338 0.00345652 0.00043817]\n", + "TF: shape: (768,) values: [-0.00074918 0.00731079 0.00089338 0.00345652 0.00043817]\n", + "\n", + "bert/encoder/layer_7/attention/self/value/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 768) values: [-0.01080035 -0.03468366 0.03167168 0.01583073 0.0327719 ]\n", + "TF: shape: (768, 768) values: [-0.01080035 -0.03468366 0.03167168 0.01583073 0.0327719 ]\n", + "\n", + "bert/encoder/layer_7/attention/self/value/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [-0.02824226 0.01605172 0.00067929 -0.04553111 0.0076044 ]\n", + "TF: shape: (768,) values: [-0.02824226 0.01605172 0.00067929 -0.04553111 0.0076044 ]\n", + "\n", + "bert/encoder/layer_7/attention/output/dense/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 768) values: [-0.05496112 0.01006968 0.02206531 -0.01873116 0.02149118]\n", + "TF: shape: (768, 768) values: [-0.05496112 0.01006968 0.02206531 -0.01873116 0.02149118]\n", + "\n", + "bert/encoder/layer_7/attention/output/dense/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [ 0.00349772 -0.05831751 -0.0594084 -0.0342187 0.02965918]\n", + "TF: shape: (768,) values: [ 0.00349772 -0.05831751 -0.0594084 -0.0342187 0.02965918]\n", + "\n", + "bert/encoder/layer_7/attention/output/LayerNorm/beta\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [-0.02826844 0.04427591 0.05678326 -0.0475907 0.16136196]\n", + "TF: shape: (768,) values: [-0.02826844 0.04427591 0.05678326 -0.0475907 0.16136196]\n", + "\n", + "bert/encoder/layer_7/attention/output/LayerNorm/gamma\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [0.8742141 0.870608 0.79147685 0.7595279 0.9223656 ]\n", + "TF: shape: (768,) values: [0.8742141 0.870608 0.79147685 0.7595279 0.9223656 ]\n", + "\n", + "bert/encoder/layer_7/intermediate/dense/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 3072) values: [ 0.03598932 -0.12225644 0.03019998 0.05691092 0.03717208]\n", + "TF: shape: (768, 3072) values: [ 0.03598932 -0.12225644 0.03019998 0.05691092 0.03717208]\n", + "\n", + "bert/encoder/layer_7/intermediate/dense/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (3072,) values: [-0.12465011 -0.08639494 -0.06206005 -0.08012587 -0.08773767]\n", + "TF: shape: (3072,) values: [-0.12465011 -0.08639494 -0.06206005 -0.08012587 -0.08773767]\n", + "\n", + "bert/encoder/layer_7/output/dense/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (3072, 768) values: [-0.02190432 -0.02279165 0.03279508 0.01011065 -0.07793335]\n", + "TF: shape: (3072, 768) values: [-0.02190432 -0.02279165 0.03279508 0.01011065 -0.07793335]\n", + "\n", + "bert/encoder/layer_7/output/dense/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [-0.04282642 0.03700675 0.06142357 -0.04787201 0.02958163]\n", + "TF: shape: (768,) values: [-0.04282642 0.03700675 0.06142357 -0.04787201 0.02958163]\n", + "\n", + "bert/encoder/layer_7/output/LayerNorm/beta\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [-0.03142036 -0.04358427 -0.05132087 -0.01788123 -0.16399944]\n", + "TF: shape: (768,) values: [-0.03142036 -0.04358427 -0.05132087 -0.01788123 -0.16399944]\n", + "\n", + "bert/encoder/layer_7/output/LayerNorm/gamma\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [0.83858097 0.8179645 0.80693793 0.81225365 0.7844832 ]\n", + "TF: shape: (768,) values: [0.83858097 0.8179645 0.80693793 0.81225365 0.7844832 ]\n", + "\n", + "bert/encoder/layer_8/attention/self/query/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 768) values: [0.0448719 0.02289526 0.03083764 0.03048073 0.02436891]\n", + "TF: shape: (768, 768) values: [0.0448719 0.02289526 0.03083764 0.03048073 0.02436891]\n", + "\n", + "bert/encoder/layer_8/attention/self/query/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [-0.25132924 -0.23753347 0.02581017 0.00901509 0.18424493]\n", + "TF: shape: (768,) values: [-0.25132924 -0.23753347 0.02581017 0.00901509 0.18424493]\n", + "\n", + "bert/encoder/layer_8/attention/self/key/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 768) values: [-0.01999719 0.00711403 0.03949134 -0.0102224 0.03152475]\n", + "TF: shape: (768, 768) values: [-0.01999719 0.00711403 0.03949134 -0.0102224 0.03152475]\n", + "\n", + "bert/encoder/layer_8/attention/self/key/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [ 5.5668897e-05 3.4638541e-03 -1.7605867e-03 -6.1321147e-03\n", + " -4.4074579e-04]\n", + "TF: shape: (768,) values: [ 5.5668897e-05 3.4638541e-03 -1.7605867e-03 -6.1321147e-03\n", + " -4.4074579e-04]\n", + "\n", + "bert/encoder/layer_8/attention/self/value/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 768) values: [-0.00736056 -0.01795213 0.00104576 -0.00034653 0.03190543]\n", + "TF: shape: (768, 768) values: [-0.00736056 -0.01795213 0.00104576 -0.00034653 0.03190543]\n", + "\n", + "bert/encoder/layer_8/attention/self/value/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [ 0.02892835 0.00642501 -0.03608712 0.00264269 -0.0245198 ]\n", + "TF: shape: (768,) values: [ 0.02892835 0.00642501 -0.03608712 0.00264269 -0.0245198 ]\n", + "\n", + "bert/encoder/layer_8/attention/output/dense/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 768) values: [ 0.03971623 0.05307067 -0.01298818 0.00946693 -0.00121235]\n", + "TF: shape: (768, 768) values: [ 0.03971623 0.05307067 -0.01298818 0.00946693 -0.00121235]\n", + "\n", + "bert/encoder/layer_8/attention/output/dense/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [ 0.01468131 -0.05406622 -0.06289103 0.004484 0.0240819 ]\n", + "TF: shape: (768,) values: [ 0.01468131 -0.05406622 -0.06289103 0.004484 0.0240819 ]\n", + "\n", + "bert/encoder/layer_8/attention/output/LayerNorm/beta\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [-0.06004262 0.0457275 0.08688109 -0.14416659 -0.05500487]\n", + "TF: shape: (768,) values: [-0.06004262 0.0457275 0.08688109 -0.14416659 -0.05500487]\n", + "\n", + "bert/encoder/layer_8/attention/output/LayerNorm/gamma\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [0.8907534 0.89116573 0.811639 0.7810443 0.9045574 ]\n", + "TF: shape: (768,) values: [0.8907534 0.89116573 0.811639 0.7810443 0.9045574 ]\n", + "\n", + "bert/encoder/layer_8/intermediate/dense/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 3072) values: [-0.01962814 -0.01482586 -0.02292624 0.03397145 0.02457482]\n", + "TF: shape: (768, 3072) values: [-0.01962814 -0.01482586 -0.02292624 0.03397145 0.02457482]\n", + "\n", + "bert/encoder/layer_8/intermediate/dense/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (3072,) values: [-0.08129632 -0.1691108 -0.10681771 -0.10392351 -0.13120006]\n", + "TF: shape: (3072,) values: [-0.08129632 -0.1691108 -0.10681771 -0.10392351 -0.13120006]\n", + "\n", + "bert/encoder/layer_8/output/dense/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (3072, 768) values: [-0.04683433 -0.02690669 0.02979059 0.02223369 -0.00130287]\n", + "TF: shape: (3072, 768) values: [-0.04683433 -0.02690669 0.02979059 0.02223369 -0.00130287]\n", + "\n", + "bert/encoder/layer_8/output/dense/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [-0.09155537 -0.04465394 0.05649116 -0.09628641 0.11875238]\n", + "TF: shape: (768,) values: [-0.09155537 -0.04465394 0.05649116 -0.09628641 0.11875238]\n", + "\n", + "bert/encoder/layer_8/output/LayerNorm/beta\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [-0.06043394 -0.06657387 -0.05341128 -0.00374733 -0.10855272]\n", + "TF: shape: (768,) values: [-0.06043394 -0.06657387 -0.05341128 -0.00374733 -0.10855272]\n", + "\n", + "bert/encoder/layer_8/output/LayerNorm/gamma\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [0.84467345 0.84421015 0.82582206 0.84553087 0.8207573 ]\n", + "TF: shape: (768,) values: [0.84467345 0.84421015 0.82582206 0.84553087 0.8207573 ]\n", + "\n", + "bert/encoder/layer_9/attention/self/query/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 768) values: [ 0.08004542 -0.0143706 -0.04219061 -0.05175152 -0.01147588]\n", + "TF: shape: (768, 768) values: [ 0.08004542 -0.0143706 -0.04219061 -0.05175152 -0.01147588]\n", + "\n", + "bert/encoder/layer_9/attention/self/query/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [-0.14508031 0.40926442 -0.3281781 -0.02869792 -0.26104516]\n", + "TF: shape: (768,) values: [-0.14508031 0.40926442 -0.3281781 -0.02869792 -0.26104516]\n", + "\n", + "bert/encoder/layer_9/attention/self/key/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 768) values: [-0.01337681 0.00615428 -0.0455939 0.03379053 -0.01992556]\n", + "TF: shape: (768, 768) values: [-0.01337681 0.00615428 -0.0455939 0.03379053 -0.01992556]\n", + "\n", + "bert/encoder/layer_9/attention/self/key/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [-0.0051302 0.0083288 0.00377641 0.00928865 -0.00418182]\n", + "TF: shape: (768,) values: [-0.0051302 0.0083288 0.00377641 0.00928865 -0.00418182]\n", + "\n", + "bert/encoder/layer_9/attention/self/value/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 768) values: [-0.02485976 -0.0301923 0.00984638 -0.02495162 0.01074037]\n", + "TF: shape: (768, 768) values: [-0.02485976 -0.0301923 0.00984638 -0.02495162 0.01074037]\n", + "\n", + "bert/encoder/layer_9/attention/self/value/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [-0.04229928 -0.02636711 0.0060447 0.00222829 0.04979481]\n", + "TF: shape: (768,) values: [-0.04229928 -0.02636711 0.0060447 0.00222829 0.04979481]\n", + "\n", + "bert/encoder/layer_9/attention/output/dense/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 768) values: [-0.01258144 0.00871274 0.00482882 -0.00675888 -0.04390825]\n", + "TF: shape: (768, 768) values: [-0.01258144 0.00871274 0.00482882 -0.00675888 -0.04390825]\n", + "\n", + "bert/encoder/layer_9/attention/output/dense/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [ 0.02457753 0.05051134 -0.06890804 -0.00962795 0.00864793]\n", + "TF: shape: (768,) values: [ 0.02457753 0.05051134 -0.06890804 -0.00962795 0.00864793]\n", + "\n", + "bert/encoder/layer_9/attention/output/LayerNorm/beta\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [-0.08963391 -0.06362236 0.0676669 -0.09895685 0.08318913]\n", + "TF: shape: (768,) values: [-0.08963391 -0.06362236 0.0676669 -0.09895685 0.08318913]\n", + "\n", + "bert/encoder/layer_9/attention/output/LayerNorm/gamma\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [0.85100883 0.82569736 0.7927931 0.7660444 0.8912934 ]\n", + "TF: shape: (768,) values: [0.85100883 0.82569736 0.7927931 0.7660444 0.8912934 ]\n", + "\n", + "bert/encoder/layer_9/intermediate/dense/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 3072) values: [ 0.06290598 0.0203122 -0.05384256 0.05442941 0.00484769]\n", + "TF: shape: (768, 3072) values: [ 0.06290598 0.0203122 -0.05384256 0.05442941 0.00484769]\n", + "\n", + "bert/encoder/layer_9/intermediate/dense/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (3072,) values: [-0.10818483 -0.00169527 -0.08962701 -0.10280421 -0.14310956]\n", + "TF: shape: (3072,) values: [-0.10818483 -0.00169527 -0.08962701 -0.10280421 -0.14310956]\n", + "\n", + "bert/encoder/layer_9/output/dense/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (3072, 768) values: [ 0.05487705 0.01644666 0.00436198 -0.00490768 -0.03238423]\n", + "TF: shape: (3072, 768) values: [ 0.05487705 0.01644666 0.00436198 -0.00490768 -0.03238423]\n", + "\n", + "bert/encoder/layer_9/output/dense/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [-0.08755219 -0.01910074 -0.02988298 -0.08150438 0.09897955]\n", + "TF: shape: (768,) values: [-0.08755219 -0.01910074 -0.02988298 -0.08150438 0.09897955]\n", + "\n", + "bert/encoder/layer_9/output/LayerNorm/beta\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [-0.04136161 -0.02113917 -0.07581077 -0.00809791 -0.09790538]\n", + "TF: shape: (768,) values: [-0.04136161 -0.02113917 -0.07581077 -0.00809791 -0.09790538]\n", + "\n", + "bert/encoder/layer_9/output/LayerNorm/gamma\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [0.8250572 0.83477134 0.7794141 0.81264955 0.7827918 ]\n", + "TF: shape: (768,) values: [0.8250572 0.83477134 0.7794141 0.81264955 0.7827918 ]\n", + "\n", + "bert/encoder/layer_10/attention/self/query/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 768) values: [ 0.00071212 -0.00853064 0.01776993 0.03189976 0.02183623]\n", + "TF: shape: (768, 768) values: [ 0.00071212 -0.00853064 0.01776993 0.03189976 0.02183623]\n", + "\n", + "bert/encoder/layer_10/attention/self/query/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [-0.03667567 -0.01449654 -0.03822913 0.00118343 -0.05489838]\n", + "TF: shape: (768,) values: [-0.03667567 -0.01449654 -0.03822913 0.00118343 -0.05489838]\n", + "\n", + "bert/encoder/layer_10/attention/self/key/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 768) values: [-0.0494106 0.05531096 -0.02459413 -0.06019118 -0.02829785]\n", + "TF: shape: (768, 768) values: [-0.0494106 0.05531096 -0.02459413 -0.06019118 -0.02829785]\n", + "\n", + "bert/encoder/layer_10/attention/self/key/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [-0.00692997 0.00855893 0.00670777 -0.0052475 -0.00017074]\n", + "TF: shape: (768,) values: [-0.00692997 0.00855893 0.00670777 -0.0052475 -0.00017074]\n", + "\n", + "bert/encoder/layer_10/attention/self/value/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 768) values: [ 0.01911842 0.04858809 -0.02608485 0.00794924 -0.02246636]\n", + "TF: shape: (768, 768) values: [ 0.01911842 0.04858809 -0.02608485 0.00794924 -0.02246636]\n", + "\n", + "bert/encoder/layer_10/attention/self/value/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [-0.0133503 -0.01224133 -0.0051834 -0.00232528 0.00148614]\n", + "TF: shape: (768,) values: [-0.0133503 -0.01224133 -0.0051834 -0.00232528 0.00148614]\n", + "\n", + "bert/encoder/layer_10/attention/output/dense/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 768) values: [-0.05904732 0.02616 0.00794104 -0.02889086 -0.03692576]\n", + "TF: shape: (768, 768) values: [-0.05904732 0.02616 0.00794104 -0.02889086 -0.03692576]\n", + "\n", + "bert/encoder/layer_10/attention/output/dense/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [0.02089205 0.01458059 0.05217785 0.0324267 0.00907548]\n", + "TF: shape: (768,) values: [0.02089205 0.01458059 0.05217785 0.0324267 0.00907548]\n", + "\n", + "bert/encoder/layer_10/attention/output/LayerNorm/beta\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [-0.10986238 -0.04332284 0.02603893 -0.06236923 0.14469369]\n", + "TF: shape: (768,) values: [-0.10986238 -0.04332284 0.02603893 -0.06236923 0.14469369]\n", + "\n", + "bert/encoder/layer_10/attention/output/LayerNorm/gamma\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [0.8515822 0.81392974 0.836747 0.78040504 0.88091415]\n", + "TF: shape: (768,) values: [0.8515822 0.81392974 0.836747 0.78040504 0.88091415]\n", + "\n", + "bert/encoder/layer_10/intermediate/dense/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 3072) values: [-0.07061081 0.06997397 0.01433633 0.04150929 0.02865192]\n", + "TF: shape: (768, 3072) values: [-0.07061081 0.06997397 0.01433633 0.04150929 0.02865192]\n", + "\n", + "bert/encoder/layer_10/intermediate/dense/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (3072,) values: [-0.13879126 -0.06401426 -0.1408043 -0.15043251 -0.10193057]\n", + "TF: shape: (3072,) values: [-0.13879126 -0.06401426 -0.1408043 -0.15043251 -0.10193057]\n", + "\n", + "bert/encoder/layer_10/output/dense/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (3072, 768) values: [ 0.02918765 0.02609882 -0.02259856 0.01636725 -0.00038442]\n", + "TF: shape: (3072, 768) values: [ 0.02918765 0.02609882 -0.02259856 0.01636725 -0.00038442]\n", + "\n", + "bert/encoder/layer_10/output/dense/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [-0.01799502 0.10970547 -0.02384165 -0.03350981 0.10491351]\n", + "TF: shape: (768,) values: [-0.01799502 0.10970547 -0.02384165 -0.03350981 0.10491351]\n", + "\n", + "bert/encoder/layer_10/output/LayerNorm/beta\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [ 0.00999107 -0.0217309 -0.0854177 -0.01109101 -0.07902174]\n", + "TF: shape: (768,) values: [ 0.00999107 -0.0217309 -0.0854177 -0.01109101 -0.07902174]\n", + "\n", + "bert/encoder/layer_10/output/LayerNorm/gamma\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [0.8272796 0.8597452 0.79116803 0.81267637 0.8273501 ]\n", + "TF: shape: (768,) values: [0.8272796 0.8597452 0.79116803 0.81267637 0.8273501 ]\n", + "\n", + "bert/encoder/layer_11/attention/self/query/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 768) values: [-0.04141425 -0.06491017 -0.03202523 0.06226195 0.02193764]\n", + "TF: shape: (768, 768) values: [-0.04141425 -0.06491017 -0.03202523 0.06226195 0.02193764]\n", + "\n", + "bert/encoder/layer_11/attention/self/query/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [ 0.0501296 0.11886728 0.2186807 0.08720991 -0.20476632]\n", + "TF: shape: (768,) values: [ 0.0501296 0.11886728 0.2186807 0.08720991 -0.20476632]\n", + "\n", + "bert/encoder/layer_11/attention/self/key/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 768) values: [ 0.02634268 -0.01357682 -0.06076496 0.04210597 0.01783857]\n", + "TF: shape: (768, 768) values: [ 0.02634268 -0.01357682 -0.06076496 0.04210597 0.01783857]\n", + "\n", + "bert/encoder/layer_11/attention/self/key/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [-0.0007798 -0.00065806 -0.00010521 0.00119144 -0.00180091]\n", + "TF: shape: (768,) values: [-0.0007798 -0.00065806 -0.00010521 0.00119144 -0.00180091]\n", + "\n", + "bert/encoder/layer_11/attention/self/value/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 768) values: [ 0.03520973 -0.00678078 -0.02883583 -0.01011515 0.04519828]\n", + "TF: shape: (768, 768) values: [ 0.03520973 -0.00678078 -0.02883583 -0.01011515 0.04519828]\n", + "\n", + "bert/encoder/layer_11/attention/self/value/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [ 0.01502306 -0.00530942 0.00023572 0.00205218 -0.00578036]\n", + "TF: shape: (768,) values: [ 0.01502306 -0.00530942 0.00023572 0.00205218 -0.00578036]\n", + "\n", + "bert/encoder/layer_11/attention/output/dense/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 768) values: [ 0.02361419 0.03112707 -0.00063031 0.04209773 -0.02434015]\n", + "TF: shape: (768, 768) values: [ 0.02361419 0.03112707 -0.00063031 0.04209773 -0.02434015]\n", + "\n", + "bert/encoder/layer_11/attention/output/dense/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [ 0.02566087 0.0028438 -0.00475678 0.02149458 -0.01755187]\n", + "TF: shape: (768,) values: [ 0.02566087 0.0028438 -0.00475678 0.02149458 -0.01755187]\n", + "\n", + "bert/encoder/layer_11/attention/output/LayerNorm/beta\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [-0.03134411 0.01207957 -0.04636396 -0.03013046 0.07944281]\n", + "TF: shape: (768,) values: [-0.03134411 0.01207957 -0.04636396 -0.03013046 0.07944281]\n", + "\n", + "bert/encoder/layer_11/attention/output/LayerNorm/gamma\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [0.85203767 0.8020145 0.8554237 0.8150477 0.8441815 ]\n", + "TF: shape: (768,) values: [0.85203767 0.8020145 0.8554237 0.8150477 0.8441815 ]\n", + "\n", + "bert/encoder/layer_11/intermediate/dense/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 3072) values: [ 0.05871898 -0.01124212 0.00206979 -0.04366514 -0.00716808]\n", + "TF: shape: (768, 3072) values: [ 0.05871898 -0.01124212 0.00206979 -0.04366514 -0.00716808]\n", + "\n", + "bert/encoder/layer_11/intermediate/dense/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (3072,) values: [-0.09762521 -0.06175711 -0.05153917 -0.08580919 -0.09734315]\n", + "TF: shape: (3072,) values: [-0.09762521 -0.06175711 -0.05153917 -0.08580919 -0.09734315]\n", + "\n", + "bert/encoder/layer_11/output/dense/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (3072, 768) values: [-0.022382 0.01073206 -0.01357213 0.02484621 0.01403091]\n", + "TF: shape: (3072, 768) values: [-0.022382 0.01073206 -0.01357213 0.02484621 0.01403091]\n", + "\n", + "bert/encoder/layer_11/output/dense/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [-0.06574099 0.04207807 0.01201084 0.00229322 0.05551811]\n", + "TF: shape: (768,) values: [-0.06574099 0.04207807 0.01201084 0.00229322 0.05551811]\n", + "\n", + "bert/encoder/layer_11/output/LayerNorm/beta\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [-0.00634605 -0.01989403 0.04628465 0.01585056 -0.04256899]\n", + "TF: shape: (768,) values: [-0.00634605 -0.01989403 0.04628465 0.01585056 -0.04256899]\n", + "\n", + "bert/encoder/layer_11/output/LayerNorm/gamma\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [0.6384234 0.6300364 0.66570055 0.6126921 0.63756436]\n", + "TF: shape: (768,) values: [0.6384234 0.6300364 0.66570055 0.6126921 0.63756436]\n", + "\n", + "bert/pooler/dense/kernel\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768, 768) values: [-0.00127425 0.00199868 -0.03863145 -0.00139355 0.00691627]\n", + "TF: shape: (768, 768) values: [-0.00127425 0.00199868 -0.03863145 -0.00139355 0.00691627]\n", + "\n", + "bert/pooler/dense/bias\n", + "|sum(pt_wts - tf_wts)| = 0.0\n", + "PT: shape: (768,) values: [-0.03597581 -0.00389536 0.05181352 0.02224747 -0.00493723]\n", + "TF: shape: (768,) values: [-0.03597581 -0.00389536 0.05181352 0.02224747 -0.00493723]\n", + "\n" + ] + } + ], + "source": [ + "tensors_to_transopse = (\n", + " \"dense.weight\",\n", + " \"attention.self.query\",\n", + " \"attention.self.key\",\n", + " \"attention.self.value\"\n", + ")\n", + "var_map = (\n", + " ('layer.', 'layer_'),\n", + " ('word_embeddings.weight', 'word_embeddings'),\n", + " ('position_embeddings.weight', 'position_embeddings'),\n", + " ('token_type_embeddings.weight', 'token_type_embeddings'),\n", + " ('.', '/'),\n", + " ('LayerNorm/weight', 'LayerNorm/gamma'),\n", + " ('LayerNorm/bias', 'LayerNorm/beta'),\n", + " ('weight', 'kernel')\n", + ")\n", + "\n", + "def to_tf_var_name(name:str):\n", + " for patt, repl in iter(var_map):\n", + " name = name.replace(patt, repl)\n", + " return 'bert/{}'.format(name)\n", + "\n", + "tf_vars = {v.name: session.run(fetches=v) for v in tf.global_variables()}\n", + "pt_vars = {}\n", + "for v, T in pt_model.state_dict().items():\n", + " T = T.detach().numpy()\n", + " if any([x in v for x in tensors_to_transopse]):\n", + " T = T.T\n", + " pt_vars.update({to_tf_var_name(v): T})\n", + "\n", + "for var_name in tf_vars:\n", + " \n", + " pt = pt_vars[var_name.strip(\":0\")]\n", + " tf = tf_vars[var_name]\n", + "\n", + " print(var_name.strip(\":0\"))\n", + " \n", + " # Assert equivalence\n", + " print(\"|sum(pt_wts - tf_wts)| = {}\".format(\n", + " np.abs(np.sum(pt - tf, keepdims=False))\n", + " ))\n", + " assert not np.sum(pt - tf, keepdims=False)\n", + " \n", + " if len(pt.shape) == 2:\n", + " print(\"PT: shape: {0} values: {1}\".format(pt.shape, pt[0, :5]))\n", + " print(\"TF: shape: {0} values: {1}\".format(tf.shape, tf[0, :5]))\n", + " else:\n", + " print(\"PT: shape: {0} values: {1}\".format(pt.shape, pt[:5]))\n", + " print(\"TF: shape: {0} values: {1}\".format(tf.shape, tf[:5]))\n", + " print()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Compare Layer-12 Projections" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MSE: 2.7155439966009e-05\n", + "PT-values: [-0.876663 -0.41088238 -0.12200808 0.44941 0.19445966]\n", + "TF-values: [-0.8742865 -0.40621698 -0.10585472 0.444904 0.1825743 ]\n" + ] + } + ], + "source": [ + "# Mean Squared Error (MSE) between last projection of each model\n", + "MSE = np.mean((pt_embedding - tf_embedding) ** 2, keepdims=False)\n", + "print(\"MSE: {}\".format(MSE))\n", + "print(\"PT-values: {}\".format(pt_embedding[0, :5]))\n", + "print(\"TF-values: {}\".format(tf_embedding[0, :5]))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "nlp", + "language": "python", + "name": "nlp" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 716cc1c4d9c59bcce1cb13ba395c7d7bfb0df6a5 Mon Sep 17 00:00:00 2001 From: chrislarson1 Date: Wed, 19 Jun 2019 23:18:57 -0400 Subject: [PATCH 13/13] added main() for programmatic call to convert pytorch->tf --- .../convert_pytorch_checkpoint_to_tf.py | 55 +++++++++++-------- 1 file changed, 32 insertions(+), 23 deletions(-) diff --git a/pytorch_pretrained_bert/convert_pytorch_checkpoint_to_tf.py b/pytorch_pretrained_bert/convert_pytorch_checkpoint_to_tf.py index a9bfdaa45c..b8858ee3dc 100644 --- a/pytorch_pretrained_bert/convert_pytorch_checkpoint_to_tf.py +++ b/pytorch_pretrained_bert/convert_pytorch_checkpoint_to_tf.py @@ -17,16 +17,18 @@ import os import argparse +import torch import numpy as np import tensorflow as tf -from pytorch_pretrained_bert.modeling import BertConfig, BertModel +from pytorch_pretrained_bert.modeling import BertModel -def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str): +def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str, model_name:str): """ :param model:BertModel Pytorch model instance to be converted - :param ckpt_dir: directory to save Tensorflow model + :param ckpt_dir: Tensorflow model directory + :param model_name: model name :return: Currently supported HF models: @@ -87,35 +89,42 @@ def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str): print("{0}{1}initialized".format(tf_name, " " * (60 - len(tf_name)))) saver = tf.train.Saver(tf_vars) - saver.save(session, os.path.join(ckpt_dir, args.pytorch_model_name)) + saver.save(session, os.path.join(ckpt_dir, model_name.replace("-", "_") + ".ckpt")) -if __name__ == "__main__": - +def main(raw_args=None): parser = argparse.ArgumentParser() - parser.add_argument("--pytorch_model_dir", - default=None, + parser.add_argument("--model_name", type=str, + required=True, + help="model name e.g. bert-base-uncased") + parser.add_argument("--cache_dir", + type=str, + default=None, required=False, help="Directory containing pytorch model") - parser.add_argument("--pytorch_model_name", - default=None, + parser.add_argument("--pytorch_model_path", type=str, required=True, - help="model name (e.g. bert-base-uncased)") - parser.add_argument("--config_file_path", - default=None, - type=str, - required=True, - help="Path to bert config file") - parser.add_argument("--tf_checkpoint_dir", - default="", + help="/path/to/.bin") + parser.add_argument("--tf_cache_dir", type=str, required=True, help="Directory in which to save tensorflow model") - args = parser.parse_args() + args = parser.parse_args(raw_args) + + model = BertModel.from_pretrained( + pretrained_model_name_or_path=args.model_name, + state_dict=torch.load(args.pytorch_model_path), + cache_dir=args.cache_dir + ) + + convert_pytorch_checkpoint_to_tf( + model=model, + ckpt_dir=args.tf_cache_dir, + model_name=args.model_name + ) - model = BertModel( - config=BertConfig(args.config_file_path) - ).from_pretrained(args.pytorch_model_name, cache_dir=args.pytorch_model_dir) - convert_pytorch_checkpoint_to_tf(model=model, ckpt_dir=args.tf_checkpoint_dir) + +if __name__ == "__main__": + main()