From 50c6bc4195a3446ede1a94a92c9be50ecf45bc6c Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 9 Sep 2019 17:46:01 +0200
Subject: [PATCH] fix tf bert model

---
 pytorch_transformers/configuration_bert.py    |  2 +-
 .../convert_pytorch_checkpoint_to_tf2.py      | 48 ++++++++++---
 pytorch_transformers/modeling_bert.py         | 22 +++---
 pytorch_transformers/modeling_tf_bert.py      | 45 ++++++++----
 pytorch_transformers/modeling_tf_gpt2.py      |  5 ++
 pytorch_transformers/modeling_utils.py        | 70 ++++++++++---------
 6 files changed, 129 insertions(+), 63 deletions(-)

diff --git a/pytorch_transformers/configuration_bert.py b/pytorch_transformers/configuration_bert.py
index 7fff3e5d05..00a22770ac 100644
--- a/pytorch_transformers/configuration_bert.py
+++ b/pytorch_transformers/configuration_bert.py
@@ -58,7 +58,7 @@ class BertConfig(PretrainedConfig):
             intermediate_size: The size of the "intermediate" (i.e., feed-forward)
                 layer in the Transformer encoder.
             hidden_act: The non-linear activation function (function or string) in the
-                encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
+                encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported.
             hidden_dropout_prob: The dropout probabilitiy for all fully connected
                 layers in the embeddings, encoder, and pooler.
             attention_probs_dropout_prob: The dropout ratio for the attention
diff --git a/pytorch_transformers/convert_pytorch_checkpoint_to_tf2.py b/pytorch_transformers/convert_pytorch_checkpoint_to_tf2.py
index ab9b6dd06a..03b14d4517 100644
--- a/pytorch_transformers/convert_pytorch_checkpoint_to_tf2.py
+++ b/pytorch_transformers/convert_pytorch_checkpoint_to_tf2.py
@@ -21,36 +21,62 @@ from __future__ import print_function
 import argparse
 import tensorflow as tf
 
-import pytorch_transformers
+from pytorch_transformers import is_torch_available
 
 from pytorch_transformers import (BertConfig, TFBertForPreTraining, load_bert_pt_weights_in_tf2,
                                   GPT2Config, TFGPT2LMHeadModel, load_gpt2_pt_weights_in_tf2)
 
+if is_torch_available():
+    import torch
+    import numpy as np
+    from pytorch_transformers import BertForPreTraining, GPT2LMHeadModel
+else:
+    BertForPreTraining, GPT2LMHeadModel = None, None
+
+
 import logging
 logging.basicConfig(level=logging.INFO)
 
 MODEL_CLASSES = {
-    'bert': (BertConfig, TFBertForPreTraining, load_bert_pt_weights_in_tf2),
-    'gpt2': (GPT2Config, TFGPT2LMHeadModel, load_gpt2_pt_weights_in_tf2),
+    'bert': (BertConfig, TFBertForPreTraining, load_bert_pt_weights_in_tf2, BertForPreTraining),
+    'gpt2': (GPT2Config, TFGPT2LMHeadModel, load_gpt2_pt_weights_in_tf2, GPT2LMHeadModel),
 }
 
-def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file, tf_dump_path):
+def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file, tf_dump_path, compare_with_pt_model=False):
     if model_type not in MODEL_CLASSES:
         raise ValueError("Unrecognized model type, should be one of {}.".format(list(MODEL_CLASSES.keys())))
 
-    config_class, model_class, loading_fct = MODEL_CLASSES[model_type]
+    config_class, model_class, loading_fct, pt_model_class = MODEL_CLASSES[model_type]
 
     # Initialise TF model
     config = config_class.from_json_file(config_file)
     print("Building TensorFlow model from configuration: {}".format(str(config)))
-    model = model_class(config)
+    tf_model = model_class(config)
 
     # Load weights from tf checkpoint
-    model = loading_fct(model, config, pytorch_checkpoint_path)
+    tf_model = loading_fct(tf_model, config, pytorch_checkpoint_path)
+
+    if compare_with_pt_model:
+        inputs_list = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
+        tf_inputs = tf.constant(inputs_list)
+        tfo = tf_model(tf_inputs, training=False)  # build the network
+
+        pt_model = pt_model_class.from_pretrained(None,
+                                                  config=config,
+                                                  state_dict=torch.load(pytorch_checkpoint_path,
+                                                                        map_location='cpu'))
+        pt_inputs = torch.tensor(inputs_list)
+        with torch.no_grad():
+            pto = pt_model(pt_inputs)
+
+        np_pt = pto[0].detach().numpy()
+        np_tf = tfo[0].numpy()
+        diff = np.amax(np.abs(np_pt - np_tf))
+        print("Max absolute difference between models outputs {}".format(diff))
 
     # Save pytorch-model
     print("Save TensorFlow model to {}".format(tf_dump_path))
-    model.save_weights(tf_dump_path)
+    tf_model.save_weights(tf_dump_path)
 
 
 if __name__ == "__main__":
@@ -77,8 +103,12 @@ if __name__ == "__main__":
                         type = str,
                         required = True,
                         help = "Path to the output Tensorflow dump file.")
+    parser.add_argument("--compare_with_pt_model",
+                        action='store_true',
+                        help = "Compare Tensorflow and PyTorch model predictions.")
     args = parser.parse_args()
     convert_pt_checkpoint_to_tf(args.model_type.lower(),
                                 args.pytorch_checkpoint_path,
                                 args.config_file,
-                                args.tf_dump_path)
+                                args.tf_dump_path,
+                                compare_with_pt_model=args.compare_with_pt_model)
diff --git a/pytorch_transformers/modeling_bert.py b/pytorch_transformers/modeling_bert.py
index c541d18da5..64ea5f947c 100644
--- a/pytorch_transformers/modeling_bert.py
+++ b/pytorch_transformers/modeling_bert.py
@@ -118,19 +118,24 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
 
 
 def gelu(x):
-    """Implementation of the gelu activation function.
+    """ Original Implementation of the gelu activation function in Google Bert repo when initialy created.
         For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
         0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
         Also see https://arxiv.org/abs/1606.08415
     """
     return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
 
+def gelu_new(x):
+    """ Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT).
+        Also see https://arxiv.org/abs/1606.08415
+    """
+    return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
 
 def swish(x):
     return x * torch.sigmoid(x)
 
 
-ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish}
+ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish, "gelu_new": gelu_new}
 
 
 try:
@@ -195,7 +200,7 @@ class BertSelfAttention(nn.Module):
         x = x.view(*new_x_shape)
         return x.permute(0, 2, 1, 3)
 
-    def forward(self, hidden_states, attention_mask, head_mask=None):
+    def forward(self, hidden_states, attention_mask=None, head_mask=None):
         mixed_query_layer = self.query(hidden_states)
         mixed_key_layer = self.key(hidden_states)
         mixed_value_layer = self.value(hidden_states)
@@ -207,8 +212,9 @@ class BertSelfAttention(nn.Module):
         # Take the dot product between "query" and "key" to get the raw attention scores.
         attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
         attention_scores = attention_scores / math.sqrt(self.attention_head_size)
-        # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
-        attention_scores = attention_scores + attention_mask
+        if attention_mask is not None:
+            # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
+            attention_scores = attention_scores + attention_mask
 
         # Normalize the attention scores to probabilities.
         attention_probs = nn.Softmax(dim=-1)(attention_scores)
@@ -275,7 +281,7 @@ class BertAttention(nn.Module):
         self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
         self.pruned_heads = self.pruned_heads.union(heads)
 
-    def forward(self, input_tensor, attention_mask, head_mask=None):
+    def forward(self, input_tensor, attention_mask=None, head_mask=None):
         self_outputs = self.self(input_tensor, attention_mask, head_mask)
         attention_output = self.output(self_outputs[0], input_tensor)
         outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
@@ -318,7 +324,7 @@ class BertLayer(nn.Module):
         self.intermediate = BertIntermediate(config)
         self.output = BertOutput(config)
 
-    def forward(self, hidden_states, attention_mask, head_mask=None):
+    def forward(self, hidden_states, attention_mask=None, head_mask=None):
         attention_outputs = self.attention(hidden_states, attention_mask, head_mask)
         attention_output = attention_outputs[0]
         intermediate_output = self.intermediate(attention_output)
@@ -334,7 +340,7 @@ class BertEncoder(nn.Module):
         self.output_hidden_states = config.output_hidden_states
         self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])
 
-    def forward(self, hidden_states, attention_mask, head_mask=None):
+    def forward(self, hidden_states, attention_mask=None, head_mask=None):
         all_hidden_states = ()
         all_attentions = ()
         for i, layer_module in enumerate(self.layer):
diff --git a/pytorch_transformers/modeling_tf_bert.py b/pytorch_transformers/modeling_tf_bert.py
index 9137c0af9a..ed82f4e8f3 100644
--- a/pytorch_transformers/modeling_tf_bert.py
+++ b/pytorch_transformers/modeling_tf_bert.py
@@ -77,6 +77,7 @@ def load_bert_pt_weights_in_tf2(tf_model, config, pytorch_checkpoint_path):
 
     symbolic_weights = tf_model.trainable_weights + tf_model.non_trainable_weights
     weight_value_tuples = []
+    all_pytorch_weights = set(list(state_dict.keys()))
     for symbolic_weight in symbolic_weights:
         name = symbolic_weight.name
         name = name.replace('cls_mlm', 'cls')  # We had to split this layer in two in the TF model to be
@@ -91,7 +92,7 @@ def load_bert_pt_weights_in_tf2(tf_model, config, pytorch_checkpoint_path):
             name[-1] = 'weight'
 
         name = '.'.join(name)
-        assert name in state_dict
+        assert name in state_dict, "{} not found in PyTorch model".format(name)
         array = state_dict[name].numpy()
 
         if transpose:
@@ -106,14 +107,28 @@ def load_bert_pt_weights_in_tf2(tf_model, config, pytorch_checkpoint_path):
         logger.info("Initialize TF weight {}".format(symbolic_weight.name))
 
         weight_value_tuples.append((symbolic_weight, array))
+        all_pytorch_weights.discard(name)
 
     K.batch_set_value(weight_value_tuples)
 
     tfo = tf_model(tf_inputs, training=False)  # Make sure restore ops are run
+
+    logger.info("Weights not loaded: {}".format(all_pytorch_weights))
+
     return tf_model
 
 
 def gelu(x):
+    """ Gaussian Error Linear Unit.
+    Original Implementation of the gelu activation function in Google Bert repo when initialy created.
+        For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
+        0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
+        Also see https://arxiv.org/abs/1606.08415
+    """
+    cdf = 0.5 * (1.0 + tf.math.erf(x / tf.math.sqrt(2.0)))
+    return x * cdf
+
+def gelu_new(x):
     """Gaussian Error Linear Unit.
     This is a smoother version of the RELU.
     Original paper: https://arxiv.org/abs/1606.08415
@@ -126,14 +141,14 @@ def gelu(x):
         (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
     return x * cdf
 
-
 def swish(x):
     return x * tf.sigmoid(x)
 
 
 ACT2FN = {"gelu": tf.keras.layers.Activation(gelu),
           "relu": tf.keras.activations.relu,
-          "swish": tf.keras.layers.Activation(swish)}
+          "swish": tf.keras.layers.Activation(swish),
+          "gelu_new": tf.keras.layers.Activation(gelu_new)}
 
 
 class TFBertEmbeddings(tf.keras.layers.Layer):
@@ -263,8 +278,10 @@ class TFBertSelfAttention(tf.keras.layers.Layer):
         attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)  # (batch size, num_heads, seq_len_q, seq_len_k)
         dk = tf.cast(tf.shape(key_layer)[-1], tf.float32) # scale attention_scores
         attention_scores = attention_scores / tf.math.sqrt(dk)
-        # Apply the attention mask is (precomputed for all layers in TFBertModel call() function)
-        attention_scores = attention_scores + attention_mask
+
+        if attention_mask is not None:
+            # Apply the attention mask is (precomputed for all layers in TFBertModel call() function)
+            attention_scores = attention_scores + attention_mask
 
         # Normalize the attention scores to probabilities.
         attention_probs = tf.nn.softmax(attention_scores, axis=-1)
@@ -438,31 +455,33 @@ class TFBertPredictionHeadTransform(tf.keras.layers.Layer):
 
 
 class TFBertLMPredictionHead(tf.keras.layers.Layer):
-    def __init__(self, config, **kwargs):
+    def __init__(self, config, input_embeddings, **kwargs):
         super(TFBertLMPredictionHead, self).__init__(**kwargs)
         self.vocab_size = config.vocab_size
         self.transform = TFBertPredictionHeadTransform(config, name='transform')
 
         # The output weights are the same as the input embeddings, but there is
         # an output-only bias for each token.
-        self.decoder = tf.keras.layers.Dense(config.vocab_size, use_bias=False, name='decoder')
+        self.input_embeddings = input_embeddings
 
     def build(self, input_shape):
         self.bias = self.add_weight(shape=(self.vocab_size,),
                                     initializer='zeros',
                                     trainable=True,
                                     name='bias')
+        super(TFBertLMPredictionHead, self).build(input_shape)
 
     def call(self, hidden_states):
         hidden_states = self.transform(hidden_states)
-        hidden_states = self.decoder(hidden_states) + self.bias
+        hidden_states = self.input_embeddings(hidden_states, mode="linear")
+        hidden_states = hidden_states + self.bias
         return hidden_states
 
 
 class TFBertMLMHead(tf.keras.layers.Layer):
-    def __init__(self, config, **kwargs):
+    def __init__(self, config, input_embeddings, **kwargs):
         super(TFBertMLMHead, self).__init__(**kwargs)
-        self.predictions = TFBertLMPredictionHead(config, name='predictions')
+        self.predictions = TFBertLMPredictionHead(config, input_embeddings, name='predictions')
 
     def call(self, sequence_output):
         prediction_scores = self.predictions(sequence_output)
@@ -716,12 +735,13 @@ class TFBertForPreTraining(TFBertPreTrainedModel):
 
         self.bert = TFBertMainLayer(config, name='bert')
         self.cls_nsp = TFBertNSPHead(config, name='cls_nsp')
+        self.cls_mlm = TFBertMLMHead(config, self.bert.embeddings, name='cls_mlm')
 
     def call(self, inputs, training=False):
         outputs = self.bert(inputs, training=training)
 
         sequence_output, pooled_output = outputs[:2]
-        prediction_scores = self.bert.embeddings(sequence_output, mode="linear", training=training)
+        prediction_scores = self.cls_mlm(sequence_output, training=training)
         seq_relationship_score = self.cls_nsp(pooled_output)
 
         outputs = (prediction_scores, seq_relationship_score,) + outputs[2:]  # add hidden states and attention if they are here
@@ -757,12 +777,13 @@ class TFBertForMaskedLM(TFBertPreTrainedModel):
         super(TFBertForMaskedLM, self).__init__(config, *inputs, **kwargs)
 
         self.bert = TFBertMainLayer(config, name='bert')
+        self.cls_mlm = TFBertMLMHead(config, self.bert.embeddings, name='cls_mlm')
 
     def call(self, inputs, training=False):
         outputs = self.bert(inputs, training=training)
 
         sequence_output = outputs[0]
-        prediction_scores = self.bert.embeddings(sequence_output, mode="linear", training=training)
+        prediction_scores = self.cls_mlm(sequence_output, training=training)
 
         outputs = (prediction_scores,) + outputs[2:]  # Add hidden states and attention if they are here
 
diff --git a/pytorch_transformers/modeling_tf_gpt2.py b/pytorch_transformers/modeling_tf_gpt2.py
index 05165ce084..a896ee5a5f 100644
--- a/pytorch_transformers/modeling_tf_gpt2.py
+++ b/pytorch_transformers/modeling_tf_gpt2.py
@@ -100,9 +100,14 @@ def load_gpt2_pt_weights_in_tf2(tf_model, config, pytorch_checkpoint_path):
 
         weight_value_tuples.append((symbolic_weight, array))
 
+        state_dict.pop(name)
+
     K.batch_set_value(weight_value_tuples)
 
     tfo = tf_model(tf_inputs, training=False)  # Make sure restore ops are run
+
+    assert not state_dict, "Weights not loaded: {}".format(list(state_dict.keys()))
+
     return tf_model
 
 
diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py
index 9fd7a2c0c2..c316b66bc9 100644
--- a/pytorch_transformers/modeling_utils.py
+++ b/pytorch_transformers/modeling_utils.py
@@ -222,6 +222,7 @@ class PreTrainedModel(nn.Module):
                 - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
                 - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                 - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
+                - None if you are both providing the configuration and state dictionary (resp. with keyword arguments ``config`` and ``state_dict``)
 
             model_args: (`optional`) Sequence of positional arguments:
                 All remaning positional arguments will be passed to the underlying model's ``__init__`` method
@@ -289,42 +290,45 @@ class PreTrainedModel(nn.Module):
             model_kwargs = kwargs
 
         # Load model
-        if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
-            archive_file = cls.pretrained_model_archive_map[pretrained_model_name_or_path]
-        elif os.path.isdir(pretrained_model_name_or_path):
-            if from_tf:
-                # Directly load from a TensorFlow checkpoint
-                archive_file = os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME + ".index")
-            else:
-                archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)
-        else:
-            if from_tf:
-                # Directly load from a TensorFlow checkpoint
-                archive_file = pretrained_model_name_or_path + ".index"
-            else:
-                archive_file = pretrained_model_name_or_path
-        # redirect to the cache, if necessary
-        try:
-            resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
-        except EnvironmentError as e:
+        if pretrained_model_name_or_path is not None:
             if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
-                logger.error(
-                    "Couldn't reach server at '{}' to download pretrained weights.".format(
-                        archive_file))
+                archive_file = cls.pretrained_model_archive_map[pretrained_model_name_or_path]
+            elif os.path.isdir(pretrained_model_name_or_path):
+                if from_tf:
+                    # Directly load from a TensorFlow checkpoint
+                    archive_file = os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME + ".index")
+                else:
+                    archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)
             else:
-                logger.error(
-                    "Model name '{}' was not found in model name list ({}). "
-                    "We assumed '{}' was a path or url but couldn't find any file "
-                    "associated to this path or url.".format(
-                        pretrained_model_name_or_path,
-                        ', '.join(cls.pretrained_model_archive_map.keys()),
-                        archive_file))
-            raise e
-        if resolved_archive_file == archive_file:
-            logger.info("loading weights file {}".format(archive_file))
+                if from_tf:
+                    # Directly load from a TensorFlow checkpoint
+                    archive_file = pretrained_model_name_or_path + ".index"
+                else:
+                    archive_file = pretrained_model_name_or_path
+            # redirect to the cache, if necessary
+            try:
+                resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
+            except EnvironmentError as e:
+                if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
+                    logger.error(
+                        "Couldn't reach server at '{}' to download pretrained weights.".format(
+                            archive_file))
+                else:
+                    logger.error(
+                        "Model name '{}' was not found in model name list ({}). "
+                        "We assumed '{}' was a path or url but couldn't find any file "
+                        "associated to this path or url.".format(
+                            pretrained_model_name_or_path,
+                            ', '.join(cls.pretrained_model_archive_map.keys()),
+                            archive_file))
+                raise e
+            if resolved_archive_file == archive_file:
+                logger.info("loading weights file {}".format(archive_file))
+            else:
+                logger.info("loading weights file {} from cache at {}".format(
+                    archive_file, resolved_archive_file))
         else:
-            logger.info("loading weights file {} from cache at {}".format(
-                archive_file, resolved_archive_file))
+            resolved_archive_file = None
 
         # Instantiate model.
         model = cls(config, *model_args, **model_kwargs)