From c9df1b1d53dcf0cbc126094ac8edc28cca3840b7 Mon Sep 17 00:00:00 2001 From: Lysandre Debut Date: Mon, 8 Feb 2021 15:07:02 +0100 Subject: [PATCH] Model templates (#10072) --- .../modeling_tf_{{cookiecutter.lowercase_modelname}}.py | 6 +++--- .../modeling_{{cookiecutter.lowercase_modelname}}.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py index 7ff9a10eb1..1a78346924 100644 --- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py +++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py @@ -161,7 +161,7 @@ class TF{{cookiecutter.camelcase_modelname}}SelfAttention(tf.keras.layers.Layer) self.num_attention_heads = config.num_attention_heads self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.all_head_size = self.num_attention_heads * self.attention_head_size - self.rsqrt_att_head_size = 1.0 / math.sqrt(self.attention_head_size) + self.sqrt_att_head_size = math.sqrt(self.attention_head_size) self.query = tf.keras.layers.Dense( units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" @@ -201,8 +201,8 @@ class TF{{cookiecutter.camelcase_modelname}}SelfAttention(tf.keras.layers.Layer) # attention scores. # (batch size, num_heads, seq_len_q, seq_len_k) attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) - dk = tf.cast(self.rsqrt_att_head_size, dtype=attention_scores.dtype) - attention_scores = tf.multiply(attention_scores, dk) + dk = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype) + attention_scores = tf.divide(attention_scores, dk) if attention_mask is not None: # Apply the attention mask is (precomputed for all layers in TF{{cookiecutter.camelcase_modelname}}Model call() function) diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py index 153da77d2b..79fa28a3fc 100755 --- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py +++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py @@ -593,7 +593,7 @@ class {{cookiecutter.camelcase_modelname}}Encoder(nn.Module): ) -# Copied from transformers.models.bert.modeling_bert.BertPredictionHead with Bert->{{cookiecutter.camelcase_modelname}} +# Copied from transformers.models.bert.modeling_bert.BertPredictionHeadTransform with Bert->{{cookiecutter.camelcase_modelname}} class {{cookiecutter.camelcase_modelname}}PredictionHeadTransform(nn.Module): def __init__(self, config): super().__init__()