From 15a9d0151907b49ba66aca9e084eee1fb626affa Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Mon, 13 Dec 2021 18:30:46 +0100 Subject: [PATCH] Avoid using tf.tile in embeddings for TF models (#14735) * avoid tf.tile in embeddings * remove more tf.tile in embeddings * clean Co-authored-by: ydshieh --- .../models/albert/modeling_tf_albert.py | 4 +--- .../models/bert/modeling_tf_bert.py | 4 +--- .../models/convbert/modeling_tf_convbert.py | 4 +--- .../models/deberta/modeling_tf_deberta.py | 2 -- .../deberta_v2/modeling_tf_deberta_v2.py | 2 -- .../distilbert/modeling_tf_distilbert.py | 5 +--- .../models/electra/modeling_tf_electra.py | 4 +--- .../models/layoutlm/modeling_tf_layoutlm.py | 24 ++++++++----------- .../longformer/modeling_tf_longformer.py | 4 +--- .../models/lxmert/modeling_tf_lxmert.py | 4 +--- .../mobilebert/modeling_tf_mobilebert.py | 4 +--- .../models/mpnet/modeling_tf_mpnet.py | 4 +--- .../models/rembert/modeling_tf_rembert.py | 4 +--- .../models/roberta/modeling_tf_roberta.py | 4 +--- .../models/roformer/modeling_tf_roformer.py | 3 +-- ...tf_{{cookiecutter.lowercase_modelname}}.py | 4 +--- 16 files changed, 23 insertions(+), 57 deletions(-) diff --git a/src/transformers/models/albert/modeling_tf_albert.py b/src/transformers/models/albert/modeling_tf_albert.py index 204b27f9cf..9859caffa4 100644 --- a/src/transformers/models/albert/modeling_tf_albert.py +++ b/src/transformers/models/albert/modeling_tf_albert.py @@ -122,7 +122,6 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer): self.embedding_size = config.embedding_size self.max_position_embeddings = config.max_position_embeddings self.initializer_range = config.initializer_range - self.embeddings_sum = tf.keras.layers.Add() self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) @@ -183,9 +182,8 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer): ) position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids) - position_embeds = tf.tile(input=position_embeds, multiples=(input_shape[0], 1, 1)) token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids) - final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds, token_type_embeds]) + final_embeddings = inputs_embeds + position_embeds + token_type_embeds final_embeddings = self.LayerNorm(inputs=final_embeddings) final_embeddings = self.dropout(inputs=final_embeddings, training=training) diff --git a/src/transformers/models/bert/modeling_tf_bert.py b/src/transformers/models/bert/modeling_tf_bert.py index 25f9feb990..23e4b8e270 100644 --- a/src/transformers/models/bert/modeling_tf_bert.py +++ b/src/transformers/models/bert/modeling_tf_bert.py @@ -141,7 +141,6 @@ class TFBertEmbeddings(tf.keras.layers.Layer): self.hidden_size = config.hidden_size self.max_position_embeddings = config.max_position_embeddings self.initializer_range = config.initializer_range - self.embeddings_sum = tf.keras.layers.Add() self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) @@ -201,9 +200,8 @@ class TFBertEmbeddings(tf.keras.layers.Layer): ) position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids) - position_embeds = tf.tile(input=position_embeds, multiples=(input_shape[0], 1, 1)) token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids) - final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds, token_type_embeds]) + final_embeddings = inputs_embeds + position_embeds + token_type_embeds final_embeddings = self.LayerNorm(inputs=final_embeddings) final_embeddings = self.dropout(inputs=final_embeddings, training=training) diff --git a/src/transformers/models/convbert/modeling_tf_convbert.py b/src/transformers/models/convbert/modeling_tf_convbert.py index 0888756fba..8a376d2f7f 100644 --- a/src/transformers/models/convbert/modeling_tf_convbert.py +++ b/src/transformers/models/convbert/modeling_tf_convbert.py @@ -75,7 +75,6 @@ class TFConvBertEmbeddings(tf.keras.layers.Layer): self.embedding_size = config.embedding_size self.max_position_embeddings = config.max_position_embeddings self.initializer_range = config.initializer_range - self.embeddings_sum = tf.keras.layers.Add() self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) @@ -136,9 +135,8 @@ class TFConvBertEmbeddings(tf.keras.layers.Layer): ) position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids) - position_embeds = tf.tile(input=position_embeds, multiples=(input_shape[0], 1, 1)) token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids) - final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds, token_type_embeds]) + final_embeddings = inputs_embeds + position_embeds + token_type_embeds final_embeddings = self.LayerNorm(inputs=final_embeddings) final_embeddings = self.dropout(inputs=final_embeddings, training=training) diff --git a/src/transformers/models/deberta/modeling_tf_deberta.py b/src/transformers/models/deberta/modeling_tf_deberta.py index c6d2100ead..646280a5fa 100644 --- a/src/transformers/models/deberta/modeling_tf_deberta.py +++ b/src/transformers/models/deberta/modeling_tf_deberta.py @@ -728,7 +728,6 @@ class TFDebertaEmbeddings(tf.keras.layers.Layer): self.max_position_embeddings = config.max_position_embeddings self.position_biased_input = getattr(config, "position_biased_input", True) self.initializer_range = config.initializer_range - self.embeddings_sum = tf.keras.layers.Add() if self.embedding_size != config.hidden_size: self.embed_proj = tf.keras.layers.Dense(config.hidden_size, bias=False) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") @@ -795,7 +794,6 @@ class TFDebertaEmbeddings(tf.keras.layers.Layer): final_embeddings = inputs_embeds if self.position_biased_input: position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids) - position_embeds = tf.tile(input=position_embeds, multiples=(input_shape[0], 1, 1)) final_embeddings += position_embeds if self.type_vocab_size > 0: token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids) diff --git a/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py index 31d6d48619..63c3e29735 100644 --- a/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py +++ b/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py @@ -845,7 +845,6 @@ class TFDebertaV2Embeddings(tf.keras.layers.Layer): self.max_position_embeddings = config.max_position_embeddings self.position_biased_input = getattr(config, "position_biased_input", True) self.initializer_range = config.initializer_range - self.embeddings_sum = tf.keras.layers.Add() if self.embedding_size != config.hidden_size: self.embed_proj = tf.keras.layers.Dense(config.hidden_size, bias=False) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") @@ -912,7 +911,6 @@ class TFDebertaV2Embeddings(tf.keras.layers.Layer): final_embeddings = inputs_embeds if self.position_biased_input: position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids) - position_embeds = tf.tile(input=position_embeds, multiples=(input_shape[0], 1, 1)) final_embeddings += position_embeds if self.type_vocab_size > 0: token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids) diff --git a/src/transformers/models/distilbert/modeling_tf_distilbert.py b/src/transformers/models/distilbert/modeling_tf_distilbert.py index 172194d192..d2449f2a3e 100644 --- a/src/transformers/models/distilbert/modeling_tf_distilbert.py +++ b/src/transformers/models/distilbert/modeling_tf_distilbert.py @@ -77,8 +77,6 @@ class TFEmbeddings(tf.keras.layers.Layer): self.dim = config.dim self.initializer_range = config.initializer_range self.max_position_embeddings = config.max_position_embeddings - - self.embeddings_sum = tf.keras.layers.Add() self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.dropout) @@ -117,8 +115,7 @@ class TFEmbeddings(tf.keras.layers.Layer): position_ids = tf.expand_dims(tf.range(start=0, limit=input_shape[-1]), axis=0) position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids) - position_embeds = tf.tile(input=position_embeds, multiples=(input_shape[0], 1, 1)) - final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds]) + final_embeddings = inputs_embeds + position_embeds final_embeddings = self.LayerNorm(inputs=final_embeddings) final_embeddings = self.dropout(inputs=final_embeddings, training=training) diff --git a/src/transformers/models/electra/modeling_tf_electra.py b/src/transformers/models/electra/modeling_tf_electra.py index 1fc8c09af5..64053a9110 100644 --- a/src/transformers/models/electra/modeling_tf_electra.py +++ b/src/transformers/models/electra/modeling_tf_electra.py @@ -481,7 +481,6 @@ class TFElectraEmbeddings(tf.keras.layers.Layer): self.embedding_size = config.embedding_size self.max_position_embeddings = config.max_position_embeddings self.initializer_range = config.initializer_range - self.embeddings_sum = tf.keras.layers.Add() self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) @@ -542,9 +541,8 @@ class TFElectraEmbeddings(tf.keras.layers.Layer): ) position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids) - position_embeds = tf.tile(input=position_embeds, multiples=(input_shape[0], 1, 1)) token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids) - final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds, token_type_embeds]) + final_embeddings = inputs_embeds + position_embeds + token_type_embeds final_embeddings = self.LayerNorm(inputs=final_embeddings) final_embeddings = self.dropout(inputs=final_embeddings, training=training) diff --git a/src/transformers/models/layoutlm/modeling_tf_layoutlm.py b/src/transformers/models/layoutlm/modeling_tf_layoutlm.py index 3f73cfb8ac..088475f623 100644 --- a/src/transformers/models/layoutlm/modeling_tf_layoutlm.py +++ b/src/transformers/models/layoutlm/modeling_tf_layoutlm.py @@ -68,7 +68,6 @@ class TFLayoutLMEmbeddings(tf.keras.layers.Layer): self.max_position_embeddings = config.max_position_embeddings self.max_2d_position_embeddings = config.max_2d_position_embeddings self.initializer_range = config.initializer_range - self.embeddings_sum = tf.keras.layers.Add() self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) @@ -168,20 +167,17 @@ class TFLayoutLMEmbeddings(tf.keras.layers.Layer): w_position_embeddings = tf.gather(self.w_position_embeddings, bbox[:, :, 2] - bbox[:, :, 0]) position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids) - position_embeds = tf.tile(input=position_embeds, multiples=(input_shape[0], 1, 1)) token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids) - final_embeddings = self.embeddings_sum( - inputs=[ - inputs_embeds, - position_embeds, - token_type_embeds, - left_position_embeddings, - upper_position_embeddings, - right_position_embeddings, - lower_position_embeddings, - h_position_embeddings, - w_position_embeddings, - ] + final_embeddings = ( + inputs_embeds + + position_embeds + + token_type_embeds + + left_position_embeddings + + upper_position_embeddings + + right_position_embeddings + + lower_position_embeddings + + h_position_embeddings + + w_position_embeddings ) final_embeddings = self.LayerNorm(inputs=final_embeddings) final_embeddings = self.dropout(inputs=final_embeddings, training=training) diff --git a/src/transformers/models/longformer/modeling_tf_longformer.py b/src/transformers/models/longformer/modeling_tf_longformer.py index 164a1bf526..38018aff27 100644 --- a/src/transformers/models/longformer/modeling_tf_longformer.py +++ b/src/transformers/models/longformer/modeling_tf_longformer.py @@ -482,7 +482,6 @@ class TFLongformerEmbeddings(tf.keras.layers.Layer): self.hidden_size = config.hidden_size self.max_position_embeddings = config.max_position_embeddings self.initializer_range = config.initializer_range - self.embeddings_sum = tf.keras.layers.Add() self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) @@ -559,11 +558,10 @@ class TFLongformerEmbeddings(tf.keras.layers.Layer): position_ids = tf.expand_dims( tf.range(start=self.padding_idx + 1, limit=input_shape[-1] + self.padding_idx + 1), axis=0 ) - position_ids = tf.tile(input=position_ids, multiples=(input_shape[0], 1)) position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids) token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids) - final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds, token_type_embeds]) + final_embeddings = inputs_embeds + position_embeds + token_type_embeds final_embeddings = self.LayerNorm(inputs=final_embeddings) final_embeddings = self.dropout(inputs=final_embeddings, training=training) diff --git a/src/transformers/models/lxmert/modeling_tf_lxmert.py b/src/transformers/models/lxmert/modeling_tf_lxmert.py index 3197290099..b33c86ee21 100644 --- a/src/transformers/models/lxmert/modeling_tf_lxmert.py +++ b/src/transformers/models/lxmert/modeling_tf_lxmert.py @@ -188,7 +188,6 @@ class TFLxmertEmbeddings(tf.keras.layers.Layer): self.hidden_size = config.hidden_size self.max_position_embeddings = config.max_position_embeddings self.initializer_range = config.initializer_range - self.embeddings_sum = tf.keras.layers.Add() self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) @@ -235,9 +234,8 @@ class TFLxmertEmbeddings(tf.keras.layers.Layer): position_ids = tf.expand_dims(tf.range(start=0, limit=input_shape[-1]), axis=0) position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids) - position_embeds = tf.tile(input=position_embeds, multiples=(input_shape[0], 1, 1)) token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids) - final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds, token_type_embeds]) + final_embeddings = inputs_embeds + position_embeds + token_type_embeds final_embeddings = self.LayerNorm(inputs=final_embeddings) final_embeddings = self.dropout(inputs=final_embeddings, training=training) diff --git a/src/transformers/models/mobilebert/modeling_tf_mobilebert.py b/src/transformers/models/mobilebert/modeling_tf_mobilebert.py index 30ce8c4728..3d81ced6a1 100644 --- a/src/transformers/models/mobilebert/modeling_tf_mobilebert.py +++ b/src/transformers/models/mobilebert/modeling_tf_mobilebert.py @@ -121,7 +121,6 @@ class TFMobileBertEmbeddings(tf.keras.layers.Layer): self.type_vocab_size = config.type_vocab_size self.max_position_embeddings = config.max_position_embeddings self.initializer_range = config.initializer_range - self.embeddings_sum = tf.keras.layers.Add() self.embedding_transformation = tf.keras.layers.Dense(config.hidden_size, name="embedding_transformation") # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load @@ -196,9 +195,8 @@ class TFMobileBertEmbeddings(tf.keras.layers.Layer): position_ids = tf.expand_dims(tf.range(start=0, limit=input_shape[-1]), axis=0) position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids) - position_embeds = tf.tile(input=position_embeds, multiples=(input_shape[0], 1, 1)) token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids) - final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds, token_type_embeds]) + final_embeddings = inputs_embeds + position_embeds + token_type_embeds final_embeddings = self.LayerNorm(inputs=final_embeddings) final_embeddings = self.dropout(inputs=final_embeddings, training=training) diff --git a/src/transformers/models/mpnet/modeling_tf_mpnet.py b/src/transformers/models/mpnet/modeling_tf_mpnet.py index 90b5c16ccc..b043fc5705 100644 --- a/src/transformers/models/mpnet/modeling_tf_mpnet.py +++ b/src/transformers/models/mpnet/modeling_tf_mpnet.py @@ -98,7 +98,6 @@ class TFMPNetEmbeddings(tf.keras.layers.Layer): self.hidden_size = config.hidden_size self.max_position_embeddings = config.max_position_embeddings self.initializer_range = config.initializer_range - self.embeddings_sum = tf.keras.layers.Add() self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) @@ -155,10 +154,9 @@ class TFMPNetEmbeddings(tf.keras.layers.Layer): position_ids = tf.expand_dims( tf.range(start=self.padding_idx + 1, limit=input_shape[-1] + self.padding_idx + 1), axis=0 ) - position_ids = tf.tile(input=position_ids, multiples=(input_shape[0], 1)) position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids) - final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds]) + final_embeddings = inputs_embeds + position_embeds final_embeddings = self.LayerNorm(inputs=final_embeddings) final_embeddings = self.dropout(inputs=final_embeddings, training=training) diff --git a/src/transformers/models/rembert/modeling_tf_rembert.py b/src/transformers/models/rembert/modeling_tf_rembert.py index 9deab21f75..55b8c0c184 100644 --- a/src/transformers/models/rembert/modeling_tf_rembert.py +++ b/src/transformers/models/rembert/modeling_tf_rembert.py @@ -79,7 +79,6 @@ class TFRemBertEmbeddings(tf.keras.layers.Layer): self.input_embedding_size = config.input_embedding_size self.max_position_embeddings = config.max_position_embeddings self.initializer_range = config.initializer_range - self.embeddings_sum = tf.keras.layers.Add() self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) @@ -138,9 +137,8 @@ class TFRemBertEmbeddings(tf.keras.layers.Layer): ) position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids) - position_embeds = tf.tile(input=position_embeds, multiples=(input_shape[0], 1, 1)) token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids) - final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds, token_type_embeds]) + final_embeddings = inputs_embeds + position_embeds + token_type_embeds final_embeddings = self.LayerNorm(inputs=final_embeddings) final_embeddings = self.dropout(inputs=final_embeddings, training=training) diff --git a/src/transformers/models/roberta/modeling_tf_roberta.py b/src/transformers/models/roberta/modeling_tf_roberta.py index 107974de10..bb5defef42 100644 --- a/src/transformers/models/roberta/modeling_tf_roberta.py +++ b/src/transformers/models/roberta/modeling_tf_roberta.py @@ -87,7 +87,6 @@ class TFRobertaEmbeddings(tf.keras.layers.Layer): self.hidden_size = config.hidden_size self.max_position_embeddings = config.max_position_embeddings self.initializer_range = config.initializer_range - self.embeddings_sum = tf.keras.layers.Add() self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) @@ -164,11 +163,10 @@ class TFRobertaEmbeddings(tf.keras.layers.Layer): position_ids = tf.expand_dims( tf.range(start=self.padding_idx + 1, limit=input_shape[-1] + self.padding_idx + 1), axis=0 ) - position_ids = tf.tile(input=position_ids, multiples=(input_shape[0], 1)) position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids) token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids) - final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds, token_type_embeds]) + final_embeddings = inputs_embeds + position_embeds + token_type_embeds final_embeddings = self.LayerNorm(inputs=final_embeddings) final_embeddings = self.dropout(inputs=final_embeddings, training=training) diff --git a/src/transformers/models/roformer/modeling_tf_roformer.py b/src/transformers/models/roformer/modeling_tf_roformer.py index 915fc02779..08a7e2bc24 100644 --- a/src/transformers/models/roformer/modeling_tf_roformer.py +++ b/src/transformers/models/roformer/modeling_tf_roformer.py @@ -140,7 +140,6 @@ class TFRoFormerEmbeddings(tf.keras.layers.Layer): self.type_vocab_size = config.type_vocab_size self.embedding_size = config.embedding_size self.initializer_range = config.initializer_range - self.embeddings_sum = tf.keras.layers.Add() self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) @@ -186,7 +185,7 @@ class TFRoFormerEmbeddings(tf.keras.layers.Layer): token_type_ids = tf.fill(dims=input_shape, value=0) token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids) - final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, token_type_embeds]) + final_embeddings = inputs_embeds + token_type_embeds final_embeddings = self.LayerNorm(inputs=final_embeddings) final_embeddings = self.dropout(inputs=final_embeddings, training=training) diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py index 754e4d8883..0cf3028194 100644 --- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py +++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py @@ -83,7 +83,6 @@ class TF{{cookiecutter.camelcase_modelname}}Embeddings(tf.keras.layers.Layer): self.hidden_size = config.hidden_size self.max_position_embeddings = config.max_position_embeddings self.initializer_range = config.initializer_range - self.embeddings_sum = tf.keras.layers.Add() self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) @@ -142,9 +141,8 @@ class TF{{cookiecutter.camelcase_modelname}}Embeddings(tf.keras.layers.Layer): ) position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids) - position_embeds = tf.tile(input=position_embeds, multiples=(input_shape[0], 1, 1)) token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids) - final_embeddings = self.embeddings_sum(inputs=[inputs_embeds, position_embeds, token_type_embeds]) + final_embeddings = inputs_embeds + position_embeds + token_type_embeds final_embeddings = self.LayerNorm(inputs=final_embeddings) final_embeddings = self.dropout(inputs=final_embeddings, training=training)